From d1b26a29355aa0f144563d8b885b51a595eb2f9e Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Thu, 12 Sep 2024 20:52:27 +0000 Subject: [PATCH 01/17] builder_support_for_chatglm_1 --- src/python/py/models/ChatGLM_modules.log | 128 ++++++++++++ src/python/py/models/builder.py | 236 ++++++++++++++++++++++- 2 files changed, 359 insertions(+), 5 deletions(-) create mode 100644 src/python/py/models/ChatGLM_modules.log diff --git a/src/python/py/models/ChatGLM_modules.log b/src/python/py/models/ChatGLM_modules.log new file mode 100644 index 000000000..03732df1b --- /dev/null +++ b/src/python/py/models/ChatGLM_modules.log @@ -0,0 +1,128 @@ +Valid precision + execution provider combinations are: FP32 CPU, FP32 CUDA, FP16 CUDA, FP16 DML, INT4 CPU, INT4 CUDA, INT4 DML +Extra options: {} +GroupQueryAttention (GQA) is used in this model. +########## +ChatGLMForConditionalGeneration( + (transformer): ChatGLMModel( + (embedding): Embedding( + (word_embeddings): Embedding(65024, 4096) + ) + (rotary_pos_emb): RotaryEmbedding() + (encoder): GLMTransformer( + (layers): ModuleList( + (0-27): 28 x GLMBlock( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (query_key_value): Linear(in_features=4096, out_features=4608, bias=True) + (core_attention): CoreAttention( + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (dense): Linear(in_features=4096, out_features=4096, bias=False) + ) + (post_attention_layernorm): RMSNorm() + (mlp): MLP( + (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False) + (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): Linear(in_features=4096, out_features=65024, bias=False) + ) +) +########## +ChatGLMModel( + (embedding): Embedding( + (word_embeddings): Embedding(65024, 4096) + ) + (rotary_pos_emb): RotaryEmbedding() + (encoder): GLMTransformer( + (layers): ModuleList( + (0-27): 28 x GLMBlock( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (query_key_value): Linear(in_features=4096, out_features=4608, bias=True) + (core_attention): CoreAttention( + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (dense): Linear(in_features=4096, out_features=4096, bias=False) + ) + (post_attention_layernorm): RMSNorm() + (mlp): MLP( + (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False) + (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): Linear(in_features=4096, out_features=65024, bias=False) +) +########## +Embedding( + (word_embeddings): Embedding(65024, 4096) +) +########## +Embedding(65024, 4096) +Reading embedding layer +########## +RotaryEmbedding() +########## +GLMTransformer( + (layers): ModuleList( + (0-27): 28 x GLMBlock( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (query_key_value): Linear(in_features=4096, out_features=4608, bias=True) + (core_attention): CoreAttention( + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (dense): Linear(in_features=4096, out_features=4096, bias=False) + ) + (post_attention_layernorm): RMSNorm() + (mlp): MLP( + (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False) + (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) + ) + ) + ) + (final_layernorm): RMSNorm() +) +########## +ModuleList( + (0-27): 28 x GLMBlock( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (query_key_value): 
Linear(in_features=4096, out_features=4608, bias=True) + (core_attention): CoreAttention( + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (dense): Linear(in_features=4096, out_features=4096, bias=False) + ) + (post_attention_layernorm): RMSNorm() + (mlp): MLP( + (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False) + (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) + ) + ) +) +########## +GLMBlock( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (query_key_value): Linear(in_features=4096, out_features=4608, bias=True) + (core_attention): CoreAttention( + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (dense): Linear(in_features=4096, out_features=4096, bias=False) + ) + (post_attention_layernorm): RMSNorm() + (mlp): MLP( + (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False) + (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) + ) +) +Reading decoder layer 0 +['T_destination', '__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_allocate_memory', '_apply', '_backward_hooks', '_backward_pre_hooks', '_buffers', '_call_impl', '_compiled_call_impl', '_forward_hooks', '_forward_hooks_always_called', '_forward_hooks_with_kwargs', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_get_backward_hooks', '_get_backward_pre_hooks', '_get_name', '_is_full_backward_hook', '_is_hf_initialized', '_load_from_state_dict', '_load_state_dict_post_hooks', '_load_state_dict_pre_hooks', '_maybe_warn_non_full_backward_hook', '_modules', '_named_members', '_non_persistent_buffers_set', '_parameters', '_register_load_state_dict_pre_hook', '_register_state_dict_hook', '_replicate_for_data_parallel', '_save_to_state_dict', '_slow_forward', '_state_dict_hooks', '_state_dict_pre_hooks', '_version', '_wrapped_call_impl', 'add_module', 'apply', 'bfloat16', 'buffers', 'call_super_init', 'children', 'compile', 'core_attention', 'cpu', 'cuda', 'dense', 'double', 'dump_patches', 'eval', 'extra_repr', 'float', 'forward', 'get_buffer', 'get_extra_state', 'get_parameter', 'get_submodule', 'half', 'hidden_size_per_attention_head', 'ipu', 'layer_number', 'load_state_dict', 'modules', 'mtia', 'multi_query_attention', 'named_buffers', 'named_children', 'named_modules', 'named_parameters', 'num_attention_heads_per_partition', 'num_multi_query_groups_per_partition', 'parameters', 'projection_size', 'qkv_hidden_size', 'query_key_value', 'register_backward_hook', 'register_buffer', 'register_forward_hook', 'register_forward_pre_hook', 'register_full_backward_hook', 'register_full_backward_pre_hook', 'register_load_state_dict_post_hook', 'register_load_state_dict_pre_hook', 'register_module', 'register_parameter', 'register_state_dict_post_hook', 'register_state_dict_pre_hook', 'requires_grad_', 'set_extra_state', 'set_submodule', 'share_memory', 'state_dict', 'to', 'to_empty', 'train', 'training', 'type', 'xpu', 'zero_grad'] +True diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 1a845cd7c..ee9d4ee41 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -28,6 +28,10 @@ 
def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.intermediate_size = config.intermediate_size self.hidden_size = config.hidden_size self.num_kv_heads = config.num_key_value_heads if hasattr(config, "num_key_value_heads") else config.num_attention_heads + self.num_kv_heads = config.multi_query_group_num if hasattr(config, "multi_query_group_num") else self.num_kv_heads + self.kv_channels = config.kv_channels if hasattr(config, "kv_channels") else self.num_kv_heads + self.multi_query_attention = config.multi_query_attention if hasattr(config, "multi_query_attention") else False + # self.multi_query_group_num = config.multi_query_group_num if hasattr(config, "multi_query_group_num") else 1 # group_num as 1 is vanilla Multi-query attention https://arxiv.org/pdf/2305.13245 self.num_attn_heads = config.num_attention_heads self.head_size = config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads self.num_layers = int(extra_options["num_hidden_layers"]) if "num_hidden_layers" in extra_options else config.num_hidden_layers @@ -526,6 +530,14 @@ def make_gather(self, name, inputs, axis): self.make_node("Gather", inputs=inputs, outputs=[output], name=name, axis=axis) self.make_value_info(output, TensorProto.INT64, shape=[]) + def make_split(self, name, inputs, dtype, shape, axis, num_splits): + # Splits the input tensor into num_splits based on the axis + outputs = [f"{name}/output_{i}" for i in range(num_splits)] + split = [num_splits for i in range(num_splits)] + self.make_node("Split", inputs=[inputs], outputs=outputs, name=name, axis=axis, split = split) + for output in outputs: + self.make_value_info(output, dtype, shape=shape) + def make_reshape(self, name, inputs, dtype, shape): output = f"{name}/output_0" self.make_node("Reshape", inputs=inputs, outputs=[output], name=name) @@ -1663,8 +1675,51 @@ def make_gelu(self, layer_id, root_input, activation): return gelu_name + def make_swiglu(self, layer_id, root_input, activation, domain): + # Make nodes for this activation subgraph + # + # root_input (GateProjMatMul) + # / \ + # split/output_0 split/output_1 + # / | | + # ActFunc | | + # \ | | + # Mul | + # \ | + # \ | + # Mul + act_name = f"/model/layers.{layer_id}/mlp/act_fn" + + # Split the input into two parts along the last dimension + # When using swiglu the MLP projects to 2 times the intermediate_size + split_act_name = f"{act_name}/split" + num_splits = 2 + self.make_split(split_act_name, root_input, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size], axis = -1, num_splits=num_splits) + split_act_out_name_0 = f"{split_act_name}/output_0" + split_act_out_name_1 = f"{split_act_name}/output_1" + + act_name = f"{split_act_name}/{activation}" + act_func_output = f"{act_name}/output_0" + self.make_node(activation, inputs=[split_act_out_name_0], outputs=[act_func_output], name=act_name, domain=domain) + self.make_value_info(act_func_output, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) + + mul_act_name_0 = f"{act_name}/Mul_0" + mul_act_inputs_0 = [split_act_out_name_0, act_func_output] + self.make_mul(mul_act_name_0, mul_act_inputs_0, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) + + mul_act_name_1 = f"{act_name}/Mul_1" + mul_0_output = f"{mul_act_name_0}/output_0" + mul_act_inputs_1 = [split_act_out_name_1, mul_0_output] + self.make_mul(mul_act_name_1, mul_act_inputs_1, dtype=self.io_dtype, 
shape=["batch_size", "sequence_length", self.intermediate_size]) + + return mul_act_name_1 + + def make_activation(self, layer_id, root_input): - if self.activation in {"silu", "swish"}: + + if self.activation in {"swiglu"}: + output_name = self.make_swiglu(layer_id, root_input, activation="Sigmoid", domain=None) + elif self.activation in {"silu", "swish"}: output_name = self.make_activation_with_mul(layer_id, root_input, activation="Sigmoid", domain=None) elif self.activation in {"gelu_new", "gelu_fast", "gelu_pytorch_tanh"}: output_name = self.make_gelu(layer_id, root_input, activation="FastGelu") @@ -1752,7 +1807,10 @@ def make_model(self, input_path): # Loop through model and map each module to ONNX/ORT ops self.layer_id = 0 + model_name = "ChatGLM" if "ChatGLM" in model.__class__.__name__ else "" for module in model.modules(): + print("##########") + print(module) if isinstance(module, torch.nn.Embedding) or (hasattr(model, "embedding") and module == model.embedding): # Checks (Hugging Face logic) or (GGUF logic) if not self.exclude_embeds: @@ -1770,10 +1828,15 @@ def make_model(self, input_path): self.make_layer(self.layer_id, module) self.layer_id += 1 - elif self.layer_id == self.num_layers and self.has_final_norm(module, model): + elif module.__class__.__name__.endswith("GLMBlock") and self.layer_id < self.num_layers: + print(f"Reading decoder layer {self.layer_id}") + self.make_layer(self.layer_id, module) + self.layer_id += 1 + + elif self.layer_id == self.num_layers and self.has_final_norm(module, model, model_name): # SkipLayerNorm after last decoder layer (MatMul --> SkipLayerNorm) print("Reading final norm") - self.make_layernorm(self.layer_id, module, skip=True, simple=self.layernorm_attrs["simple"], location="final_norm") + # self.make_layernorm(self.layer_id, module, skip=True, simple=self.layernorm_attrs["simple"], location="final_norm") elif (isinstance(module, torch.nn.Linear) and module.out_features == self.vocab_size) or (hasattr(model, "lm_head") and module == model.lm_head): # Checks (Hugging Face logic) or (GGUF logic) @@ -1784,10 +1847,14 @@ def make_model(self, input_path): del model - def has_final_norm(self, module, model): + def has_final_norm(self, module, model, model_name): # Hugging Face names hf_norm = hasattr(model, "model") and hasattr(model.model, "norm") and module == model.model.norm - hf_final_layernorm = hasattr(model, "model") and hasattr(model.model, "final_layernorm") and module == model.model.final_layernorm + if(model_name == "ChatGLM"): + hf_final_layernorm = hasattr(model, "transformer") and hasattr(model.transformer, "encoder") and hasattr(model.transformer.encoder, "final_layernorm") and module == model.transformer.encoder.final_layernorm + else: + hf_final_layernorm = hasattr(model, "model") and hasattr(model.model, "final_layernorm") and module == model.model.final_layernorm + # GGUF names gguf_final_norm = hasattr(model, "final_norm") and module == model.final_norm return hf_norm or hf_final_layernorm or gguf_final_norm @@ -2613,6 +2680,158 @@ def make_layer(self, layer_id, layer): self.layernorm_attrs["last_layernorm"] = True +class ChatGLMModel(Model): + def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): + super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) + # self.input_shapes["position_ids"] = [1] # Note: This is optional and only needed if you want position_ids to be an int instead of a 2D tensor + # self.layernorm_attrs["first_layernorm"] = False # Manually use Residuals to 
no SkipLayerNorms + self.layernorm_attrs["simple"] = True # RMSNorm in ChatGLM + self.rotemb_attrs["num_heads"] = self.num_attn_heads + self.rotemb_attrs["rotary_embedding_dim"] = int(self.head_size * self.rotemb_attrs["partial_rotary_factor"]) + self.mlp_attrs["use_proj"], self.mlp_attrs["use_fc"] = True, False + self.attention_attrs["op_type"] = "GroupQueryAttention" if self.multi_query_attention else self.attention_attrs["op_type"] + + def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): + super().make_rotary_embedding(rotemb, name, root_input, num_heads=self.rotemb_attrs["num_heads"], rotary_embedding_dim=self.rotemb_attrs["rotary_embedding_dim"], **kwargs) + + + def make_attention(self, layer_id, attention, root_input, **kwargs): + #Designed from SelfAttention function of medeling_chatglm.py + hidden_size = self.hidden_size + num_attention_heads = self.num_attn_heads + kv_channels = self.kv_channels + head_size = self.head_size + + projection_size = kv_channels * num_attention_heads + hidden_size_per_attention_head = projection_size // num_attention_heads + + multi_query_attention = self.attention_attrs["op_type"] == "GroupQueryAttention" + multi_query_group_num = self.num_kv_heads if multi_query_attention else num_attention_heads + + if multi_query_attention: + qkv_hidden_size = projection_size + 2 * hidden_size_per_attention_head * multi_query_group_num + else: + qkv_hidden_size = 3 * projection_size + + # Reshape the QKV weight + qkv_weight = attention.query_key_value.weight.T + + if multi_query_attention: + q_weight, k_weight, v_weight = qkv_weight.split( + [ + num_attention_heads * hidden_size_per_attention_head, + multi_query_group_num * hidden_size_per_attention_head, + multi_query_group_num * hidden_size_per_attention_head, + ], + dim=1 + ) + else: + q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=1) + + # Reshape the QKV bias if it exists + if attention.query_key_value.bias is not None: + qkv_bias = attention.query_key_value.bias + if multi_query_attention: + q_bias, k_bias, v_bias = qkv_bias.split( + [ + num_attention_heads * hidden_size_per_attention_head, + multi_query_group_num * hidden_size_per_attention_head, + multi_query_group_num * hidden_size_per_attention_head, + ] + ) + else: + q_bias, k_bias, v_bias = qkv_bias.chunk(3) + else: + q_bias = k_bias = v_bias = None + + # Create separate Q, K, V projections + attention.q_proj = torch.nn.Linear(hidden_size, num_attention_heads * hidden_size_per_attention_head, bias=q_bias is not None) + attention.q_proj.weight = torch.nn.Parameter(q_weight.T) + if q_bias is not None: + attention.q_proj.bias = torch.nn.Parameter(q_bias) + + kv_size = multi_query_group_num * hidden_size_per_attention_head + + attention.k_proj = torch.nn.Linear(hidden_size, kv_size, bias=k_bias is not None) + attention.k_proj.weight = torch.nn.Parameter(k_weight.T) + if k_bias is not None: + attention.k_proj.bias = torch.nn.Parameter(k_bias) + + attention.v_proj = torch.nn.Linear(hidden_size, kv_size, bias=v_bias is not None) + attention.v_proj.weight = torch.nn.Parameter(v_weight.T) + if v_bias is not None: + attention.v_proj.bias = torch.nn.Parameter(v_bias) + + # Remove the original combined QKV projection + del attention.query_key_value + del qkv_weight + del qkv_bias + # Add dummy rotary_emb attribute + attention.rotary_emb = type("RotaryEmbedding", (object,), {'content':{}})() + + super().make_attention(layer_id, attention, root_input, **kwargs) + + def make_mlp_proj(self, layer_id, mlp, root_input): + # Make nodes for the MLP 
subgraph + # + # root_input + # | + # dense_h_to_4h + # | + # Activation + # | + # dense_4h_to_h + # Make MatMul nodes + # gate_basename = f"/model/layers.{layer_id}/mlp/gate_proj/MatMul" + # gate_name = self.make_matmul(mlp.gate_proj, gate_basename, root_input) + up_basename = f"/model/layers.{layer_id}/mlp/dense_h_to_4h/MatMul" + up_name = self.make_matmul(mlp.dense_h_to_4h, up_basename, root_input) + # Make activation node(s) + act_fn_name = self.make_activation(layer_id, root_input=f"{up_name}/output_0") + # # Make Mul node after activation + # mul_name = f"/model/layers.{layer_id}/mlp/Mul" + # mul_inputs = [f"{act_fn_name}/output_0", f"{up_name}/output_0"] + # self.make_mul(mul_name, mul_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) + # Make output MatMul node + down_basename = f"/model/layers.{layer_id}/mlp/dense_4h_to_h/MatMul" + down_name = self.make_matmul(mlp.dense_4h_to_h, down_basename, f"{act_fn_name}/output_0") + # Assign output 0 of previous MatMul as skip input to next SkipLayerNorm + self.layernorm_attrs["skip_input"] = f"{down_name}/output_0" + + + def make_layer(self, layer_id, layer): + # Each GLM encoder is defined as follows all LayerNorms are RMSNorms: + # input_layernorm --> self_attention --> residual_add_pre_input_layernorm --> layernorm --> dense --> residual_add_pre_last_layernorm + #TODO: @amd-sudo-sh Add the conditional statement for different residual configuration. + root_input = self.layernorm_attrs["root_input"] + self.make_layernorm(layer_id, layer.input_layernorm, skip=False, simple=self.layernorm_attrs["simple"], location="input") + self.make_attention(layer_id, layer.self_attention, root_input=self.layernorm_attrs["output_0"]) + + residual_add_name_0 = f"/model/layers.{layer_id}/residual_add/Add_0" + residual_add_inputs_0 = [self.layernorm_attrs['skip_input'], root_input] + next_residual_input = self.layernorm_attrs['skip_input'] + self.make_add(residual_add_name_0, residual_add_inputs_0, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + + self.layernorm_attrs["root_input"] = f"{residual_add_name_0}/output_0" + self.make_layernorm(layer_id, layer.input_layernorm, skip=False, simple=self.layernorm_attrs["simple"], location="middle") + + self.make_mlp(layer_id, layer.mlp, root_input=self.layernorm_attrs["output_0"]) #modifies the self.layernorm_attrs['skip_input'] + + residual_add_name_1 = f"/model/layers.{layer_id}/residual_add/Add_1" + residual_add_inputs_1 = [self.layernorm_attrs['skip_input'], next_residual_input] + self.make_add(residual_add_name_1, residual_add_inputs_1, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + + self.layernorm_attrs["root_input"] = f"{residual_add_name_1}/output_0" + + if layer_id == self.num_layers - 1: + # Norm after last decoder layer of model (last layer --> norm) + self.layernorm_attrs["last_layernorm"] = True + + # Assign output 0 of residual Add as skip input to next SkipLayerNorm + self.layernorm_attrs["skip_input"] = f"{residual_add_name_1}/output_0" + + + def check_extra_options(kv_pairs): if "use_8bits_moe" in kv_pairs: assert(kv_pairs["use_8bits_moe"] == "1" or kv_pairs["use_8bits_moe"] == "0"), "use_8bits_moe must be 0 or 1." 
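For reference, a minimal standalone sketch of the packed query_key_value split that the ChatGLMModel.make_attention override above performs, assuming ChatGLM3-6B shapes (hidden_size 4096, 32 attention heads, kv_channels 128, multi_query_group_num 2); the variable names below are illustrative only and are not part of builder.py:

import torch

hidden_size, num_heads, kv_channels, group_num = 4096, 32, 128, 2
head_dim = (kv_channels * num_heads) // num_heads            # hidden_size_per_attention_head
# Packed projection: q (32*128) + k (2*128) + v (2*128) = 4608, matching the Linear(4096, 4608) in the log above
qkv = torch.nn.Linear(hidden_size, num_heads * head_dim + 2 * group_num * head_dim, bias=True)
qkv_weight = qkv.weight.T                                     # shape (4096, 4608)
q_w, k_w, v_w = qkv_weight.split(
    [num_heads * head_dim, group_num * head_dim, group_num * head_dim], dim=-1
)
print(q_w.shape, k_w.shape, v_w.shape)                        # (4096, 4096) (4096, 256) (4096, 256)
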
@@ -2682,6 +2901,13 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid onnx_model = Phi3VModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) elif config.architectures[0] == "Qwen2ForCausalLM": onnx_model = QwenModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) + elif config.architectures[0] == "ChatGLMModel": + #TODO: @amd-sudo-sh: Encapsulate the config parsing in a better way + config.max_position_embeddings = config.seq_length # Max sequence length a model can handle + config.intermediate_size = config.ffn_hidden_size # Size of feed-forward network's hidden layer + config.num_hidden_layers = config.num_layers + config.hidden_act = "swiglu" + onnx_model = ChatGLMModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) else: raise NotImplementedError(f"The {hf_name} model is not currently supported.") From a51d036e62d637d70661769064065aa71a24e3d9 Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Fri, 13 Sep 2024 21:05:37 +0000 Subject: [PATCH 02/17] builder_changes --- src/python/py/models/builder.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index ee9d4ee41..aab232e4b 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -531,10 +531,11 @@ def make_gather(self, name, inputs, axis): self.make_value_info(output, TensorProto.INT64, shape=[]) def make_split(self, name, inputs, dtype, shape, axis, num_splits): + #TODO: @amd-sudo-sh: Currently it supports num_splits = 2 # Splits the input tensor into num_splits based on the axis outputs = [f"{name}/output_{i}" for i in range(num_splits)] - split = [num_splits for i in range(num_splits)] - self.make_node("Split", inputs=[inputs], outputs=outputs, name=name, axis=axis, split = split) + # split = [num_splits for i in range(num_splits)] + self.make_node("Split", inputs=[inputs], outputs=outputs, name=name, axis=axis) for output in outputs: self.make_value_info(output, dtype, shape=shape) @@ -1836,12 +1837,15 @@ def make_model(self, input_path): elif self.layer_id == self.num_layers and self.has_final_norm(module, model, model_name): # SkipLayerNorm after last decoder layer (MatMul --> SkipLayerNorm) print("Reading final norm") - # self.make_layernorm(self.layer_id, module, skip=True, simple=self.layernorm_attrs["simple"], location="final_norm") - + print(self.layernorm_attrs["root_input"]) + self.make_layernorm(self.layer_id, module, skip=False, simple=self.layernorm_attrs["simple"], location="final_norm") + self.layernorm_attrs["root_input"] = self.layernorm_attrs["output_0"] + print(self.layernorm_attrs["root_input"]) elif (isinstance(module, torch.nn.Linear) and module.out_features == self.vocab_size) or (hasattr(model, "lm_head") and module == model.lm_head): # Checks (Hugging Face logic) or (GGUF logic) if not self.exclude_lm_head: # Language modeling head (SkipLayerNorm --> logits) + print(self.layernorm_attrs["root_input"]) print("Reading LM head") self.make_lm_head(module) @@ -2687,8 +2691,11 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): # self.layernorm_attrs["first_layernorm"] = False # Manually use Residuals to no SkipLayerNorms self.layernorm_attrs["simple"] = True # RMSNorm in ChatGLM self.rotemb_attrs["num_heads"] = self.num_attn_heads + self.rotemb_attrs["partial_rotary_factor"] = 0.5 # Line 755 of modeling_chatglm.py check 
self.rotary_pos_emb declaration self.rotemb_attrs["rotary_embedding_dim"] = int(self.head_size * self.rotemb_attrs["partial_rotary_factor"]) self.mlp_attrs["use_proj"], self.mlp_attrs["use_fc"] = True, False + self.attention_attrs["use_rotemb_in_attn"] = True + self.attention_attrs["use_packed_matmul"] = True self.attention_attrs["op_type"] = "GroupQueryAttention" if self.multi_query_attention else self.attention_attrs["op_type"] def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): @@ -2700,7 +2707,7 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): hidden_size = self.hidden_size num_attention_heads = self.num_attn_heads kv_channels = self.kv_channels - head_size = self.head_size + # head_size = self.head_size projection_size = kv_channels * num_attention_heads hidden_size_per_attention_head = projection_size // num_attention_heads @@ -2723,10 +2730,10 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): multi_query_group_num * hidden_size_per_attention_head, multi_query_group_num * hidden_size_per_attention_head, ], - dim=1 + dim=-1 ) else: - q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=1) + q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=-1) # Reshape the QKV bias if it exists if attention.query_key_value.bias is not None: @@ -2782,17 +2789,12 @@ def make_mlp_proj(self, layer_id, mlp, root_input): # | # dense_4h_to_h # Make MatMul nodes - # gate_basename = f"/model/layers.{layer_id}/mlp/gate_proj/MatMul" - # gate_name = self.make_matmul(mlp.gate_proj, gate_basename, root_input) + up_basename = f"/model/layers.{layer_id}/mlp/dense_h_to_4h/MatMul" up_name = self.make_matmul(mlp.dense_h_to_4h, up_basename, root_input) # Make activation node(s) act_fn_name = self.make_activation(layer_id, root_input=f"{up_name}/output_0") - # # Make Mul node after activation - # mul_name = f"/model/layers.{layer_id}/mlp/Mul" - # mul_inputs = [f"{act_fn_name}/output_0", f"{up_name}/output_0"] - # self.make_mul(mul_name, mul_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) - # Make output MatMul node + down_basename = f"/model/layers.{layer_id}/mlp/dense_4h_to_h/MatMul" down_name = self.make_matmul(mlp.dense_4h_to_h, down_basename, f"{act_fn_name}/output_0") # Assign output 0 of previous MatMul as skip input to next SkipLayerNorm @@ -2813,7 +2815,7 @@ def make_layer(self, layer_id, layer): self.make_add(residual_add_name_0, residual_add_inputs_0, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) self.layernorm_attrs["root_input"] = f"{residual_add_name_0}/output_0" - self.make_layernorm(layer_id, layer.input_layernorm, skip=False, simple=self.layernorm_attrs["simple"], location="middle") + self.make_layernorm(layer_id, layer.input_layernorm, skip=False, simple=self.layernorm_attrs["simple"], location="post_attention") self.make_mlp(layer_id, layer.mlp, root_input=self.layernorm_attrs["output_0"]) #modifies the self.layernorm_attrs['skip_input'] From bc46b1cac5a1fc06dc16fa461aa27bb720992364 Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Wed, 18 Sep 2024 04:24:56 +0000 Subject: [PATCH 03/17] parity_checked_chatglm_model --- src/python/py/models/builder.py | 51 +++++++++++---------------------- 1 file changed, 16 insertions(+), 35 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index aab232e4b..8626e0986 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -1810,13 +1810,13 @@ def 
make_model(self, input_path): self.layer_id = 0 model_name = "ChatGLM" if "ChatGLM" in model.__class__.__name__ else "" for module in model.modules(): - print("##########") - print(module) + # print("##########") + # print(module) if isinstance(module, torch.nn.Embedding) or (hasattr(model, "embedding") and module == model.embedding): # Checks (Hugging Face logic) or (GGUF logic) if not self.exclude_embeds: # Embedding layer - print("Reading embedding layer") + # print("Reading embedding layer") self.make_embedding(module.weight.detach().numpy()) else: # Exclude embedding layer from model @@ -1837,15 +1837,15 @@ def make_model(self, input_path): elif self.layer_id == self.num_layers and self.has_final_norm(module, model, model_name): # SkipLayerNorm after last decoder layer (MatMul --> SkipLayerNorm) print("Reading final norm") - print(self.layernorm_attrs["root_input"]) - self.make_layernorm(self.layer_id, module, skip=False, simple=self.layernorm_attrs["simple"], location="final_norm") - self.layernorm_attrs["root_input"] = self.layernorm_attrs["output_0"] - print(self.layernorm_attrs["root_input"]) + # print(self.layernorm_attrs["root_input"]) + self.make_layernorm(self.layer_id, module, skip=True, simple=self.layernorm_attrs["simple"], location="final_norm") + # self.layernorm_attrs["root_input"] = self.layernorm_attrs["output_0"] + # print(self.layernorm_attrs["root_input"]) elif (isinstance(module, torch.nn.Linear) and module.out_features == self.vocab_size) or (hasattr(model, "lm_head") and module == model.lm_head): # Checks (Hugging Face logic) or (GGUF logic) if not self.exclude_lm_head: # Language modeling head (SkipLayerNorm --> logits) - print(self.layernorm_attrs["root_input"]) + # print(self.layernorm_attrs["root_input"]) print("Reading LM head") self.make_lm_head(module) @@ -2688,11 +2688,11 @@ class ChatGLMModel(Model): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) # self.input_shapes["position_ids"] = [1] # Note: This is optional and only needed if you want position_ids to be an int instead of a 2D tensor - # self.layernorm_attrs["first_layernorm"] = False # Manually use Residuals to no SkipLayerNorms self.layernorm_attrs["simple"] = True # RMSNorm in ChatGLM self.rotemb_attrs["num_heads"] = self.num_attn_heads self.rotemb_attrs["partial_rotary_factor"] = 0.5 # Line 755 of modeling_chatglm.py check self.rotary_pos_emb declaration self.rotemb_attrs["rotary_embedding_dim"] = int(self.head_size * self.rotemb_attrs["partial_rotary_factor"]) + self.rotemb_attrs["interleaved"] = 1 self.mlp_attrs["use_proj"], self.mlp_attrs["use_fc"] = True, False self.attention_attrs["use_rotemb_in_attn"] = True self.attention_attrs["use_packed_matmul"] = True @@ -2783,11 +2783,12 @@ def make_mlp_proj(self, layer_id, mlp, root_input): # # root_input # | - # dense_h_to_4h + # dense_h_to_4h #Misnomer, it is increased to 2h instead of 4h # | # Activation # | # dense_4h_to_h + # # Make MatMul nodes up_basename = f"/model/layers.{layer_id}/mlp/dense_h_to_4h/MatMul" @@ -2802,37 +2803,17 @@ def make_mlp_proj(self, layer_id, mlp, root_input): def make_layer(self, layer_id, layer): - # Each GLM encoder is defined as follows all LayerNorms are RMSNorms: - # input_layernorm --> self_attention --> residual_add_pre_input_layernorm --> layernorm --> dense --> residual_add_pre_last_layernorm - #TODO: @amd-sudo-sh Add the conditional statement for different residual configuration. 
- root_input = self.layernorm_attrs["root_input"] - self.make_layernorm(layer_id, layer.input_layernorm, skip=False, simple=self.layernorm_attrs["simple"], location="input") + # Each GLM encoder is defined as follows all LayerNorms are RMSNorms (Residual and Norm order same as LLama Model): + self.make_layernorm(layer_id, layer.input_layernorm, skip=not self.layernorm_attrs["first_layernorm"], simple=self.layernorm_attrs["simple"], location="input") self.make_attention(layer_id, layer.self_attention, root_input=self.layernorm_attrs["output_0"]) - - residual_add_name_0 = f"/model/layers.{layer_id}/residual_add/Add_0" - residual_add_inputs_0 = [self.layernorm_attrs['skip_input'], root_input] - next_residual_input = self.layernorm_attrs['skip_input'] - self.make_add(residual_add_name_0, residual_add_inputs_0, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) - - self.layernorm_attrs["root_input"] = f"{residual_add_name_0}/output_0" - self.make_layernorm(layer_id, layer.input_layernorm, skip=False, simple=self.layernorm_attrs["simple"], location="post_attention") - - self.make_mlp(layer_id, layer.mlp, root_input=self.layernorm_attrs["output_0"]) #modifies the self.layernorm_attrs['skip_input'] - - residual_add_name_1 = f"/model/layers.{layer_id}/residual_add/Add_1" - residual_add_inputs_1 = [self.layernorm_attrs['skip_input'], next_residual_input] - self.make_add(residual_add_name_1, residual_add_inputs_1, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + self.make_layernorm(layer_id, layer.post_attention_layernorm, skip=True, simple=self.layernorm_attrs["simple"], location="post_attention") + self.make_mlp(layer_id, layer.mlp, root_input=self.layernorm_attrs["output_0"]) - self.layernorm_attrs["root_input"] = f"{residual_add_name_1}/output_0" - + self.layernorm_attrs["first_layernorm"] = False if layer_id == self.num_layers - 1: # Norm after last decoder layer of model (last layer --> norm) self.layernorm_attrs["last_layernorm"] = True - # Assign output 0 of residual Add as skip input to next SkipLayerNorm - self.layernorm_attrs["skip_input"] = f"{residual_add_name_1}/output_0" - - def check_extra_options(kv_pairs): if "use_8bits_moe" in kv_pairs: From 1c90219df0dc16d68ec1050ab30c948826da6f51 Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Thu, 19 Sep 2024 17:40:21 +0000 Subject: [PATCH 04/17] refractor_codebase --- src/python/py/models/ChatGLM_modules.log | 128 ----------------------- src/python/py/models/builder.py | 56 +++++----- 2 files changed, 26 insertions(+), 158 deletions(-) delete mode 100644 src/python/py/models/ChatGLM_modules.log diff --git a/src/python/py/models/ChatGLM_modules.log b/src/python/py/models/ChatGLM_modules.log deleted file mode 100644 index 03732df1b..000000000 --- a/src/python/py/models/ChatGLM_modules.log +++ /dev/null @@ -1,128 +0,0 @@ -Valid precision + execution provider combinations are: FP32 CPU, FP32 CUDA, FP16 CUDA, FP16 DML, INT4 CPU, INT4 CUDA, INT4 DML -Extra options: {} -GroupQueryAttention (GQA) is used in this model. 
-########## -ChatGLMForConditionalGeneration( - (transformer): ChatGLMModel( - (embedding): Embedding( - (word_embeddings): Embedding(65024, 4096) - ) - (rotary_pos_emb): RotaryEmbedding() - (encoder): GLMTransformer( - (layers): ModuleList( - (0-27): 28 x GLMBlock( - (input_layernorm): RMSNorm() - (self_attention): SelfAttention( - (query_key_value): Linear(in_features=4096, out_features=4608, bias=True) - (core_attention): CoreAttention( - (attention_dropout): Dropout(p=0.0, inplace=False) - ) - (dense): Linear(in_features=4096, out_features=4096, bias=False) - ) - (post_attention_layernorm): RMSNorm() - (mlp): MLP( - (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False) - (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) - ) - ) - ) - (final_layernorm): RMSNorm() - ) - (output_layer): Linear(in_features=4096, out_features=65024, bias=False) - ) -) -########## -ChatGLMModel( - (embedding): Embedding( - (word_embeddings): Embedding(65024, 4096) - ) - (rotary_pos_emb): RotaryEmbedding() - (encoder): GLMTransformer( - (layers): ModuleList( - (0-27): 28 x GLMBlock( - (input_layernorm): RMSNorm() - (self_attention): SelfAttention( - (query_key_value): Linear(in_features=4096, out_features=4608, bias=True) - (core_attention): CoreAttention( - (attention_dropout): Dropout(p=0.0, inplace=False) - ) - (dense): Linear(in_features=4096, out_features=4096, bias=False) - ) - (post_attention_layernorm): RMSNorm() - (mlp): MLP( - (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False) - (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) - ) - ) - ) - (final_layernorm): RMSNorm() - ) - (output_layer): Linear(in_features=4096, out_features=65024, bias=False) -) -########## -Embedding( - (word_embeddings): Embedding(65024, 4096) -) -########## -Embedding(65024, 4096) -Reading embedding layer -########## -RotaryEmbedding() -########## -GLMTransformer( - (layers): ModuleList( - (0-27): 28 x GLMBlock( - (input_layernorm): RMSNorm() - (self_attention): SelfAttention( - (query_key_value): Linear(in_features=4096, out_features=4608, bias=True) - (core_attention): CoreAttention( - (attention_dropout): Dropout(p=0.0, inplace=False) - ) - (dense): Linear(in_features=4096, out_features=4096, bias=False) - ) - (post_attention_layernorm): RMSNorm() - (mlp): MLP( - (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False) - (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) - ) - ) - ) - (final_layernorm): RMSNorm() -) -########## -ModuleList( - (0-27): 28 x GLMBlock( - (input_layernorm): RMSNorm() - (self_attention): SelfAttention( - (query_key_value): Linear(in_features=4096, out_features=4608, bias=True) - (core_attention): CoreAttention( - (attention_dropout): Dropout(p=0.0, inplace=False) - ) - (dense): Linear(in_features=4096, out_features=4096, bias=False) - ) - (post_attention_layernorm): RMSNorm() - (mlp): MLP( - (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False) - (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) - ) - ) -) -########## -GLMBlock( - (input_layernorm): RMSNorm() - (self_attention): SelfAttention( - (query_key_value): Linear(in_features=4096, out_features=4608, bias=True) - (core_attention): CoreAttention( - (attention_dropout): Dropout(p=0.0, inplace=False) - ) - (dense): Linear(in_features=4096, out_features=4096, bias=False) - ) - (post_attention_layernorm): RMSNorm() - (mlp): MLP( - (dense_h_to_4h): 
Linear(in_features=4096, out_features=27392, bias=False) - (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) - ) -) -Reading decoder layer 0 -['T_destination', '__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_allocate_memory', '_apply', '_backward_hooks', '_backward_pre_hooks', '_buffers', '_call_impl', '_compiled_call_impl', '_forward_hooks', '_forward_hooks_always_called', '_forward_hooks_with_kwargs', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_get_backward_hooks', '_get_backward_pre_hooks', '_get_name', '_is_full_backward_hook', '_is_hf_initialized', '_load_from_state_dict', '_load_state_dict_post_hooks', '_load_state_dict_pre_hooks', '_maybe_warn_non_full_backward_hook', '_modules', '_named_members', '_non_persistent_buffers_set', '_parameters', '_register_load_state_dict_pre_hook', '_register_state_dict_hook', '_replicate_for_data_parallel', '_save_to_state_dict', '_slow_forward', '_state_dict_hooks', '_state_dict_pre_hooks', '_version', '_wrapped_call_impl', 'add_module', 'apply', 'bfloat16', 'buffers', 'call_super_init', 'children', 'compile', 'core_attention', 'cpu', 'cuda', 'dense', 'double', 'dump_patches', 'eval', 'extra_repr', 'float', 'forward', 'get_buffer', 'get_extra_state', 'get_parameter', 'get_submodule', 'half', 'hidden_size_per_attention_head', 'ipu', 'layer_number', 'load_state_dict', 'modules', 'mtia', 'multi_query_attention', 'named_buffers', 'named_children', 'named_modules', 'named_parameters', 'num_attention_heads_per_partition', 'num_multi_query_groups_per_partition', 'parameters', 'projection_size', 'qkv_hidden_size', 'query_key_value', 'register_backward_hook', 'register_buffer', 'register_forward_hook', 'register_forward_pre_hook', 'register_full_backward_hook', 'register_full_backward_pre_hook', 'register_load_state_dict_post_hook', 'register_load_state_dict_pre_hook', 'register_module', 'register_parameter', 'register_state_dict_post_hook', 'register_state_dict_pre_hook', 'requires_grad_', 'set_extra_state', 'set_submodule', 'share_memory', 'state_dict', 'to', 'to_empty', 'train', 'training', 'type', 'xpu', 'zero_grad'] -True diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 8626e0986..89c531683 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -21,11 +21,11 @@ class Model: - def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): - self.context_length = config.max_position_embeddings - self.original_context_length = config.original_max_position_embeddings if hasattr(config, "original_max_position_embeddings") else config.rope_scaling["original_max_position_embeddings"] if hasattr(config, "rope_scaling") and hasattr(config.rope_scaling, "original_max_position_embeddings") else config.max_position_embeddings + def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): + self.context_length = config.max_position_embeddings if hasattr(config, "max_position_embeddings") else config.seq_length + self.original_context_length = config.original_max_position_embeddings if hasattr(config, "original_max_position_embeddings") else 
config.rope_scaling["original_max_position_embeddings"] if hasattr(config, "rope_scaling") and hasattr(config.rope_scaling, "original_max_position_embeddings") else config.max_position_embeddings if hasattr(config, "max_position_embeddings") else config.seq_length self.window_size = config.sliding_window if hasattr(config, "sliding_window") else -1 # default is -1 in GroupQueryAttention kernel - self.intermediate_size = config.intermediate_size + self.intermediate_size = config.intermediate_size if hasattr(config, "intermediate_size") else config.ffn_hidden_size self.hidden_size = config.hidden_size self.num_kv_heads = config.num_key_value_heads if hasattr(config, "num_key_value_heads") else config.num_attention_heads self.num_kv_heads = config.multi_query_group_num if hasattr(config, "multi_query_group_num") else self.num_kv_heads @@ -34,7 +34,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): # self.multi_query_group_num = config.multi_query_group_num if hasattr(config, "multi_query_group_num") else 1 # group_num as 1 is vanilla Multi-query attention https://arxiv.org/pdf/2305.13245 self.num_attn_heads = config.num_attention_heads self.head_size = config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads - self.num_layers = int(extra_options["num_hidden_layers"]) if "num_hidden_layers" in extra_options else config.num_hidden_layers + self.num_layers = int(extra_options["num_hidden_layers"]) if "num_hidden_layers" in extra_options else config.num_hidden_layers if hasattr(config, "num_hidden_layers") else config.num_layers self.vocab_size = config.vocab_size self.activation = config.hidden_activation if hasattr(config, "hidden_activation") and config.hidden_activation is not None else config.hidden_act @@ -1808,15 +1808,13 @@ def make_model(self, input_path): # Loop through model and map each module to ONNX/ORT ops self.layer_id = 0 - model_name = "ChatGLM" if "ChatGLM" in model.__class__.__name__ else "" for module in model.modules(): - # print("##########") - # print(module) + if isinstance(module, torch.nn.Embedding) or (hasattr(model, "embedding") and module == model.embedding): # Checks (Hugging Face logic) or (GGUF logic) if not self.exclude_embeds: # Embedding layer - # print("Reading embedding layer") + print("Reading embedding layer") self.make_embedding(module.weight.detach().numpy()) else: # Exclude embedding layer from model @@ -1834,30 +1832,25 @@ def make_model(self, input_path): self.make_layer(self.layer_id, module) self.layer_id += 1 - elif self.layer_id == self.num_layers and self.has_final_norm(module, model, model_name): + elif self.layer_id == self.num_layers and self.has_final_norm(module, model): # SkipLayerNorm after last decoder layer (MatMul --> SkipLayerNorm) print("Reading final norm") - # print(self.layernorm_attrs["root_input"]) self.make_layernorm(self.layer_id, module, skip=True, simple=self.layernorm_attrs["simple"], location="final_norm") - # self.layernorm_attrs["root_input"] = self.layernorm_attrs["output_0"] - # print(self.layernorm_attrs["root_input"]) + elif (isinstance(module, torch.nn.Linear) and module.out_features == self.vocab_size) or (hasattr(model, "lm_head") and module == model.lm_head): # Checks (Hugging Face logic) or (GGUF logic) if not self.exclude_lm_head: # Language modeling head (SkipLayerNorm --> logits) - # print(self.layernorm_attrs["root_input"]) print("Reading LM head") self.make_lm_head(module) del model - def has_final_norm(self, module, model, 
model_name): + def has_final_norm(self, module, model): # Hugging Face names hf_norm = hasattr(model, "model") and hasattr(model.model, "norm") and module == model.model.norm - if(model_name == "ChatGLM"): - hf_final_layernorm = hasattr(model, "transformer") and hasattr(model.transformer, "encoder") and hasattr(model.transformer.encoder, "final_layernorm") and module == model.transformer.encoder.final_layernorm - else: - hf_final_layernorm = hasattr(model, "model") and hasattr(model.model, "final_layernorm") and module == model.model.final_layernorm + + hf_final_layernorm = hasattr(model, "model") and hasattr(model.model, "final_layernorm") and module == model.model.final_layernorm # GGUF names gguf_final_norm = hasattr(model, "final_norm") and module == model.final_norm @@ -2697,7 +2690,15 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.attention_attrs["use_rotemb_in_attn"] = True self.attention_attrs["use_packed_matmul"] = True self.attention_attrs["op_type"] = "GroupQueryAttention" if self.multi_query_attention else self.attention_attrs["op_type"] - + + def has_final_norm(self, module, model): + # Hugging Face names + hf_norm = hasattr(model, "model") and hasattr(model.model, "norm") and module == model.model.norm + hf_final_layernorm = hasattr(model, "transformer") and hasattr(model.transformer, "encoder") and hasattr(model.transformer.encoder, "final_layernorm") and module == model.transformer.encoder.final_layernorm + # GGUF names + gguf_final_norm = hasattr(model, "final_norm") and module == model.final_norm + return hf_norm or hf_final_layernorm or gguf_final_norm + def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): super().make_rotary_embedding(rotemb, name, root_input, num_heads=self.rotemb_attrs["num_heads"], rotary_embedding_dim=self.rotemb_attrs["rotary_embedding_dim"], **kwargs) @@ -2715,11 +2716,6 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): multi_query_attention = self.attention_attrs["op_type"] == "GroupQueryAttention" multi_query_group_num = self.num_kv_heads if multi_query_attention else num_attention_heads - if multi_query_attention: - qkv_hidden_size = projection_size + 2 * hidden_size_per_attention_head * multi_query_group_num - else: - qkv_hidden_size = 3 * projection_size - # Reshape the QKV weight qkv_weight = attention.query_key_value.weight.T @@ -2783,7 +2779,7 @@ def make_mlp_proj(self, layer_id, mlp, root_input): # # root_input # | - # dense_h_to_4h #Misnomer, it is increased to 2h instead of 4h + # dense_h_to_4h #Misnomer, it is increased to 2h instead of 4h, therefore in swiglu the intermediate size is same # | # Activation # | @@ -2885,10 +2881,10 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid elif config.architectures[0] == "Qwen2ForCausalLM": onnx_model = QwenModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) elif config.architectures[0] == "ChatGLMModel": - #TODO: @amd-sudo-sh: Encapsulate the config parsing in a better way - config.max_position_embeddings = config.seq_length # Max sequence length a model can handle - config.intermediate_size = config.ffn_hidden_size # Size of feed-forward network's hidden layer - config.num_hidden_layers = config.num_layers + # #TODO: @amd-sudo-sh: Encapsulate the config parsing in a better way + # config.max_position_embeddings = config.seq_length # Max sequence length a model can handle + # config.intermediate_size = config.ffn_hidden_size # Size of feed-forward 
network's hidden layer + # config.num_hidden_layers = config.num_layers config.hidden_act = "swiglu" onnx_model = ChatGLMModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) else: From dfdbf4f60883c8bc98a09543eb9332c7aced42ff Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Thu, 19 Sep 2024 17:44:48 +0000 Subject: [PATCH 05/17] refractor_codebase --- src/python/py/models/builder.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 89c531683..c5f8f297e 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -2881,10 +2881,6 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid elif config.architectures[0] == "Qwen2ForCausalLM": onnx_model = QwenModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) elif config.architectures[0] == "ChatGLMModel": - # #TODO: @amd-sudo-sh: Encapsulate the config parsing in a better way - # config.max_position_embeddings = config.seq_length # Max sequence length a model can handle - # config.intermediate_size = config.ffn_hidden_size # Size of feed-forward network's hidden layer - # config.num_hidden_layers = config.num_layers config.hidden_act = "swiglu" onnx_model = ChatGLMModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) else: From b20e07b70b4f57b3ff796dfe1da0ab546af99c05 Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Thu, 19 Sep 2024 18:21:39 +0000 Subject: [PATCH 06/17] chatglm_support_for_model_builder --- src/python/py/models/builder.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index c5f8f297e..5e31c55f9 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -531,10 +531,8 @@ def make_gather(self, name, inputs, axis): self.make_value_info(output, TensorProto.INT64, shape=[]) def make_split(self, name, inputs, dtype, shape, axis, num_splits): - #TODO: @amd-sudo-sh: Currently it supports num_splits = 2 - # Splits the input tensor into num_splits based on the axis + # Splits the input tensor into num_splits based on the axis and shape outputs = [f"{name}/output_{i}" for i in range(num_splits)] - # split = [num_splits for i in range(num_splits)] self.make_node("Split", inputs=[inputs], outputs=outputs, name=name, axis=axis) for output in outputs: self.make_value_info(output, dtype, shape=shape) @@ -1836,7 +1834,6 @@ def make_model(self, input_path): # SkipLayerNorm after last decoder layer (MatMul --> SkipLayerNorm) print("Reading final norm") self.make_layernorm(self.layer_id, module, skip=True, simple=self.layernorm_attrs["simple"], location="final_norm") - elif (isinstance(module, torch.nn.Linear) and module.out_features == self.vocab_size) or (hasattr(model, "lm_head") and module == model.lm_head): # Checks (Hugging Face logic) or (GGUF logic) if not self.exclude_lm_head: @@ -1849,9 +1846,7 @@ def make_model(self, input_path): def has_final_norm(self, module, model): # Hugging Face names hf_norm = hasattr(model, "model") and hasattr(model.model, "norm") and module == model.model.norm - hf_final_layernorm = hasattr(model, "model") and hasattr(model.model, "final_layernorm") and module == model.model.final_layernorm - # GGUF names gguf_final_norm = hasattr(model, "final_norm") and module == model.final_norm return hf_norm or hf_final_layernorm or gguf_final_norm @@ -2708,11 +2703,8 @@ def make_attention(self, 
layer_id, attention, root_input, **kwargs): hidden_size = self.hidden_size num_attention_heads = self.num_attn_heads kv_channels = self.kv_channels - # head_size = self.head_size - projection_size = kv_channels * num_attention_heads hidden_size_per_attention_head = projection_size // num_attention_heads - multi_query_attention = self.attention_attrs["op_type"] == "GroupQueryAttention" multi_query_group_num = self.num_kv_heads if multi_query_attention else num_attention_heads From 0bdf8434af5157ab1a79a3a8a913b3d802c43132 Mon Sep 17 00:00:00 2001 From: Bowen Bao Date: Sun, 22 Sep 2024 17:07:18 +0000 Subject: [PATCH 07/17] complete chatglm3 --- src/python/py/models/builder.py | 262 +++++++++++++----------- src/python/py/models/quantized_model.py | 238 +++++++++++++++++---- 2 files changed, 344 insertions(+), 156 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 5e31c55f9..2cc70d1b6 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -1443,14 +1443,21 @@ def make_mlp(self, layer_id, mlp, root_input): raise NotImplementedError(f"The MLP layer type is not set.") def make_mlp_unpacked(self, layer_id, mlp, root_input): + packed_proj = getattr(mlp, "gate_up_proj", None) or getattr( + mlp, "dense_h_to_4h", None + ) mlp.gate_proj = torch.nn.Linear(in_features=self.hidden_size, out_features=self.intermediate_size) - mlp.gate_proj.weight = torch.nn.Parameter(mlp.gate_up_proj.weight[ : self.intermediate_size, :]) + mlp.gate_proj.weight = torch.nn.Parameter( + packed_proj.weight[: self.intermediate_size, :] + ) mlp.up_proj = torch.nn.Linear(in_features=self.hidden_size, out_features=self.intermediate_size) - mlp.up_proj.weight = torch.nn.Parameter(mlp.gate_up_proj.weight[self.intermediate_size :, :]) + mlp.up_proj.weight = torch.nn.Parameter( + packed_proj.weight[self.intermediate_size :, :] + ) # Delete original packed weights - del mlp.gate_up_proj + del packed_proj def make_mlp_proj(self, layer_id, mlp, root_input): # Make nodes for the MLP subgraph @@ -1480,8 +1487,11 @@ def make_mlp_proj(self, layer_id, mlp, root_input): self.make_mul(mul_name, mul_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) # Make output MatMul node + down_proj = getattr(mlp, "down_proj", None) or getattr( + mlp, "dense_4h_to_h", None + ) down_basename = f"/model/layers.{layer_id}/mlp/down_proj/MatMul" - down_name = self.make_matmul(mlp.down_proj, down_basename, f"{mul_name}/output_0") + down_name = self.make_matmul(down_proj, down_basename, f"{mul_name}/output_0") # Assign output 0 of previous MatMul as skip input to next SkipLayerNorm self.layernorm_attrs["skip_input"] = f"{down_name}/output_0" @@ -1679,7 +1689,7 @@ def make_swiglu(self, layer_id, root_input, activation, domain): # # root_input (GateProjMatMul) # / \ - # split/output_0 split/output_1 + # split/output_0 split/output_1 # / | | # ActFunc | | # \ | | @@ -1688,7 +1698,7 @@ def make_swiglu(self, layer_id, root_input, activation, domain): # \ | # Mul act_name = f"/model/layers.{layer_id}/mlp/act_fn" - + # Split the input into two parts along the last dimension # When using swiglu the MLP projects to 2 times the intermediate_size split_act_name = f"{act_name}/split" @@ -1713,12 +1723,11 @@ def make_swiglu(self, layer_id, root_input, activation, domain): return mul_act_name_1 - def make_activation(self, layer_id, root_input): - if self.activation in {"swiglu"}: - output_name = self.make_swiglu(layer_id, root_input, activation="Sigmoid", 
domain=None) - elif self.activation in {"silu", "swish"}: + # if self.activation in {"swiglu"}: + # output_name = self.make_swiglu(layer_id, root_input, activation="Sigmoid", domain=None) + if self.activation in {"silu", "swish", "swiglu"}: output_name = self.make_activation_with_mul(layer_id, root_input, activation="Sigmoid", domain=None) elif self.activation in {"gelu_new", "gelu_fast", "gelu_pytorch_tanh"}: output_name = self.make_gelu(layer_id, root_input, activation="FastGelu") @@ -1798,7 +1807,18 @@ def make_model(self, input_path): from onnxruntime_genai.models.quantized_model import QuantModel q_size = self.num_attn_heads * self.head_size kv_size = self.num_kv_heads * self.head_size - model = QuantModel.from_pretrained(self.quant_type, input_path, self.quant_attrs["bits"], self.quant_attrs["group_size"], self.quant_attrs["use_g_idx"], q_size, kv_size, self.intermediate_size, self.num_layers) + model = QuantModel.from_pretrained( + self.quant_type, + input_path, + self.quant_attrs["bits"], + self.quant_attrs["group_size"], + self.quant_attrs["use_g_idx"], + q_size, + kv_size, + self.intermediate_size, + self.num_layers, + self.model_type, + ) else: # Load PyTorch model extra_kwargs = {"num_hidden_layers": self.num_layers} if "num_hidden_layers" in self.extra_options else {} @@ -1829,7 +1849,7 @@ def make_model(self, input_path): print(f"Reading decoder layer {self.layer_id}") self.make_layer(self.layer_id, module) self.layer_id += 1 - + elif self.layer_id == self.num_layers and self.has_final_norm(module, model): # SkipLayerNorm after last decoder layer (MatMul --> SkipLayerNorm) print("Reading final norm") @@ -2685,7 +2705,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.attention_attrs["use_rotemb_in_attn"] = True self.attention_attrs["use_packed_matmul"] = True self.attention_attrs["op_type"] = "GroupQueryAttention" if self.multi_query_attention else self.attention_attrs["op_type"] - + def has_final_norm(self, module, model): # Hugging Face names hf_norm = hasattr(model, "model") and hasattr(model.model, "norm") and module == model.model.norm @@ -2696,111 +2716,119 @@ def has_final_norm(self, module, model): def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): super().make_rotary_embedding(rotemb, name, root_input, num_heads=self.rotemb_attrs["num_heads"], rotary_embedding_dim=self.rotemb_attrs["rotary_embedding_dim"], **kwargs) - - - def make_attention(self, layer_id, attention, root_input, **kwargs): - #Designed from SelfAttention function of medeling_chatglm.py - hidden_size = self.hidden_size - num_attention_heads = self.num_attn_heads - kv_channels = self.kv_channels - projection_size = kv_channels * num_attention_heads - hidden_size_per_attention_head = projection_size // num_attention_heads - multi_query_attention = self.attention_attrs["op_type"] == "GroupQueryAttention" - multi_query_group_num = self.num_kv_heads if multi_query_attention else num_attention_heads - - # Reshape the QKV weight - qkv_weight = attention.query_key_value.weight.T - - if multi_query_attention: - q_weight, k_weight, v_weight = qkv_weight.split( - [ - num_attention_heads * hidden_size_per_attention_head, - multi_query_group_num * hidden_size_per_attention_head, - multi_query_group_num * hidden_size_per_attention_head, - ], - dim=-1 - ) - else: - q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=-1) - - # Reshape the QKV bias if it exists - if attention.query_key_value.bias is not None: - qkv_bias = attention.query_key_value.bias - if 
multi_query_attention: - q_bias, k_bias, v_bias = qkv_bias.split( - [ - num_attention_heads * hidden_size_per_attention_head, - multi_query_group_num * hidden_size_per_attention_head, - multi_query_group_num * hidden_size_per_attention_head, - ] - ) - else: - q_bias, k_bias, v_bias = qkv_bias.chunk(3) - else: - q_bias = k_bias = v_bias = None - - # Create separate Q, K, V projections - attention.q_proj = torch.nn.Linear(hidden_size, num_attention_heads * hidden_size_per_attention_head, bias=q_bias is not None) - attention.q_proj.weight = torch.nn.Parameter(q_weight.T) - if q_bias is not None: - attention.q_proj.bias = torch.nn.Parameter(q_bias) - - kv_size = multi_query_group_num * hidden_size_per_attention_head - - attention.k_proj = torch.nn.Linear(hidden_size, kv_size, bias=k_bias is not None) - attention.k_proj.weight = torch.nn.Parameter(k_weight.T) - if k_bias is not None: - attention.k_proj.bias = torch.nn.Parameter(k_bias) - - attention.v_proj = torch.nn.Linear(hidden_size, kv_size, bias=v_bias is not None) - attention.v_proj.weight = torch.nn.Parameter(v_weight.T) - if v_bias is not None: - attention.v_proj.bias = torch.nn.Parameter(v_bias) - # Remove the original combined QKV projection - del attention.query_key_value - del qkv_weight - del qkv_bias - # Add dummy rotary_emb attribute - attention.rotary_emb = type("RotaryEmbedding", (object,), {'content':{}})() - - super().make_attention(layer_id, attention, root_input, **kwargs) + def make_attention(self, layer_id, attention, root_input, **kwargs): + if self.quant_type is None: + super().make_attention_unpacked(layer_id, attention, root_input, **kwargs) + return super().make_attention(layer_id, attention, root_input, **kwargs) + + # def make_attention(self, layer_id, attention, root_input, **kwargs): + # #Designed from SelfAttention function of medeling_chatglm.py + # hidden_size = self.hidden_size + # num_attention_heads = self.num_attn_heads + # kv_channels = self.kv_channels + # projection_size = kv_channels * num_attention_heads + # hidden_size_per_attention_head = projection_size // num_attention_heads + # multi_query_attention = self.attention_attrs["op_type"] == "GroupQueryAttention" + # multi_query_group_num = self.num_kv_heads if multi_query_attention else num_attention_heads + + # # Reshape the QKV weight + # qkv_weight = attention.query_key_value.weight.T + + # if multi_query_attention: + # q_weight, k_weight, v_weight = qkv_weight.split( + # [ + # num_attention_heads * hidden_size_per_attention_head, + # multi_query_group_num * hidden_size_per_attention_head, + # multi_query_group_num * hidden_size_per_attention_head, + # ], + # dim=-1 + # ) + # else: + # q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=-1) + + # # Reshape the QKV bias if it exists + # if attention.query_key_value.bias is not None: + # qkv_bias = attention.query_key_value.bias + # if multi_query_attention: + # q_bias, k_bias, v_bias = qkv_bias.split( + # [ + # num_attention_heads * hidden_size_per_attention_head, + # multi_query_group_num * hidden_size_per_attention_head, + # multi_query_group_num * hidden_size_per_attention_head, + # ] + # ) + # else: + # q_bias, k_bias, v_bias = qkv_bias.chunk(3) + # else: + # q_bias = k_bias = v_bias = None + + # # Create separate Q, K, V projections + # attention.q_proj = torch.nn.Linear(hidden_size, num_attention_heads * hidden_size_per_attention_head, bias=q_bias is not None) + # attention.q_proj.weight = torch.nn.Parameter(q_weight.T) + # if q_bias is not None: + # attention.q_proj.bias = 
torch.nn.Parameter(q_bias) + + # kv_size = multi_query_group_num * hidden_size_per_attention_head + + # attention.k_proj = torch.nn.Linear(hidden_size, kv_size, bias=k_bias is not None) + # attention.k_proj.weight = torch.nn.Parameter(k_weight.T) + # if k_bias is not None: + # attention.k_proj.bias = torch.nn.Parameter(k_bias) + + # attention.v_proj = torch.nn.Linear(hidden_size, kv_size, bias=v_bias is not None) + # attention.v_proj.weight = torch.nn.Parameter(v_weight.T) + # if v_bias is not None: + # attention.v_proj.bias = torch.nn.Parameter(v_bias) + + # # Remove the original combined QKV projection + # del attention.query_key_value + # del qkv_weight + # del qkv_bias + # # Add dummy rotary_emb attribute + # attention.rotary_emb = type("RotaryEmbedding", (object,), {'content':{}})() + + # super().make_attention(layer_id, attention, root_input, **kwargs) def make_mlp_proj(self, layer_id, mlp, root_input): - # Make nodes for the MLP subgraph - # - # root_input - # | - # dense_h_to_4h #Misnomer, it is increased to 2h instead of 4h, therefore in swiglu the intermediate size is same - # | - # Activation - # | - # dense_4h_to_h - # - # Make MatMul nodes - - up_basename = f"/model/layers.{layer_id}/mlp/dense_h_to_4h/MatMul" - up_name = self.make_matmul(mlp.dense_h_to_4h, up_basename, root_input) - # Make activation node(s) - act_fn_name = self.make_activation(layer_id, root_input=f"{up_name}/output_0") - - down_basename = f"/model/layers.{layer_id}/mlp/dense_4h_to_h/MatMul" - down_name = self.make_matmul(mlp.dense_4h_to_h, down_basename, f"{act_fn_name}/output_0") - # Assign output 0 of previous MatMul as skip input to next SkipLayerNorm - self.layernorm_attrs["skip_input"] = f"{down_name}/output_0" - - - def make_layer(self, layer_id, layer): - # Each GLM encoder is defined as follows all LayerNorms are RMSNorms (Residual and Norm order same as LLama Model): - self.make_layernorm(layer_id, layer.input_layernorm, skip=not self.layernorm_attrs["first_layernorm"], simple=self.layernorm_attrs["simple"], location="input") - self.make_attention(layer_id, layer.self_attention, root_input=self.layernorm_attrs["output_0"]) - self.make_layernorm(layer_id, layer.post_attention_layernorm, skip=True, simple=self.layernorm_attrs["simple"], location="post_attention") - self.make_mlp(layer_id, layer.mlp, root_input=self.layernorm_attrs["output_0"]) + if self.quant_type is None: + super().make_mlp_unpacked(layer_id, mlp, root_input) + super().make_mlp_proj(layer_id, mlp, root_input) - self.layernorm_attrs["first_layernorm"] = False - if layer_id == self.num_layers - 1: - # Norm after last decoder layer of model (last layer --> norm) - self.layernorm_attrs["last_layernorm"] = True + # def make_mlp_proj(self, layer_id, mlp, root_input): + # # Make nodes for the MLP subgraph + # # + # # root_input + # # | + # # dense_h_to_4h #Misnomer, it is increased to 2h instead of 4h, therefore in swiglu the intermediate size is same + # # | + # # Activation + # # | + # # dense_4h_to_h + # # + # # Make MatMul nodes + + # up_basename = f"/model/layers.{layer_id}/mlp/dense_h_to_4h/MatMul" + # up_name = self.make_matmul(mlp.dense_h_to_4h, up_basename, root_input) + # # Make activation node(s) + # act_fn_name = self.make_activation(layer_id, root_input=f"{up_name}/output_0") + + # down_basename = f"/model/layers.{layer_id}/mlp/dense_4h_to_h/MatMul" + # down_name = self.make_matmul(mlp.dense_4h_to_h, down_basename, f"{act_fn_name}/output_0") + # # Assign output 0 of previous MatMul as skip input to next SkipLayerNorm + # 
self.layernorm_attrs["skip_input"] = f"{down_name}/output_0" + + # def make_layer(self, layer_id, layer): + # # Each GLM encoder is defined as follows all LayerNorms are RMSNorms (Residual and Norm order same as LLama Model): + # self.make_layernorm(layer_id, layer.input_layernorm, skip=not self.layernorm_attrs["first_layernorm"], simple=self.layernorm_attrs["simple"], location="input") + # self.make_attention(layer_id, layer.self_attention, root_input=self.layernorm_attrs["output_0"]) + # self.make_layernorm(layer_id, layer.post_attention_layernorm, skip=True, simple=self.layernorm_attrs["simple"], location="post_attention") + # self.make_mlp(layer_id, layer.mlp, root_input=self.layernorm_attrs["output_0"]) + + # self.layernorm_attrs["first_layernorm"] = False + # if layer_id == self.num_layers - 1: + # # Norm after last decoder layer of model (last layer --> norm) + # self.layernorm_attrs["last_layernorm"] = True def check_extra_options(kv_pairs): @@ -2872,11 +2900,13 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid onnx_model = Phi3VModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) elif config.architectures[0] == "Qwen2ForCausalLM": onnx_model = QwenModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) - elif config.architectures[0] == "ChatGLMModel": + elif config.architectures[0] == "ChatGLMForConditionalGeneration": config.hidden_act = "swiglu" onnx_model = ChatGLMModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) else: - raise NotImplementedError(f"The {hf_name} model is not currently supported.") + raise NotImplementedError( + f"The {hf_name} model is not currently supported. Got {config}" + ) # Make ONNX model onnx_model.make_model(input_path) diff --git a/src/python/py/models/quantized_model.py b/src/python/py/models/quantized_model.py index f15f21cb9..f2717a3ee 100644 --- a/src/python/py/models/quantized_model.py +++ b/src/python/py/models/quantized_model.py @@ -32,6 +32,16 @@ def __init__(self, bits, group_size): self.bits = bits self.group_size = group_size + def set_properties(self, quant_type: str): + if quant_type == "awq": + self.out_features = self.scales.shape[1] + self.in_features = self.qweight.shape[0] + elif quant_type == "gptq": + self.out_features = self.qweight.shape[1] + self.in_features = self.q_proj.g_idx.shape[0] + else: + raise NotImplementedError(f"The {quant_type} quantization method is not recognized.") + def __str__(self): qweight = f"qweight = {self.qweight.shape}, {self.qweight}\n" scales = f"scales = {self.scales.shape}, {self.scales}\n" @@ -60,6 +70,21 @@ def __init__(self, bits, group_size): self.o_proj = QuantizedTensorModule(bits, group_size) self.rotary_emb = TensorModule() + def set_properties(self, quant_type: str): + self.q_proj.set_properties(quant_type) + self.k_proj.set_properties(quant_type) + self.v_proj.set_properties(quant_type) + self.o_proj.set_properties(quant_type) + +class QuantizedChatglm3Attention: + def __init__(self, bits, group_size): + self.query_key_value = QuantizedTensorModule(bits, group_size) + self.dense = QuantizedTensorModule(bits, group_size) + self.rotary_emb = TensorModule() + + def set_properties(self, quant_type: str): + self.query_key_value.set_properties(quant_type) + self.dense.set_properties(quant_type) class QuantizedMLP: def __init__(self, bits, group_size): @@ -69,6 +94,21 @@ def __init__(self, bits, group_size): self.fc1 = QuantizedTensorModule(bits, group_size) self.fc2 = 
QuantizedTensorModule(bits, group_size) + def set_properties(self, quant_type: str): + self.gate_proj.set_properties(quant_type) + self.up_proj.set_properties(quant_type) + self.down_proj.set_properties(quant_type) + self.fc1.set_properties(quant_type) + self.fc2.set_properties(quant_type) + +class QuantizedChatglm3MLP: + def __init__(self, bits, group_size): + self.dense_4h_to_h = QuantizedTensorModule(bits, group_size) + self.dense_h_to_4h = QuantizedTensorModule(bits, group_size) + + def set_properties(self, quant_type: str): + self.dense_4h_to_h.set_properties(quant_type) + self.dense_h_to_4h.set_properties(quant_type) class QuantizedDecoderLayer: def __init__(self, layer_id, bits, group_size): @@ -81,9 +121,27 @@ def __init__(self, layer_id, bits, group_size): def is_empty(self): return self.input_layernorm.weight is None + def set_properties(self, quant_type: str): + self.self_attn.set_properties(quant_type) + self.mlp.set_properties(quant_type) + +class QuantizedChatglm3EncoderLayer: + def __init__(self, layer_id, bits, group_size): + self.layer_id = layer_id + self.input_layernorm = TensorModule() + self.self_attention = QuantizedChatglm3Attention(bits, group_size) + self.post_attention_layernorm = TensorModule() + self.mlp = QuantizedChatglm3MLP(bits, group_size) + + def is_empty(self): + return self.input_layernorm.weight is None + + def set_properties(self, quant_type: str): + self.self_attention.set_properties(quant_type) + self.mlp.set_properties(quant_type) class QuantizedModel: - def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers): + def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type: str): self.quant_type = quant_type self.embedding = TensorModule() self.final_norm = TensorModule() @@ -91,34 +149,48 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in self.layers = {} self.num_layers = num_layers + q_layer_cls = QuantizedDecoderLayer + # if model_type == "ChatGLMForConditionalGeneration": + # q_layer_cls = QuantizedChatglm3EncoderLayer + # print(q_layer_cls) + layer_id = 0 for weight_file in os.listdir(input_path): if weight_file.endswith(".safetensors"): - module = self.layers.setdefault(layer_id, QuantizedDecoderLayer(layer_id, bits, group_size)) + module = self.layers.setdefault(layer_id, q_layer_cls(layer_id, bits, group_size)) weights = load_file(os.path.join(input_path, weight_file)) + for name, _ in weights.items(): + print(name) # Map weights to modules for name, tensor in weights.items(): + print(name) if tensor.dtype == torch.bfloat16: # Cast bfloat16 to float32 since NumPy does not support bfloat16 tensor = tensor.to(torch.float32) - - if name == "model.embed_tokens.weight": + if name == "model.embed_tokens.weight" or name == "transformer.embedding.word_embeddings.weight": self.embedding.weight = tensor - elif name == "model.norm.weight": + elif name == "model.norm.weight" or name == "transformer.encoder.final_layernorm.weight": self.final_norm.weight = tensor - elif name == "model.norm.bias": + elif name == "model.norm.bias" or name == "transformer.encoder.final_layernorm.bias": self.final_norm.bias = tensor - elif name == "lm_head.weight": + elif name == "lm_head.weight" or name == "transformer.output_layer.weight": self.lm_head.weight = tensor - elif name == "lm_head.bias": + elif name == "lm_head.bias" or name == "transformer.output_layer.bias": self.lm_head.bias = tensor + elif name == 
"transformer.rotary_pos_emb.inv_freq": + # transformer.rotary_pos_emb.inv_freq in ChatGLM3. + # Skip rotary embedding weights since they can be re-calculated when looping through the model + continue else: + if name.startswith("transformer.encoder"): + # Chatglm3, e.g., transformer.encoder.layers.0.input_layernorm.weight + name = name.replace("transformer.encoder", "model") curr_layer_id = int(name.split(".")[2]) if curr_layer_id != layer_id: # Switch layer module used layer_id = curr_layer_id - module = self.layers.setdefault(layer_id, QuantizedDecoderLayer(layer_id, bits, group_size)) + module = self.layers.setdefault(layer_id, q_layer_cls(layer_id, bits, group_size)) # Map weights and biases of norm, attention, and feed-forward network # Graph order is input_layernorm --> q_proj/k_proj/v_proj --> o_proj --> post_attention_layernorm --> gate_proj/up_proj --> down_proj @@ -177,27 +249,92 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in elif bool(re.match(r"^model.layers\.\d+\.self_attn.v_proj\.bias$", name)): # model.layers.layer_id.self_attn.v_proj.bias module.self_attn.v_proj.bias = tensor - elif bool(re.match(r"^model.layers\.\d+\.self_attn.o_proj\.qweight$", name)): + # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.qweight$", name)): + # # model.layers.layer_id.self_attention.query_key_value.qweight + # module.self_attention.query_key_value.qweight = tensor + # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.scales$", name)): + # # model.layers.layer_id.self_attention.query_key_value.scales + # module.self_attention.query_key_value.scales = tensor + # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.qzeros$", name)): + # # model.layers.layer_id.self_attention.query_key_value.qzeros + # module.self_attention.query_key_value.qzeros = tensor + # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.g_idx$", name)): + # # model.layers.layer_id.self_attention.query_key_value.g_idx + # module.self_attention.query_key_value.g_idx = tensor + # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.bias$", name)): + # # model.layers.layer_id.self_attention.query_key_value.bias + # module.self_attention.query_key_value.bias = tensor + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.qweight$", name)): # model.layers.layer_id.self_attn.o_proj.qweight + # model.layers.layer_id.self_attention.dense.qweight module.self_attn.o_proj.qweight = tensor - elif bool(re.match(r"^model.layers\.\d+\.self_attn.o_proj\.scales$", name)): + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.scales$", name)): # model.layers.layer_id.self_attn.o_proj.scales + # model.layers.layer_id.self_attention.dense.scales module.self_attn.o_proj.scales = tensor - elif bool(re.match(r"^model.layers\.\d+\.self_attn.o_proj\.qzeros$", name)): + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.qzeros$", name)): # model.layers.layer_id.self_attn.o_proj.qzeros + # model.layers.layer_id.self_attention.dense.qzeros module.self_attn.o_proj.qzeros = tensor - elif bool(re.match(r"^model.layers\.\d+\.self_attn.o_proj\.g_idx$", name)): + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.g_idx$", name)): # model.layers.layer_id.self_attn.o_proj.g_idx + # model.layers.layer_id.self_attention.dense.g_idx module.self_attn.o_proj.g_idx = tensor - elif 
bool(re.match(r"^model.layers\.\d+\.self_attn.o_proj\.bias$", name)): + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.bias$", name)): # model.layers.layer_id.self_attn.o_proj.bias + # model.layers.layer_id.self_attention.dense.bias module.self_attn.o_proj.bias = tensor + # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.qweight$", name)): + # # model.layers.layer_id.self_attention.dense.qweight + # module.self_attention.dense.qweight = tensor + # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.scales$", name)): + # # model.layers.layer_id.self_attention.dense.scales + # module.self_attention.dense.scales = tensor + # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.qzeros$", name)): + # # model.layers.layer_id.self_attention.dense.qzeros + # module.self_attention.dense.qzeros = tensor + # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.g_idx$", name)): + # # model.layers.layer_id.self_attention.dense.g_idx + # module.self_attention.dense.g_idx = tensor + # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.bias$", name)): + # # model.layers.layer_id.self_attention.dense.bias + # module.self_attention.dense.bias = tensor elif bool(re.match(r"^model.layers\.\d+\.post_attention_layernorm\.weight$", name)): # model.layers.layer_id.post_attention_layernorm.weight module.post_attention_layernorm.weight = tensor elif bool(re.match(r"^model.layers\.\d+\.post_attention_layernorm\.bias$", name)): # model.layers.layer_id.post_attention_layernorm.bias module.post_attention_layernorm.bias = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.qweight$", name)): + # # model.layers.layer_id.mlp.dense_4h_to_h.qweight + # module.mlp.dense_4h_to_h.qweight = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.scales$", name)): + # # model.layers.layer_id.mlp.dense_4h_to_h.scales + # module.mlp.dense_4h_to_h.scales = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.qzeros$", name)): + # # model.layers.layer_id.mlp.dense_4h_to_h.qzeros + # module.mlp.dense_4h_to_h.qzeros = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.g_idx$", name)): + # # model.layers.layer_id.mlp.dense_4h_to_h.g_idx + # module.mlp.dense_4h_to_h.g_idx = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.bias$", name)): + # # model.layers.layer_id.mlp.dense_4h_to_h.bias + # module.mlp.dense_4h_to_h.bias = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.qweight$", name)): + # # model.layers.layer_id.mlp.dense_h_to_4h.qweight + # module.mlp.dense_h_to_4h.qweight = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.scales$", name)): + # # model.layers.layer_id.mlp.dense_h_to_4h.scales + # module.mlp.dense_h_to_4h.scales = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.qzeros$", name)): + # # model.layers.layer_id.mlp.dense_h_to_4h.qzeros + # module.mlp.dense_h_to_4h.qzeros = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.g_idx$", name)): + # # model.layers.layer_id.mlp.dense_h_to_4h.g_idx + # module.mlp.dense_h_to_4h.g_idx = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.bias$", name)): + # # model.layers.layer_id.mlp.dense_h_to_4h.bias + # module.mlp.dense_h_to_4h.bias = tensor elif bool(re.match(r"^model.layers\.\d+\.mlp.gate_proj\.qweight$", name)): # model.layers.layer_id.mlp.gate_proj.qweight 
module.mlp.gate_proj.qweight = tensor @@ -228,62 +365,81 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in elif bool(re.match(r"^model.layers\.\d+\.mlp.up_proj\.bias$", name)): # model.layers.layer_id.mlp.up_proj.bias module.mlp.up_proj.bias = tensor - elif bool(re.match(r"^model.layers\.\d+\.mlp.down_proj\.qweight$", name)): + elif bool(re.match(r"^model.layers\.\d+\.mlp.(down_proj|dense_4h_to_h)\.qweight$", name)): # model.layers.layer_id.mlp.down_proj.qweight + # model.layers.layer_id.mlp.dense_4h_to_h.qweight module.mlp.down_proj.qweight = tensor - elif bool(re.match(r"^model.layers\.\d+\.mlp.down_proj\.scales$", name)): + elif bool(re.match(r"^model.layers\.\d+\.mlp.(down_proj|dense_4h_to_h)\.scales$", name)): # model.layers.layer_id.mlp.down_proj.scales + # model.layers.layer_id.mlp.dense_4h_to_h.scales module.mlp.down_proj.scales = tensor - elif bool(re.match(r"^model.layers\.\d+\.mlp.down_proj\.qzeros$", name)): + elif bool(re.match(r"^model.layers\.\d+\.mlp.(down_proj|dense_4h_to_h)\.qzeros$", name)): # model.layers.layer_id.mlp.down_proj.qzeros + # model.layers.layer_id.mlp.dense_4h_to_h.qzeros module.mlp.down_proj.qzeros = tensor - elif bool(re.match(r"^model.layers\.\d+\.mlp.down_proj\.g_idx$", name)): + elif bool(re.match(r"^model.layers\.\d+\.mlp.(down_proj|dense_4h_to_h)\.g_idx$", name)): # model.layers.layer_id.mlp.down_proj.g_idx + # model.layers.layer_id.mlp.dense_4h_to_h.g_idx module.mlp.down_proj.g_idx = tensor - elif bool(re.match(r"^model.layers\.\d+\.mlp.down_proj\.bias$", name)): + elif bool(re.match(r"^model.layers\.\d+\.mlp.(down_proj|dense_4h_to_h)\.bias$", name)): # model.layers.layer_id.mlp.down_proj.bias + # model.layers.layer_id.mlp.dense_4h_to_h.bias module.mlp.down_proj.bias = tensor # Match against fused layers - elif bool(re.match(r"^model.layers\.\d+\.self_attn.qkv_proj\.qweight$", name)): + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.qweight$", name)): # model.layers.layer_id.self_attn.qkv_proj.qweight + # model.layers.layer_id.self_attention.query_key_value.qweight q_dim = q_size // (32 // bits) if quant_type == "awq" else q_size kv_dim = kv_size // (32 // bits) if quant_type == "awq" else kv_size module.self_attn.q_proj.qweight = tensor[:, : q_dim] module.self_attn.k_proj.qweight = tensor[:, q_dim : q_dim + kv_dim] module.self_attn.v_proj.qweight = tensor[:, q_dim + kv_dim :] - elif bool(re.match(r"^model.layers\.\d+\.self_attn.qkv_proj\.scales$", name)): + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.scales$", name)): # model.layers.layer_id.self_attn.qkv_proj.scales + # model.layers.layer_id.self_attention.query_key_value.scales module.self_attn.q_proj.scales = tensor[:, : q_size] module.self_attn.k_proj.scales = tensor[:, q_size : q_size + kv_size] module.self_attn.v_proj.scales = tensor[:, q_size + kv_size :] - elif bool(re.match(r"^model.layers\.\d+\.self_attn.qkv_proj\.qzeros$", name)): + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.qzeros$", name)): # model.layers.layer_id.self_attn.qkv_proj.qzeros + # model.layers.layer_id.self_attention.query_key_value.qzeros q_dim = q_size // (32 // bits) if quant_type in {"awq", "gptq"} else q_size kv_dim = kv_size // (32 // bits) if quant_type in {"awq", "gptq"} else kv_size module.self_attn.q_proj.qzeros = tensor[:, : q_dim] module.self_attn.k_proj.qzeros = tensor[:, q_dim : q_dim + kv_dim] module.self_attn.v_proj.qzeros = 
tensor[:, q_dim + kv_dim :] - elif bool(re.match(r"^model.layers\.\d+\.self_attn.qkv_proj\.g_idx$", name)): + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.g_idx$", name)): # model.layers.layer_id.self_attn.qkv_proj.g_ix + # model.layers.layer_id.self_attention.query_key_value.g_idx module.self_attn.q_proj.g_idx = tensor module.self_attn.k_proj.g_idx = tensor module.self_attn.v_proj.g_idx = tensor - elif bool(re.match(r"^model.layers\.\d+\.mlp.gate_up_proj\.qweight$", name)): + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.bias$", name)): + # model.layers.layer_id.self_attn.qkv_proj.bias + # model.layers.layer_id.self_attention.query_key_value.bias + module.self_attn.q_proj.bias = tensor[: q_size] + module.self_attn.k_proj.bias = tensor[q_size : q_size + kv_size] + module.self_attn.v_proj.bias = tensor[q_size + kv_size : ] + elif bool(re.match(r"^model.layers\.\d+\.mlp.(gate_up_proj|dense_h_to_4h)\.qweight$", name)): # model.layers.layer_id.mlp.gate_up_proj.qweight + # model.layers.layer_id.mlp.dense_h_to_4h.qweight intermediate_dim = intermediate_size // (32 // bits) if quant_type == "awq" else intermediate_size module.mlp.gate_proj.qweight = tensor[:, : intermediate_dim] module.mlp.up_proj.qweight = tensor[:, intermediate_dim :] - elif bool(re.match(r"^model.layers\.\d+\.mlp.gate_up_proj\.scales$", name)): + elif bool(re.match(r"^model.layers\.\d+\.mlp.(gate_up_proj|dense_h_to_4h)\.scales$", name)): # model.layers.layer_id.mlp.gate_up_proj.scales + # model.layers.layer_id.mlp.dense_h_to_4h.scales module.mlp.gate_proj.scales = tensor[:, : intermediate_size] module.mlp.up_proj.scales = tensor[:, intermediate_size :] - elif bool(re.match(r"^model.layers\.\d+\.mlp.gate_up_proj\.qzeros$", name)): + elif bool(re.match(r"^model.layers\.\d+\.mlp.(gate_up_proj|dense_h_to_4h)\.qzeros$", name)): # model.layers.layer_id.mlp.gate_up_proj.qzeros + # model.layers.layer_id.mlp.dense_h_to_4h.qzeros intermediate_dim = intermediate_size // (32 // bits) if quant_type in {"awq", "gptq"} else intermediate_size module.mlp.gate_proj.qzeros = tensor[:, : intermediate_dim] module.mlp.up_proj.qzeros = tensor[:, intermediate_dim :] - elif bool(re.match(r"^model.layers\.\d+\.mlp.gate_up_proj\.g_idx$", name)): + elif bool(re.match(r"^model.layers\.\d+\.mlp.(gate_up_proj|dense_h_to_4h)\.g_idx$", name)): # model.layers.layer_id.mlp.gate_up_proj.g_idx + # model.layers.layer_id.mlp.dense_h_to_4h.g_idx module.mlp.gate_proj.g_idx = tensor module.mlp.up_proj.g_idx = tensor else: @@ -295,7 +451,7 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in self.lm_head.weight = self.embedding.weight if self.lm_head.bias is not None: self.lm_head.bias = self.embedding.bias - + # Sort list of layers by layer id self.layers = list(self.layers.values()) self.layers.sort(key=lambda m: m.layer_id) @@ -459,7 +615,7 @@ def dequant_weight(self, module): scale_mat = scales[g_idx] scale_zeros_mat = scale_zeros[g_idx] qdq_weight_T = intweight * scale_mat - scale_zeros_mat.half() - + # Store unpacked result in `qweight` module.qweight = qdq_weight_T.T @@ -484,8 +640,8 @@ def pack_ort_format(self, module, intweight): Pack `scales`, `qzeros`, and `qweight` to ORT format """ if module.bits != 4: - raise NotImplementedError(f"{modue.bits}-bit quantization in ORT is not currently supported by this tool.") - + raise NotImplementedError(f"{module.bits}-bit quantization in ORT is not currently supported by this tool.") + 
intzeros_pt = module.qzeros.T if module.qzeros.dtype == module.scales.dtype else module.qzeros.T.byte() intweight_pt = intweight.byte() block_size = module.group_size @@ -518,8 +674,8 @@ def pack_ort_format(self, module, intweight): class AWQModel(QuantizedModel): - def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers): - super().__init__(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers) + def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type: str): + super().__init__(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type) # Unpack and repack all `QuantizedTensorModule` classes in model for i, layer in enumerate(self.layers): @@ -528,7 +684,9 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in print(f"Unpacking and repacking layer {i}") # Unpack and repack all `QuantizedTensorModule` classes in attention - for name, q_tensors in layer.self_attn.__dict__.items(): + self_attn = getattr(layer, "self_attn", None) or getattr(layer, "self_attention", None) + for name, q_tensors in self_attn.__dict__.items(): + print(name) if isinstance(q_tensors, QuantizedTensorModule) and q_tensors.qweight is not None: self.unpack(q_tensors) self.repack(q_tensors) @@ -585,8 +743,8 @@ def reverse_reorder_tensor(self, tensor, bits): class GPTQModel(QuantizedModel): - def __init__(self, quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers): - super().__init__(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers) + def __init__(self, quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers, model_type: str): + super().__init__(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type) # Unpack and repack all `QuantizedTensorModule` classes in model for i, layer in enumerate(self.layers): @@ -643,17 +801,17 @@ def __init__(self, module): class QuantModel: @staticmethod - def from_pretrained(quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers): + def from_pretrained(quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers, model_type:str): """ Unpack quantized weights in PyTorch models, store them in a standard format, and repack them into ONNX Runtime's format. Also performs any pre-processing and post-processing when unpacking the quantized weights. 
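        Example (illustrative only; the input path is hypothetical and the size
        values shown are those of a ChatGLM3-6B-style 4-bit checkpoint):

            model = QuantModel.from_pretrained(
                "gptq", input_path="/path/to/chatglm3-6b-gptq", bits=4,
                group_size=128, use_g_idx=False, q_size=4096, kv_size=256,
                intermediate_size=13696, num_layers=28,
                model_type="ChatGLMForConditionalGeneration",
            )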
""" if quant_type == "awq": - model = AWQModel(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers) + model = AWQModel(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type) elif quant_type == "gptq": - model = GPTQModel(quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers) + model = GPTQModel(quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers, model_type) else: raise NotImplementedError(f"The {quant_type} quantized model is not currently supported.") - return model \ No newline at end of file + return model From eb10e51904aebac2eb671d54e5dbf01979b7ae04 Mon Sep 17 00:00:00 2001 From: Bowen Bao Date: Mon, 23 Sep 2024 21:48:49 +0000 Subject: [PATCH 08/17] Cleanup --- src/python/py/models/builder.py | 145 ---------------------- src/python/py/models/quantized_model.py | 152 +++--------------------- 2 files changed, 15 insertions(+), 282 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 2cc70d1b6..ec019b324 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -1684,49 +1684,7 @@ def make_gelu(self, layer_id, root_input, activation): return gelu_name - def make_swiglu(self, layer_id, root_input, activation, domain): - # Make nodes for this activation subgraph - # - # root_input (GateProjMatMul) - # / \ - # split/output_0 split/output_1 - # / | | - # ActFunc | | - # \ | | - # Mul | - # \ | - # \ | - # Mul - act_name = f"/model/layers.{layer_id}/mlp/act_fn" - - # Split the input into two parts along the last dimension - # When using swiglu the MLP projects to 2 times the intermediate_size - split_act_name = f"{act_name}/split" - num_splits = 2 - self.make_split(split_act_name, root_input, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size], axis = -1, num_splits=num_splits) - split_act_out_name_0 = f"{split_act_name}/output_0" - split_act_out_name_1 = f"{split_act_name}/output_1" - - act_name = f"{split_act_name}/{activation}" - act_func_output = f"{act_name}/output_0" - self.make_node(activation, inputs=[split_act_out_name_0], outputs=[act_func_output], name=act_name, domain=domain) - self.make_value_info(act_func_output, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) - - mul_act_name_0 = f"{act_name}/Mul_0" - mul_act_inputs_0 = [split_act_out_name_0, act_func_output] - self.make_mul(mul_act_name_0, mul_act_inputs_0, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) - - mul_act_name_1 = f"{act_name}/Mul_1" - mul_0_output = f"{mul_act_name_0}/output_0" - mul_act_inputs_1 = [split_act_out_name_1, mul_0_output] - self.make_mul(mul_act_name_1, mul_act_inputs_1, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) - - return mul_act_name_1 - def make_activation(self, layer_id, root_input): - - # if self.activation in {"swiglu"}: - # output_name = self.make_swiglu(layer_id, root_input, activation="Sigmoid", domain=None) if self.activation in {"silu", "swish", "swiglu"}: output_name = self.make_activation_with_mul(layer_id, root_input, activation="Sigmoid", domain=None) elif self.activation in {"gelu_new", "gelu_fast", "gelu_pytorch_tanh"}: @@ -1817,7 +1775,6 @@ def make_model(self, input_path): kv_size, self.intermediate_size, self.num_layers, - self.model_type, ) else: # Load PyTorch model @@ -2722,114 
+2679,12 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): super().make_attention_unpacked(layer_id, attention, root_input, **kwargs) return super().make_attention(layer_id, attention, root_input, **kwargs) - # def make_attention(self, layer_id, attention, root_input, **kwargs): - # #Designed from SelfAttention function of medeling_chatglm.py - # hidden_size = self.hidden_size - # num_attention_heads = self.num_attn_heads - # kv_channels = self.kv_channels - # projection_size = kv_channels * num_attention_heads - # hidden_size_per_attention_head = projection_size // num_attention_heads - # multi_query_attention = self.attention_attrs["op_type"] == "GroupQueryAttention" - # multi_query_group_num = self.num_kv_heads if multi_query_attention else num_attention_heads - - # # Reshape the QKV weight - # qkv_weight = attention.query_key_value.weight.T - - # if multi_query_attention: - # q_weight, k_weight, v_weight = qkv_weight.split( - # [ - # num_attention_heads * hidden_size_per_attention_head, - # multi_query_group_num * hidden_size_per_attention_head, - # multi_query_group_num * hidden_size_per_attention_head, - # ], - # dim=-1 - # ) - # else: - # q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=-1) - - # # Reshape the QKV bias if it exists - # if attention.query_key_value.bias is not None: - # qkv_bias = attention.query_key_value.bias - # if multi_query_attention: - # q_bias, k_bias, v_bias = qkv_bias.split( - # [ - # num_attention_heads * hidden_size_per_attention_head, - # multi_query_group_num * hidden_size_per_attention_head, - # multi_query_group_num * hidden_size_per_attention_head, - # ] - # ) - # else: - # q_bias, k_bias, v_bias = qkv_bias.chunk(3) - # else: - # q_bias = k_bias = v_bias = None - - # # Create separate Q, K, V projections - # attention.q_proj = torch.nn.Linear(hidden_size, num_attention_heads * hidden_size_per_attention_head, bias=q_bias is not None) - # attention.q_proj.weight = torch.nn.Parameter(q_weight.T) - # if q_bias is not None: - # attention.q_proj.bias = torch.nn.Parameter(q_bias) - - # kv_size = multi_query_group_num * hidden_size_per_attention_head - - # attention.k_proj = torch.nn.Linear(hidden_size, kv_size, bias=k_bias is not None) - # attention.k_proj.weight = torch.nn.Parameter(k_weight.T) - # if k_bias is not None: - # attention.k_proj.bias = torch.nn.Parameter(k_bias) - - # attention.v_proj = torch.nn.Linear(hidden_size, kv_size, bias=v_bias is not None) - # attention.v_proj.weight = torch.nn.Parameter(v_weight.T) - # if v_bias is not None: - # attention.v_proj.bias = torch.nn.Parameter(v_bias) - - # # Remove the original combined QKV projection - # del attention.query_key_value - # del qkv_weight - # del qkv_bias - # # Add dummy rotary_emb attribute - # attention.rotary_emb = type("RotaryEmbedding", (object,), {'content':{}})() - - # super().make_attention(layer_id, attention, root_input, **kwargs) def make_mlp_proj(self, layer_id, mlp, root_input): if self.quant_type is None: super().make_mlp_unpacked(layer_id, mlp, root_input) super().make_mlp_proj(layer_id, mlp, root_input) - # def make_mlp_proj(self, layer_id, mlp, root_input): - # # Make nodes for the MLP subgraph - # # - # # root_input - # # | - # # dense_h_to_4h #Misnomer, it is increased to 2h instead of 4h, therefore in swiglu the intermediate size is same - # # | - # # Activation - # # | - # # dense_4h_to_h - # # - # # Make MatMul nodes - - # up_basename = f"/model/layers.{layer_id}/mlp/dense_h_to_4h/MatMul" - # up_name = self.make_matmul(mlp.dense_h_to_4h, 
up_basename, root_input) - # # Make activation node(s) - # act_fn_name = self.make_activation(layer_id, root_input=f"{up_name}/output_0") - - # down_basename = f"/model/layers.{layer_id}/mlp/dense_4h_to_h/MatMul" - # down_name = self.make_matmul(mlp.dense_4h_to_h, down_basename, f"{act_fn_name}/output_0") - # # Assign output 0 of previous MatMul as skip input to next SkipLayerNorm - # self.layernorm_attrs["skip_input"] = f"{down_name}/output_0" - - # def make_layer(self, layer_id, layer): - # # Each GLM encoder is defined as follows all LayerNorms are RMSNorms (Residual and Norm order same as LLama Model): - # self.make_layernorm(layer_id, layer.input_layernorm, skip=not self.layernorm_attrs["first_layernorm"], simple=self.layernorm_attrs["simple"], location="input") - # self.make_attention(layer_id, layer.self_attention, root_input=self.layernorm_attrs["output_0"]) - # self.make_layernorm(layer_id, layer.post_attention_layernorm, skip=True, simple=self.layernorm_attrs["simple"], location="post_attention") - # self.make_mlp(layer_id, layer.mlp, root_input=self.layernorm_attrs["output_0"]) - - # self.layernorm_attrs["first_layernorm"] = False - # if layer_id == self.num_layers - 1: - # # Norm after last decoder layer of model (last layer --> norm) - # self.layernorm_attrs["last_layernorm"] = True - def check_extra_options(kv_pairs): if "use_8bits_moe" in kv_pairs: diff --git a/src/python/py/models/quantized_model.py b/src/python/py/models/quantized_model.py index f2717a3ee..108212952 100644 --- a/src/python/py/models/quantized_model.py +++ b/src/python/py/models/quantized_model.py @@ -32,16 +32,6 @@ def __init__(self, bits, group_size): self.bits = bits self.group_size = group_size - def set_properties(self, quant_type: str): - if quant_type == "awq": - self.out_features = self.scales.shape[1] - self.in_features = self.qweight.shape[0] - elif quant_type == "gptq": - self.out_features = self.qweight.shape[1] - self.in_features = self.q_proj.g_idx.shape[0] - else: - raise NotImplementedError(f"The {quant_type} quantization method is not recognized.") - def __str__(self): qweight = f"qweight = {self.qweight.shape}, {self.qweight}\n" scales = f"scales = {self.scales.shape}, {self.scales}\n" @@ -70,21 +60,6 @@ def __init__(self, bits, group_size): self.o_proj = QuantizedTensorModule(bits, group_size) self.rotary_emb = TensorModule() - def set_properties(self, quant_type: str): - self.q_proj.set_properties(quant_type) - self.k_proj.set_properties(quant_type) - self.v_proj.set_properties(quant_type) - self.o_proj.set_properties(quant_type) - -class QuantizedChatglm3Attention: - def __init__(self, bits, group_size): - self.query_key_value = QuantizedTensorModule(bits, group_size) - self.dense = QuantizedTensorModule(bits, group_size) - self.rotary_emb = TensorModule() - - def set_properties(self, quant_type: str): - self.query_key_value.set_properties(quant_type) - self.dense.set_properties(quant_type) class QuantizedMLP: def __init__(self, bits, group_size): @@ -94,21 +69,6 @@ def __init__(self, bits, group_size): self.fc1 = QuantizedTensorModule(bits, group_size) self.fc2 = QuantizedTensorModule(bits, group_size) - def set_properties(self, quant_type: str): - self.gate_proj.set_properties(quant_type) - self.up_proj.set_properties(quant_type) - self.down_proj.set_properties(quant_type) - self.fc1.set_properties(quant_type) - self.fc2.set_properties(quant_type) - -class QuantizedChatglm3MLP: - def __init__(self, bits, group_size): - self.dense_4h_to_h = QuantizedTensorModule(bits, group_size) - 
self.dense_h_to_4h = QuantizedTensorModule(bits, group_size) - - def set_properties(self, quant_type: str): - self.dense_4h_to_h.set_properties(quant_type) - self.dense_h_to_4h.set_properties(quant_type) class QuantizedDecoderLayer: def __init__(self, layer_id, bits, group_size): @@ -121,27 +81,9 @@ def __init__(self, layer_id, bits, group_size): def is_empty(self): return self.input_layernorm.weight is None - def set_properties(self, quant_type: str): - self.self_attn.set_properties(quant_type) - self.mlp.set_properties(quant_type) - -class QuantizedChatglm3EncoderLayer: - def __init__(self, layer_id, bits, group_size): - self.layer_id = layer_id - self.input_layernorm = TensorModule() - self.self_attention = QuantizedChatglm3Attention(bits, group_size) - self.post_attention_layernorm = TensorModule() - self.mlp = QuantizedChatglm3MLP(bits, group_size) - - def is_empty(self): - return self.input_layernorm.weight is None - - def set_properties(self, quant_type: str): - self.self_attention.set_properties(quant_type) - self.mlp.set_properties(quant_type) class QuantizedModel: - def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type: str): + def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers): self.quant_type = quant_type self.embedding = TensorModule() self.final_norm = TensorModule() @@ -149,22 +91,16 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in self.layers = {} self.num_layers = num_layers - q_layer_cls = QuantizedDecoderLayer - # if model_type == "ChatGLMForConditionalGeneration": - # q_layer_cls = QuantizedChatglm3EncoderLayer - # print(q_layer_cls) - layer_id = 0 for weight_file in os.listdir(input_path): if weight_file.endswith(".safetensors"): - module = self.layers.setdefault(layer_id, q_layer_cls(layer_id, bits, group_size)) + module = self.layers.setdefault( + layer_id, QuantizedDecoderLayer(layer_id, bits, group_size) + ) weights = load_file(os.path.join(input_path, weight_file)) - for name, _ in weights.items(): - print(name) # Map weights to modules for name, tensor in weights.items(): - print(name) if tensor.dtype == torch.bfloat16: # Cast bfloat16 to float32 since NumPy does not support bfloat16 tensor = tensor.to(torch.float32) @@ -190,7 +126,10 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in if curr_layer_id != layer_id: # Switch layer module used layer_id = curr_layer_id - module = self.layers.setdefault(layer_id, q_layer_cls(layer_id, bits, group_size)) + module = self.layers.setdefault( + layer_id, + QuantizedDecoderLayer(layer_id, bits, group_size), + ) # Map weights and biases of norm, attention, and feed-forward network # Graph order is input_layernorm --> q_proj/k_proj/v_proj --> o_proj --> post_attention_layernorm --> gate_proj/up_proj --> down_proj @@ -249,21 +188,6 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in elif bool(re.match(r"^model.layers\.\d+\.self_attn.v_proj\.bias$", name)): # model.layers.layer_id.self_attn.v_proj.bias module.self_attn.v_proj.bias = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.qweight$", name)): - # # model.layers.layer_id.self_attention.query_key_value.qweight - # module.self_attention.query_key_value.qweight = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.scales$", name)): - # # 
model.layers.layer_id.self_attention.query_key_value.scales - # module.self_attention.query_key_value.scales = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.qzeros$", name)): - # # model.layers.layer_id.self_attention.query_key_value.qzeros - # module.self_attention.query_key_value.qzeros = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.g_idx$", name)): - # # model.layers.layer_id.self_attention.query_key_value.g_idx - # module.self_attention.query_key_value.g_idx = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.bias$", name)): - # # model.layers.layer_id.self_attention.query_key_value.bias - # module.self_attention.query_key_value.bias = tensor elif bool(re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.qweight$", name)): # model.layers.layer_id.self_attn.o_proj.qweight # model.layers.layer_id.self_attention.dense.qweight @@ -284,57 +208,12 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in # model.layers.layer_id.self_attn.o_proj.bias # model.layers.layer_id.self_attention.dense.bias module.self_attn.o_proj.bias = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.qweight$", name)): - # # model.layers.layer_id.self_attention.dense.qweight - # module.self_attention.dense.qweight = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.scales$", name)): - # # model.layers.layer_id.self_attention.dense.scales - # module.self_attention.dense.scales = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.qzeros$", name)): - # # model.layers.layer_id.self_attention.dense.qzeros - # module.self_attention.dense.qzeros = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.g_idx$", name)): - # # model.layers.layer_id.self_attention.dense.g_idx - # module.self_attention.dense.g_idx = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.bias$", name)): - # # model.layers.layer_id.self_attention.dense.bias - # module.self_attention.dense.bias = tensor elif bool(re.match(r"^model.layers\.\d+\.post_attention_layernorm\.weight$", name)): # model.layers.layer_id.post_attention_layernorm.weight module.post_attention_layernorm.weight = tensor elif bool(re.match(r"^model.layers\.\d+\.post_attention_layernorm\.bias$", name)): # model.layers.layer_id.post_attention_layernorm.bias module.post_attention_layernorm.bias = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.qweight$", name)): - # # model.layers.layer_id.mlp.dense_4h_to_h.qweight - # module.mlp.dense_4h_to_h.qweight = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.scales$", name)): - # # model.layers.layer_id.mlp.dense_4h_to_h.scales - # module.mlp.dense_4h_to_h.scales = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.qzeros$", name)): - # # model.layers.layer_id.mlp.dense_4h_to_h.qzeros - # module.mlp.dense_4h_to_h.qzeros = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.g_idx$", name)): - # # model.layers.layer_id.mlp.dense_4h_to_h.g_idx - # module.mlp.dense_4h_to_h.g_idx = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.bias$", name)): - # # model.layers.layer_id.mlp.dense_4h_to_h.bias - # module.mlp.dense_4h_to_h.bias = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.qweight$", name)): - # # 
model.layers.layer_id.mlp.dense_h_to_4h.qweight - # module.mlp.dense_h_to_4h.qweight = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.scales$", name)): - # # model.layers.layer_id.mlp.dense_h_to_4h.scales - # module.mlp.dense_h_to_4h.scales = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.qzeros$", name)): - # # model.layers.layer_id.mlp.dense_h_to_4h.qzeros - # module.mlp.dense_h_to_4h.qzeros = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.g_idx$", name)): - # # model.layers.layer_id.mlp.dense_h_to_4h.g_idx - # module.mlp.dense_h_to_4h.g_idx = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.bias$", name)): - # # model.layers.layer_id.mlp.dense_h_to_4h.bias - # module.mlp.dense_h_to_4h.bias = tensor elif bool(re.match(r"^model.layers\.\d+\.mlp.gate_proj\.qweight$", name)): # model.layers.layer_id.mlp.gate_proj.qweight module.mlp.gate_proj.qweight = tensor @@ -674,8 +553,8 @@ def pack_ort_format(self, module, intweight): class AWQModel(QuantizedModel): - def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type: str): - super().__init__(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type) + def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers): + super().__init__(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers) # Unpack and repack all `QuantizedTensorModule` classes in model for i, layer in enumerate(self.layers): @@ -686,7 +565,6 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in # Unpack and repack all `QuantizedTensorModule` classes in attention self_attn = getattr(layer, "self_attn", None) or getattr(layer, "self_attention", None) for name, q_tensors in self_attn.__dict__.items(): - print(name) if isinstance(q_tensors, QuantizedTensorModule) and q_tensors.qweight is not None: self.unpack(q_tensors) self.repack(q_tensors) @@ -743,8 +621,8 @@ def reverse_reorder_tensor(self, tensor, bits): class GPTQModel(QuantizedModel): - def __init__(self, quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers, model_type: str): - super().__init__(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type) + def __init__(self, quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers): + super().__init__(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers) # Unpack and repack all `QuantizedTensorModule` classes in model for i, layer in enumerate(self.layers): @@ -801,16 +679,16 @@ def __init__(self, module): class QuantModel: @staticmethod - def from_pretrained(quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers, model_type:str): + def from_pretrained(quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers): """ Unpack quantized weights in PyTorch models, store them in a standard format, and repack them into ONNX Runtime's format. Also performs any pre-processing and post-processing when unpacking the quantized weights. 
""" if quant_type == "awq": - model = AWQModel(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type) + model = AWQModel(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers) elif quant_type == "gptq": - model = GPTQModel(quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers, model_type) + model = GPTQModel(quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers) else: raise NotImplementedError(f"The {quant_type} quantized model is not currently supported.") From 938535cc3b4b6164bd884fe3483855c4ffaf94e1 Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Tue, 24 Sep 2024 00:47:10 +0000 Subject: [PATCH 09/17] minor_updates --- src/python/py/models/builder.py | 6 ++++++ src/python/py/models/quantized_model.py | 1 + 2 files changed, 7 insertions(+) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index ec019b324..448c2308a 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -1,5 +1,6 @@ # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. +# Modifications Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved # Licensed under the MIT License. See License.txt in the project root for # license information. # -------------------------------------------------------------------------- @@ -2677,6 +2678,8 @@ def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): def make_attention(self, layer_id, attention, root_input, **kwargs): if self.quant_type is None: super().make_attention_unpacked(layer_id, attention, root_input, **kwargs) + # Add dummy rotary_emb attribute + attention.rotary_emb = type("RotaryEmbedding", (object,), {'content':{}})() return super().make_attention(layer_id, attention, root_input, **kwargs) @@ -2685,6 +2688,9 @@ def make_mlp_proj(self, layer_id, mlp, root_input): super().make_mlp_unpacked(layer_id, mlp, root_input) super().make_mlp_proj(layer_id, mlp, root_input) + def make_layer(self, layer_id, layer): + layer.self_attn = layer.self_attn if hasattr(layer, 'self_attn') else layer.self_attention + super().make_layer(layer_id, layer) def check_extra_options(kv_pairs): if "use_8bits_moe" in kv_pairs: diff --git a/src/python/py/models/quantized_model.py b/src/python/py/models/quantized_model.py index 108212952..9afe5a6eb 100644 --- a/src/python/py/models/quantized_model.py +++ b/src/python/py/models/quantized_model.py @@ -1,5 +1,6 @@ # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. +# Modifications Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved # Licensed under the MIT License. See License.txt in the project root for # license information. 
# -------------------------------------------------------------------------- From 5e39727d3e874cc03586d8a6d4a5937a542c36f9 Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Tue, 24 Sep 2024 00:49:07 +0000 Subject: [PATCH 10/17] comment --- src/python/py/models/builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 448c2308a..30710d374 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -2761,7 +2761,7 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid onnx_model = Phi3VModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) elif config.architectures[0] == "Qwen2ForCausalLM": onnx_model = QwenModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) - elif config.architectures[0] == "ChatGLMForConditionalGeneration": + elif config.architectures[0] == "ChatGLMForConditionalGeneration" or config.architectures[0] == "ChatGLMModel": config.hidden_act = "swiglu" onnx_model = ChatGLMModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) else: From cfac49c5100639945ca972f19af2badde6df8778 Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Tue, 24 Sep 2024 01:39:16 +0000 Subject: [PATCH 11/17] correct_usernames --- src/python/py/models/builder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 30710d374..c84ce8d88 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -2762,6 +2762,7 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid elif config.architectures[0] == "Qwen2ForCausalLM": onnx_model = QwenModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) elif config.architectures[0] == "ChatGLMForConditionalGeneration" or config.architectures[0] == "ChatGLMModel": + # Quantized ChatGLM model has ChatGLMForConditionalGeneration as architecture whereas HF model as the latter config.hidden_act = "swiglu" onnx_model = ChatGLMModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) else: From e70dcfb6dbcf4cc039ce759efd5097133de56aac Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Tue, 24 Sep 2024 21:58:02 +0000 Subject: [PATCH 12/17] cleanup --- src/python/py/models/builder.py | 4 ++-- src/python/py/models/quantized_model.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index c84ce8d88..89163d544 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -1,9 +1,10 @@ # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. -# Modifications Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved # Licensed under the MIT License. See License.txt in the project root for # license information. # -------------------------------------------------------------------------- +# Modifications Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved + """ Run this script to create the desired ONNX model. 
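    Example (an assumed invocation for a ChatGLM3 checkpoint; the flag names
    follow the builder's usual command line and the model id and paths are
    placeholders, not part of this change):

        python3 builder.py -m THUDM/chatglm3-6b -o ./chatglm3-onnx -p int4 -e cpu -c ./cache_dir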
""" @@ -32,7 +33,6 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.num_kv_heads = config.multi_query_group_num if hasattr(config, "multi_query_group_num") else self.num_kv_heads self.kv_channels = config.kv_channels if hasattr(config, "kv_channels") else self.num_kv_heads self.multi_query_attention = config.multi_query_attention if hasattr(config, "multi_query_attention") else False - # self.multi_query_group_num = config.multi_query_group_num if hasattr(config, "multi_query_group_num") else 1 # group_num as 1 is vanilla Multi-query attention https://arxiv.org/pdf/2305.13245 self.num_attn_heads = config.num_attention_heads self.head_size = config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads self.num_layers = int(extra_options["num_hidden_layers"]) if "num_hidden_layers" in extra_options else config.num_hidden_layers if hasattr(config, "num_hidden_layers") else config.num_layers diff --git a/src/python/py/models/quantized_model.py b/src/python/py/models/quantized_model.py index 9afe5a6eb..1ee85322e 100644 --- a/src/python/py/models/quantized_model.py +++ b/src/python/py/models/quantized_model.py @@ -1,9 +1,9 @@ # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Modifications Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved -# Licensed under the MIT License. See License.txt in the project root for # license information. # -------------------------------------------------------------------------- +# Licensed under the MIT License. See License.txt in the project root for """ A set of Python classes to unpack the quantized weights and repack them in ONNX Runtime's standard format. From 44a5178b797b7a89ee853128170fdd5e39b0e142 Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Tue, 24 Sep 2024 22:35:31 +0000 Subject: [PATCH 13/17] fixed_license_headers --- src/python/py/models/builder.py | 1 - src/python/py/models/quantized_model.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 89163d544..89f89897a 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -4,7 +4,6 @@ # license information. # -------------------------------------------------------------------------- # Modifications Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved - """ Run this script to create the desired ONNX model. """ diff --git a/src/python/py/models/quantized_model.py b/src/python/py/models/quantized_model.py index 1ee85322e..2d15cb5fa 100644 --- a/src/python/py/models/quantized_model.py +++ b/src/python/py/models/quantized_model.py @@ -1,9 +1,9 @@ # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. -# Modifications Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved +# Licensed under the MIT License. See License.txt in the project root for # license information. # -------------------------------------------------------------------------- -# Licensed under the MIT License. See License.txt in the project root for +# Modifications Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved """ A set of Python classes to unpack the quantized weights and repack them in ONNX Runtime's standard format. 
From d8eb982686cf2ff285ad94de24a19434663fec38 Mon Sep 17 00:00:00 2001
From: amd-sudo-sh
Date: Tue, 24 Sep 2024 22:51:31 +0000
Subject: [PATCH 14/17] rm_unused_make_split

---
 src/python/py/models/builder.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index 89f89897a..b31f9425c 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -530,13 +530,6 @@ def make_gather(self, name, inputs, axis):
         self.make_node("Gather", inputs=inputs, outputs=[output], name=name, axis=axis)
         self.make_value_info(output, TensorProto.INT64, shape=[])

-    def make_split(self, name, inputs, dtype, shape, axis, num_splits):
-        # Splits the input tensor into num_splits based on the axis and shape
-        outputs = [f"{name}/output_{i}" for i in range(num_splits)]
-        self.make_node("Split", inputs=[inputs], outputs=outputs, name=name, axis=axis)
-        for output in outputs:
-            self.make_value_info(output, dtype, shape=shape)
-
     def make_reshape(self, name, inputs, dtype, shape):
         output = f"{name}/output_0"
         self.make_node("Reshape", inputs=inputs, outputs=[output], name=name)

From e116fa4ee47f041f6590276ffee78f518d221747 Mon Sep 17 00:00:00 2001
From: amd-sudo-sh
Date: Thu, 26 Sep 2024 17:17:36 +0000
Subject: [PATCH 15/17] refactor_config

---
 src/python/py/models/builder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index b31f9425c..02c0799a6 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -24,7 +24,7 @@ class Model:
     def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
         self.context_length = config.max_position_embeddings if hasattr(config, "max_position_embeddings") else config.seq_length
-        self.original_context_length = config.original_max_position_embeddings if hasattr(config, "original_max_position_embeddings") else config.rope_scaling["original_max_position_embeddings"] if hasattr(config, "rope_scaling") and hasattr(config.rope_scaling, "original_max_position_embeddings") else config.max_position_embeddings if hasattr(config, "max_position_embeddings") else config.seq_length
+        self.original_context_length = config.original_max_position_embeddings if hasattr(config, "original_max_position_embeddings") else config.rope_scaling["original_max_position_embeddings"] if hasattr(config, "rope_scaling") and hasattr(config.rope_scaling, "original_max_position_embeddings") else self.context_length
         self.window_size = config.sliding_window if hasattr(config, "sliding_window") else -1 # default is -1 in GroupQueryAttention kernel
         self.intermediate_size = config.intermediate_size if hasattr(config, "intermediate_size") else config.ffn_hidden_size
         self.hidden_size = config.hidden_size

From f76867300d35a9e3ee66a2779b965700ea256e73 Mon Sep 17 00:00:00 2001
From: amd-sudo-sh
Date: Thu, 26 Sep 2024 18:30:26 +0000
Subject: [PATCH 16/17] refactor_context_length_config

---
 src/python/py/models/builder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index 02c0799a6..0b6272bfc 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -23,7 +23,7 @@ class Model:
     def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
-        self.context_length = config.max_position_embeddings if hasattr(config, "max_position_embeddings") else config.seq_length
+        self.context_length = config.seq_length if hasattr(config, "seq_length") else config.max_position_embeddings
         self.original_context_length = config.original_max_position_embeddings if hasattr(config, "original_max_position_embeddings") else config.rope_scaling["original_max_position_embeddings"] if hasattr(config, "rope_scaling") and hasattr(config.rope_scaling, "original_max_position_embeddings") else self.context_length
         self.window_size = config.sliding_window if hasattr(config, "sliding_window") else -1 # default is -1 in GroupQueryAttention kernel
         self.intermediate_size = config.intermediate_size if hasattr(config, "intermediate_size") else config.ffn_hidden_size
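
PATCH 15 and PATCH 16 together make the context-length resolution order explicit: prefer ChatGLM's seq_length, fall back to max_position_embeddings, and let original_context_length default to whatever context_length resolved to. A small sketch of that fallback chain follows; resolve_context_length and the SimpleNamespace objects (with illustrative values) stand in for real Hugging Face configs.

# Hedged sketch of the context-length fallback from PATCH 15/16.
from types import SimpleNamespace

def resolve_context_length(config):
    # ChatGLM configs expose seq_length; most other configs expose max_position_embeddings
    return config.seq_length if hasattr(config, "seq_length") else config.max_position_embeddings

chatglm_like = SimpleNamespace(seq_length=8192)
llama_like = SimpleNamespace(max_position_embeddings=4096)
print(resolve_context_length(chatglm_like))  # 8192
print(resolve_context_length(llama_like))    # 4096
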
{self.layer_id}") self.make_layer(self.layer_id, module) self.layer_id += 1 - elif module.__class__.__name__.endswith("GLMBlock") and self.layer_id < self.num_layers: - print(f"Reading decoder layer {self.layer_id}") - self.make_layer(self.layer_id, module) - self.layer_id += 1 - elif self.layer_id == self.num_layers and self.has_final_norm(module, model): # SkipLayerNorm after last decoder layer (MatMul --> SkipLayerNorm) print("Reading final norm") self.make_layernorm(self.layer_id, module, skip=True, simple=self.layernorm_attrs["simple"], location="final_norm") + elif (isinstance(module, torch.nn.Linear) and module.out_features == self.vocab_size) or (hasattr(model, "lm_head") and module == model.lm_head): # Checks (Hugging Face logic) or (GGUF logic) if not self.exclude_lm_head: @@ -1814,12 +1807,13 @@ def make_model(self, input_path): del model def has_final_norm(self, module, model): - # Hugging Face names - hf_norm = hasattr(model, "model") and hasattr(model.model, "norm") and module == model.model.norm - hf_final_layernorm = hasattr(model, "model") and hasattr(model.model, "final_layernorm") and module == model.model.final_layernorm - # GGUF names - gguf_final_norm = hasattr(model, "final_norm") and module == model.final_norm - return hf_norm or hf_final_layernorm or gguf_final_norm + # Hugging Face names + hf_norm = hasattr(model, "model") and hasattr(model.model, "norm") and module == model.model.norm + hf_final_layernorm = hasattr(model, "model") and hasattr(model.model, "final_layernorm") and module == model.model.final_layernorm + hf_transformer_final_layernorm = hasattr(model, "transformer") and hasattr(model.transformer, "encoder") and hasattr(model.transformer.encoder, "final_layernorm") and module == model.transformer.encoder.final_layernorm + # GGUF names + gguf_final_norm = hasattr(model, "final_norm") and module == model.final_norm + return hf_norm or hf_final_layernorm or hf_transformer_final_layernorm or gguf_final_norm def make_preprocessing_nodes(self): self.make_attention_mask_reformatting() @@ -2645,24 +2639,12 @@ def make_layer(self, layer_id, layer): class ChatGLMModel(Model): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) - # self.input_shapes["position_ids"] = [1] # Note: This is optional and only needed if you want position_ids to be an int instead of a 2D tensor - self.layernorm_attrs["simple"] = True # RMSNorm in ChatGLM self.rotemb_attrs["num_heads"] = self.num_attn_heads self.rotemb_attrs["partial_rotary_factor"] = 0.5 # Line 755 of modeling_chatglm.py check self.rotary_pos_emb declaration self.rotemb_attrs["rotary_embedding_dim"] = int(self.head_size * self.rotemb_attrs["partial_rotary_factor"]) self.rotemb_attrs["interleaved"] = 1 - self.mlp_attrs["use_proj"], self.mlp_attrs["use_fc"] = True, False self.attention_attrs["use_rotemb_in_attn"] = True self.attention_attrs["use_packed_matmul"] = True - self.attention_attrs["op_type"] = "GroupQueryAttention" if self.multi_query_attention else self.attention_attrs["op_type"] - - def has_final_norm(self, module, model): - # Hugging Face names - hf_norm = hasattr(model, "model") and hasattr(model.model, "norm") and module == model.model.norm - hf_final_layernorm = hasattr(model, "transformer") and hasattr(model.transformer, "encoder") and hasattr(model.transformer.encoder, "final_layernorm") and module == model.transformer.encoder.final_layernorm - # GGUF names - gguf_final_norm = hasattr(model, 
"final_norm") and module == model.final_norm - return hf_norm or hf_final_layernorm or gguf_final_norm def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): super().make_rotary_embedding(rotemb, name, root_input, num_heads=self.rotemb_attrs["num_heads"], rotary_embedding_dim=self.rotemb_attrs["rotary_embedding_dim"], **kwargs) @@ -2758,9 +2740,7 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid config.hidden_act = "swiglu" onnx_model = ChatGLMModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) else: - raise NotImplementedError( - f"The {hf_name} model is not currently supported. Got {config}" - ) + raise NotImplementedError(f"The {hf_name} model is not currently supported.") # Make ONNX model onnx_model.make_model(input_path)