From d1b26a29355aa0f144563d8b885b51a595eb2f9e Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Thu, 12 Sep 2024 20:52:27 +0000 Subject: [PATCH 01/17] builder_support_for_chatglm_1 --- src/python/py/models/ChatGLM_modules.log | 128 ++++++++++++ src/python/py/models/builder.py | 236 ++++++++++++++++++++++- 2 files changed, 359 insertions(+), 5 deletions(-) create mode 100644 src/python/py/models/ChatGLM_modules.log diff --git a/src/python/py/models/ChatGLM_modules.log b/src/python/py/models/ChatGLM_modules.log new file mode 100644 index 000000000..03732df1b --- /dev/null +++ b/src/python/py/models/ChatGLM_modules.log @@ -0,0 +1,128 @@ +Valid precision + execution provider combinations are: FP32 CPU, FP32 CUDA, FP16 CUDA, FP16 DML, INT4 CPU, INT4 CUDA, INT4 DML +Extra options: {} +GroupQueryAttention (GQA) is used in this model. +########## +ChatGLMForConditionalGeneration( + (transformer): ChatGLMModel( + (embedding): Embedding( + (word_embeddings): Embedding(65024, 4096) + ) + (rotary_pos_emb): RotaryEmbedding() + (encoder): GLMTransformer( + (layers): ModuleList( + (0-27): 28 x GLMBlock( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (query_key_value): Linear(in_features=4096, out_features=4608, bias=True) + (core_attention): CoreAttention( + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (dense): Linear(in_features=4096, out_features=4096, bias=False) + ) + (post_attention_layernorm): RMSNorm() + (mlp): MLP( + (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False) + (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): Linear(in_features=4096, out_features=65024, bias=False) + ) +) +########## +ChatGLMModel( + (embedding): Embedding( + (word_embeddings): Embedding(65024, 4096) + ) + (rotary_pos_emb): RotaryEmbedding() + (encoder): GLMTransformer( + (layers): ModuleList( + (0-27): 28 x GLMBlock( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (query_key_value): Linear(in_features=4096, out_features=4608, bias=True) + (core_attention): CoreAttention( + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (dense): Linear(in_features=4096, out_features=4096, bias=False) + ) + (post_attention_layernorm): RMSNorm() + (mlp): MLP( + (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False) + (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) + ) + ) + ) + (final_layernorm): RMSNorm() + ) + (output_layer): Linear(in_features=4096, out_features=65024, bias=False) +) +########## +Embedding( + (word_embeddings): Embedding(65024, 4096) +) +########## +Embedding(65024, 4096) +Reading embedding layer +########## +RotaryEmbedding() +########## +GLMTransformer( + (layers): ModuleList( + (0-27): 28 x GLMBlock( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (query_key_value): Linear(in_features=4096, out_features=4608, bias=True) + (core_attention): CoreAttention( + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (dense): Linear(in_features=4096, out_features=4096, bias=False) + ) + (post_attention_layernorm): RMSNorm() + (mlp): MLP( + (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False) + (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) + ) + ) + ) + (final_layernorm): RMSNorm() +) +########## +ModuleList( + (0-27): 28 x GLMBlock( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (query_key_value): 
Linear(in_features=4096, out_features=4608, bias=True) + (core_attention): CoreAttention( + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (dense): Linear(in_features=4096, out_features=4096, bias=False) + ) + (post_attention_layernorm): RMSNorm() + (mlp): MLP( + (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False) + (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) + ) + ) +) +########## +GLMBlock( + (input_layernorm): RMSNorm() + (self_attention): SelfAttention( + (query_key_value): Linear(in_features=4096, out_features=4608, bias=True) + (core_attention): CoreAttention( + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (dense): Linear(in_features=4096, out_features=4096, bias=False) + ) + (post_attention_layernorm): RMSNorm() + (mlp): MLP( + (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False) + (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) + ) +) +Reading decoder layer 0 +['T_destination', '__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_allocate_memory', '_apply', '_backward_hooks', '_backward_pre_hooks', '_buffers', '_call_impl', '_compiled_call_impl', '_forward_hooks', '_forward_hooks_always_called', '_forward_hooks_with_kwargs', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_get_backward_hooks', '_get_backward_pre_hooks', '_get_name', '_is_full_backward_hook', '_is_hf_initialized', '_load_from_state_dict', '_load_state_dict_post_hooks', '_load_state_dict_pre_hooks', '_maybe_warn_non_full_backward_hook', '_modules', '_named_members', '_non_persistent_buffers_set', '_parameters', '_register_load_state_dict_pre_hook', '_register_state_dict_hook', '_replicate_for_data_parallel', '_save_to_state_dict', '_slow_forward', '_state_dict_hooks', '_state_dict_pre_hooks', '_version', '_wrapped_call_impl', 'add_module', 'apply', 'bfloat16', 'buffers', 'call_super_init', 'children', 'compile', 'core_attention', 'cpu', 'cuda', 'dense', 'double', 'dump_patches', 'eval', 'extra_repr', 'float', 'forward', 'get_buffer', 'get_extra_state', 'get_parameter', 'get_submodule', 'half', 'hidden_size_per_attention_head', 'ipu', 'layer_number', 'load_state_dict', 'modules', 'mtia', 'multi_query_attention', 'named_buffers', 'named_children', 'named_modules', 'named_parameters', 'num_attention_heads_per_partition', 'num_multi_query_groups_per_partition', 'parameters', 'projection_size', 'qkv_hidden_size', 'query_key_value', 'register_backward_hook', 'register_buffer', 'register_forward_hook', 'register_forward_pre_hook', 'register_full_backward_hook', 'register_full_backward_pre_hook', 'register_load_state_dict_post_hook', 'register_load_state_dict_pre_hook', 'register_module', 'register_parameter', 'register_state_dict_post_hook', 'register_state_dict_pre_hook', 'requires_grad_', 'set_extra_state', 'set_submodule', 'share_memory', 'state_dict', 'to', 'to_empty', 'train', 'training', 'type', 'xpu', 'zero_grad'] +True diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 1a845cd7c..ee9d4ee41 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -28,6 +28,10 @@ 
def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.intermediate_size = config.intermediate_size self.hidden_size = config.hidden_size self.num_kv_heads = config.num_key_value_heads if hasattr(config, "num_key_value_heads") else config.num_attention_heads + self.num_kv_heads = config.multi_query_group_num if hasattr(config, "multi_query_group_num") else self.num_kv_heads + self.kv_channels = config.kv_channels if hasattr(config, "kv_channels") else self.num_kv_heads + self.multi_query_attention = config.multi_query_attention if hasattr(config, "multi_query_attention") else False + # self.multi_query_group_num = config.multi_query_group_num if hasattr(config, "multi_query_group_num") else 1 # group_num as 1 is vanilla Multi-query attention https://arxiv.org/pdf/2305.13245 self.num_attn_heads = config.num_attention_heads self.head_size = config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads self.num_layers = int(extra_options["num_hidden_layers"]) if "num_hidden_layers" in extra_options else config.num_hidden_layers @@ -526,6 +530,14 @@ def make_gather(self, name, inputs, axis): self.make_node("Gather", inputs=inputs, outputs=[output], name=name, axis=axis) self.make_value_info(output, TensorProto.INT64, shape=[]) + def make_split(self, name, inputs, dtype, shape, axis, num_splits): + # Splits the input tensor into num_splits based on the axis + outputs = [f"{name}/output_{i}" for i in range(num_splits)] + split = [num_splits for i in range(num_splits)] + self.make_node("Split", inputs=[inputs], outputs=outputs, name=name, axis=axis, split = split) + for output in outputs: + self.make_value_info(output, dtype, shape=shape) + def make_reshape(self, name, inputs, dtype, shape): output = f"{name}/output_0" self.make_node("Reshape", inputs=inputs, outputs=[output], name=name) @@ -1663,8 +1675,51 @@ def make_gelu(self, layer_id, root_input, activation): return gelu_name + def make_swiglu(self, layer_id, root_input, activation, domain): + # Make nodes for this activation subgraph + # + # root_input (GateProjMatMul) + # / \ + # split/output_0 split/output_1 + # / | | + # ActFunc | | + # \ | | + # Mul | + # \ | + # \ | + # Mul + act_name = f"/model/layers.{layer_id}/mlp/act_fn" + + # Split the input into two parts along the last dimension + # When using swiglu the MLP projects to 2 times the intermediate_size + split_act_name = f"{act_name}/split" + num_splits = 2 + self.make_split(split_act_name, root_input, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size], axis = -1, num_splits=num_splits) + split_act_out_name_0 = f"{split_act_name}/output_0" + split_act_out_name_1 = f"{split_act_name}/output_1" + + act_name = f"{split_act_name}/{activation}" + act_func_output = f"{act_name}/output_0" + self.make_node(activation, inputs=[split_act_out_name_0], outputs=[act_func_output], name=act_name, domain=domain) + self.make_value_info(act_func_output, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) + + mul_act_name_0 = f"{act_name}/Mul_0" + mul_act_inputs_0 = [split_act_out_name_0, act_func_output] + self.make_mul(mul_act_name_0, mul_act_inputs_0, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) + + mul_act_name_1 = f"{act_name}/Mul_1" + mul_0_output = f"{mul_act_name_0}/output_0" + mul_act_inputs_1 = [split_act_out_name_1, mul_0_output] + self.make_mul(mul_act_name_1, mul_act_inputs_1, dtype=self.io_dtype, 
shape=["batch_size", "sequence_length", self.intermediate_size]) + + return mul_act_name_1 + + def make_activation(self, layer_id, root_input): - if self.activation in {"silu", "swish"}: + + if self.activation in {"swiglu"}: + output_name = self.make_swiglu(layer_id, root_input, activation="Sigmoid", domain=None) + elif self.activation in {"silu", "swish"}: output_name = self.make_activation_with_mul(layer_id, root_input, activation="Sigmoid", domain=None) elif self.activation in {"gelu_new", "gelu_fast", "gelu_pytorch_tanh"}: output_name = self.make_gelu(layer_id, root_input, activation="FastGelu") @@ -1752,7 +1807,10 @@ def make_model(self, input_path): # Loop through model and map each module to ONNX/ORT ops self.layer_id = 0 + model_name = "ChatGLM" if "ChatGLM" in model.__class__.__name__ else "" for module in model.modules(): + print("##########") + print(module) if isinstance(module, torch.nn.Embedding) or (hasattr(model, "embedding") and module == model.embedding): # Checks (Hugging Face logic) or (GGUF logic) if not self.exclude_embeds: @@ -1770,10 +1828,15 @@ def make_model(self, input_path): self.make_layer(self.layer_id, module) self.layer_id += 1 - elif self.layer_id == self.num_layers and self.has_final_norm(module, model): + elif module.__class__.__name__.endswith("GLMBlock") and self.layer_id < self.num_layers: + print(f"Reading decoder layer {self.layer_id}") + self.make_layer(self.layer_id, module) + self.layer_id += 1 + + elif self.layer_id == self.num_layers and self.has_final_norm(module, model, model_name): # SkipLayerNorm after last decoder layer (MatMul --> SkipLayerNorm) print("Reading final norm") - self.make_layernorm(self.layer_id, module, skip=True, simple=self.layernorm_attrs["simple"], location="final_norm") + # self.make_layernorm(self.layer_id, module, skip=True, simple=self.layernorm_attrs["simple"], location="final_norm") elif (isinstance(module, torch.nn.Linear) and module.out_features == self.vocab_size) or (hasattr(model, "lm_head") and module == model.lm_head): # Checks (Hugging Face logic) or (GGUF logic) @@ -1784,10 +1847,14 @@ def make_model(self, input_path): del model - def has_final_norm(self, module, model): + def has_final_norm(self, module, model, model_name): # Hugging Face names hf_norm = hasattr(model, "model") and hasattr(model.model, "norm") and module == model.model.norm - hf_final_layernorm = hasattr(model, "model") and hasattr(model.model, "final_layernorm") and module == model.model.final_layernorm + if(model_name == "ChatGLM"): + hf_final_layernorm = hasattr(model, "transformer") and hasattr(model.transformer, "encoder") and hasattr(model.transformer.encoder, "final_layernorm") and module == model.transformer.encoder.final_layernorm + else: + hf_final_layernorm = hasattr(model, "model") and hasattr(model.model, "final_layernorm") and module == model.model.final_layernorm + # GGUF names gguf_final_norm = hasattr(model, "final_norm") and module == model.final_norm return hf_norm or hf_final_layernorm or gguf_final_norm @@ -2613,6 +2680,158 @@ def make_layer(self, layer_id, layer): self.layernorm_attrs["last_layernorm"] = True +class ChatGLMModel(Model): + def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): + super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) + # self.input_shapes["position_ids"] = [1] # Note: This is optional and only needed if you want position_ids to be an int instead of a 2D tensor + # self.layernorm_attrs["first_layernorm"] = False # Manually use Residuals to 
no SkipLayerNorms + self.layernorm_attrs["simple"] = True # RMSNorm in ChatGLM + self.rotemb_attrs["num_heads"] = self.num_attn_heads + self.rotemb_attrs["rotary_embedding_dim"] = int(self.head_size * self.rotemb_attrs["partial_rotary_factor"]) + self.mlp_attrs["use_proj"], self.mlp_attrs["use_fc"] = True, False + self.attention_attrs["op_type"] = "GroupQueryAttention" if self.multi_query_attention else self.attention_attrs["op_type"] + + def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): + super().make_rotary_embedding(rotemb, name, root_input, num_heads=self.rotemb_attrs["num_heads"], rotary_embedding_dim=self.rotemb_attrs["rotary_embedding_dim"], **kwargs) + + + def make_attention(self, layer_id, attention, root_input, **kwargs): + #Designed from SelfAttention function of medeling_chatglm.py + hidden_size = self.hidden_size + num_attention_heads = self.num_attn_heads + kv_channels = self.kv_channels + head_size = self.head_size + + projection_size = kv_channels * num_attention_heads + hidden_size_per_attention_head = projection_size // num_attention_heads + + multi_query_attention = self.attention_attrs["op_type"] == "GroupQueryAttention" + multi_query_group_num = self.num_kv_heads if multi_query_attention else num_attention_heads + + if multi_query_attention: + qkv_hidden_size = projection_size + 2 * hidden_size_per_attention_head * multi_query_group_num + else: + qkv_hidden_size = 3 * projection_size + + # Reshape the QKV weight + qkv_weight = attention.query_key_value.weight.T + + if multi_query_attention: + q_weight, k_weight, v_weight = qkv_weight.split( + [ + num_attention_heads * hidden_size_per_attention_head, + multi_query_group_num * hidden_size_per_attention_head, + multi_query_group_num * hidden_size_per_attention_head, + ], + dim=1 + ) + else: + q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=1) + + # Reshape the QKV bias if it exists + if attention.query_key_value.bias is not None: + qkv_bias = attention.query_key_value.bias + if multi_query_attention: + q_bias, k_bias, v_bias = qkv_bias.split( + [ + num_attention_heads * hidden_size_per_attention_head, + multi_query_group_num * hidden_size_per_attention_head, + multi_query_group_num * hidden_size_per_attention_head, + ] + ) + else: + q_bias, k_bias, v_bias = qkv_bias.chunk(3) + else: + q_bias = k_bias = v_bias = None + + # Create separate Q, K, V projections + attention.q_proj = torch.nn.Linear(hidden_size, num_attention_heads * hidden_size_per_attention_head, bias=q_bias is not None) + attention.q_proj.weight = torch.nn.Parameter(q_weight.T) + if q_bias is not None: + attention.q_proj.bias = torch.nn.Parameter(q_bias) + + kv_size = multi_query_group_num * hidden_size_per_attention_head + + attention.k_proj = torch.nn.Linear(hidden_size, kv_size, bias=k_bias is not None) + attention.k_proj.weight = torch.nn.Parameter(k_weight.T) + if k_bias is not None: + attention.k_proj.bias = torch.nn.Parameter(k_bias) + + attention.v_proj = torch.nn.Linear(hidden_size, kv_size, bias=v_bias is not None) + attention.v_proj.weight = torch.nn.Parameter(v_weight.T) + if v_bias is not None: + attention.v_proj.bias = torch.nn.Parameter(v_bias) + + # Remove the original combined QKV projection + del attention.query_key_value + del qkv_weight + del qkv_bias + # Add dummy rotary_emb attribute + attention.rotary_emb = type("RotaryEmbedding", (object,), {'content':{}})() + + super().make_attention(layer_id, attention, root_input, **kwargs) + + def make_mlp_proj(self, layer_id, mlp, root_input): + # Make nodes for the MLP 
subgraph + # + # root_input + # | + # dense_h_to_4h + # | + # Activation + # | + # dense_4h_to_h + # Make MatMul nodes + # gate_basename = f"/model/layers.{layer_id}/mlp/gate_proj/MatMul" + # gate_name = self.make_matmul(mlp.gate_proj, gate_basename, root_input) + up_basename = f"/model/layers.{layer_id}/mlp/dense_h_to_4h/MatMul" + up_name = self.make_matmul(mlp.dense_h_to_4h, up_basename, root_input) + # Make activation node(s) + act_fn_name = self.make_activation(layer_id, root_input=f"{up_name}/output_0") + # # Make Mul node after activation + # mul_name = f"/model/layers.{layer_id}/mlp/Mul" + # mul_inputs = [f"{act_fn_name}/output_0", f"{up_name}/output_0"] + # self.make_mul(mul_name, mul_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) + # Make output MatMul node + down_basename = f"/model/layers.{layer_id}/mlp/dense_4h_to_h/MatMul" + down_name = self.make_matmul(mlp.dense_4h_to_h, down_basename, f"{act_fn_name}/output_0") + # Assign output 0 of previous MatMul as skip input to next SkipLayerNorm + self.layernorm_attrs["skip_input"] = f"{down_name}/output_0" + + + def make_layer(self, layer_id, layer): + # Each GLM encoder is defined as follows all LayerNorms are RMSNorms: + # input_layernorm --> self_attention --> residual_add_pre_input_layernorm --> layernorm --> dense --> residual_add_pre_last_layernorm + #TODO: @amd-sudo-sh Add the conditional statement for different residual configuration. + root_input = self.layernorm_attrs["root_input"] + self.make_layernorm(layer_id, layer.input_layernorm, skip=False, simple=self.layernorm_attrs["simple"], location="input") + self.make_attention(layer_id, layer.self_attention, root_input=self.layernorm_attrs["output_0"]) + + residual_add_name_0 = f"/model/layers.{layer_id}/residual_add/Add_0" + residual_add_inputs_0 = [self.layernorm_attrs['skip_input'], root_input] + next_residual_input = self.layernorm_attrs['skip_input'] + self.make_add(residual_add_name_0, residual_add_inputs_0, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + + self.layernorm_attrs["root_input"] = f"{residual_add_name_0}/output_0" + self.make_layernorm(layer_id, layer.input_layernorm, skip=False, simple=self.layernorm_attrs["simple"], location="middle") + + self.make_mlp(layer_id, layer.mlp, root_input=self.layernorm_attrs["output_0"]) #modifies the self.layernorm_attrs['skip_input'] + + residual_add_name_1 = f"/model/layers.{layer_id}/residual_add/Add_1" + residual_add_inputs_1 = [self.layernorm_attrs['skip_input'], next_residual_input] + self.make_add(residual_add_name_1, residual_add_inputs_1, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + + self.layernorm_attrs["root_input"] = f"{residual_add_name_1}/output_0" + + if layer_id == self.num_layers - 1: + # Norm after last decoder layer of model (last layer --> norm) + self.layernorm_attrs["last_layernorm"] = True + + # Assign output 0 of residual Add as skip input to next SkipLayerNorm + self.layernorm_attrs["skip_input"] = f"{residual_add_name_1}/output_0" + + + def check_extra_options(kv_pairs): if "use_8bits_moe" in kv_pairs: assert(kv_pairs["use_8bits_moe"] == "1" or kv_pairs["use_8bits_moe"] == "0"), "use_8bits_moe must be 0 or 1." 
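For reference, a minimal standalone sketch of the packed query_key_value split that the ChatGLMModel.make_attention override above performs, assuming ChatGLM3-6B shapes (hidden_size 4096, 32 attention heads, kv_channels 128, multi_query_group_num 2); the variable names below are illustrative only and are not part of builder.py:

import torch

hidden_size, num_heads, kv_channels, group_num = 4096, 32, 128, 2
head_dim = (kv_channels * num_heads) // num_heads            # hidden_size_per_attention_head
# Packed projection: q (32*128) + k (2*128) + v (2*128) = 4608, matching the Linear(4096, 4608) in the log above
qkv = torch.nn.Linear(hidden_size, num_heads * head_dim + 2 * group_num * head_dim, bias=True)
qkv_weight = qkv.weight.T                                     # shape (4096, 4608)
q_w, k_w, v_w = qkv_weight.split(
    [num_heads * head_dim, group_num * head_dim, group_num * head_dim], dim=-1
)
print(q_w.shape, k_w.shape, v_w.shape)                        # (4096, 4096) (4096, 256) (4096, 256)
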
@@ -2682,6 +2901,13 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid onnx_model = Phi3VModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) elif config.architectures[0] == "Qwen2ForCausalLM": onnx_model = QwenModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) + elif config.architectures[0] == "ChatGLMModel": + #TODO: @amd-sudo-sh: Encapsulate the config parsing in a better way + config.max_position_embeddings = config.seq_length # Max sequence length a model can handle + config.intermediate_size = config.ffn_hidden_size # Size of feed-forward network's hidden layer + config.num_hidden_layers = config.num_layers + config.hidden_act = "swiglu" + onnx_model = ChatGLMModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) else: raise NotImplementedError(f"The {hf_name} model is not currently supported.") From a51d036e62d637d70661769064065aa71a24e3d9 Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Fri, 13 Sep 2024 21:05:37 +0000 Subject: [PATCH 02/17] builder_changes --- src/python/py/models/builder.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index ee9d4ee41..aab232e4b 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -531,10 +531,11 @@ def make_gather(self, name, inputs, axis): self.make_value_info(output, TensorProto.INT64, shape=[]) def make_split(self, name, inputs, dtype, shape, axis, num_splits): + #TODO: @amd-sudo-sh: Currently it supports num_splits = 2 # Splits the input tensor into num_splits based on the axis outputs = [f"{name}/output_{i}" for i in range(num_splits)] - split = [num_splits for i in range(num_splits)] - self.make_node("Split", inputs=[inputs], outputs=outputs, name=name, axis=axis, split = split) + # split = [num_splits for i in range(num_splits)] + self.make_node("Split", inputs=[inputs], outputs=outputs, name=name, axis=axis) for output in outputs: self.make_value_info(output, dtype, shape=shape) @@ -1836,12 +1837,15 @@ def make_model(self, input_path): elif self.layer_id == self.num_layers and self.has_final_norm(module, model, model_name): # SkipLayerNorm after last decoder layer (MatMul --> SkipLayerNorm) print("Reading final norm") - # self.make_layernorm(self.layer_id, module, skip=True, simple=self.layernorm_attrs["simple"], location="final_norm") - + print(self.layernorm_attrs["root_input"]) + self.make_layernorm(self.layer_id, module, skip=False, simple=self.layernorm_attrs["simple"], location="final_norm") + self.layernorm_attrs["root_input"] = self.layernorm_attrs["output_0"] + print(self.layernorm_attrs["root_input"]) elif (isinstance(module, torch.nn.Linear) and module.out_features == self.vocab_size) or (hasattr(model, "lm_head") and module == model.lm_head): # Checks (Hugging Face logic) or (GGUF logic) if not self.exclude_lm_head: # Language modeling head (SkipLayerNorm --> logits) + print(self.layernorm_attrs["root_input"]) print("Reading LM head") self.make_lm_head(module) @@ -2687,8 +2691,11 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): # self.layernorm_attrs["first_layernorm"] = False # Manually use Residuals to no SkipLayerNorms self.layernorm_attrs["simple"] = True # RMSNorm in ChatGLM self.rotemb_attrs["num_heads"] = self.num_attn_heads + self.rotemb_attrs["partial_rotary_factor"] = 0.5 # Line 755 of modeling_chatglm.py check 
self.rotary_pos_emb declaration self.rotemb_attrs["rotary_embedding_dim"] = int(self.head_size * self.rotemb_attrs["partial_rotary_factor"]) self.mlp_attrs["use_proj"], self.mlp_attrs["use_fc"] = True, False + self.attention_attrs["use_rotemb_in_attn"] = True + self.attention_attrs["use_packed_matmul"] = True self.attention_attrs["op_type"] = "GroupQueryAttention" if self.multi_query_attention else self.attention_attrs["op_type"] def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): @@ -2700,7 +2707,7 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): hidden_size = self.hidden_size num_attention_heads = self.num_attn_heads kv_channels = self.kv_channels - head_size = self.head_size + # head_size = self.head_size projection_size = kv_channels * num_attention_heads hidden_size_per_attention_head = projection_size // num_attention_heads @@ -2723,10 +2730,10 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): multi_query_group_num * hidden_size_per_attention_head, multi_query_group_num * hidden_size_per_attention_head, ], - dim=1 + dim=-1 ) else: - q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=1) + q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=-1) # Reshape the QKV bias if it exists if attention.query_key_value.bias is not None: @@ -2782,17 +2789,12 @@ def make_mlp_proj(self, layer_id, mlp, root_input): # | # dense_4h_to_h # Make MatMul nodes - # gate_basename = f"/model/layers.{layer_id}/mlp/gate_proj/MatMul" - # gate_name = self.make_matmul(mlp.gate_proj, gate_basename, root_input) + up_basename = f"/model/layers.{layer_id}/mlp/dense_h_to_4h/MatMul" up_name = self.make_matmul(mlp.dense_h_to_4h, up_basename, root_input) # Make activation node(s) act_fn_name = self.make_activation(layer_id, root_input=f"{up_name}/output_0") - # # Make Mul node after activation - # mul_name = f"/model/layers.{layer_id}/mlp/Mul" - # mul_inputs = [f"{act_fn_name}/output_0", f"{up_name}/output_0"] - # self.make_mul(mul_name, mul_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) - # Make output MatMul node + down_basename = f"/model/layers.{layer_id}/mlp/dense_4h_to_h/MatMul" down_name = self.make_matmul(mlp.dense_4h_to_h, down_basename, f"{act_fn_name}/output_0") # Assign output 0 of previous MatMul as skip input to next SkipLayerNorm @@ -2813,7 +2815,7 @@ def make_layer(self, layer_id, layer): self.make_add(residual_add_name_0, residual_add_inputs_0, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) self.layernorm_attrs["root_input"] = f"{residual_add_name_0}/output_0" - self.make_layernorm(layer_id, layer.input_layernorm, skip=False, simple=self.layernorm_attrs["simple"], location="middle") + self.make_layernorm(layer_id, layer.input_layernorm, skip=False, simple=self.layernorm_attrs["simple"], location="post_attention") self.make_mlp(layer_id, layer.mlp, root_input=self.layernorm_attrs["output_0"]) #modifies the self.layernorm_attrs['skip_input'] From bc46b1cac5a1fc06dc16fa461aa27bb720992364 Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Wed, 18 Sep 2024 04:24:56 +0000 Subject: [PATCH 03/17] parity_checked_chatglm_model --- src/python/py/models/builder.py | 51 +++++++++++---------------------- 1 file changed, 16 insertions(+), 35 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index aab232e4b..8626e0986 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -1810,13 +1810,13 @@ def 
make_model(self, input_path): self.layer_id = 0 model_name = "ChatGLM" if "ChatGLM" in model.__class__.__name__ else "" for module in model.modules(): - print("##########") - print(module) + # print("##########") + # print(module) if isinstance(module, torch.nn.Embedding) or (hasattr(model, "embedding") and module == model.embedding): # Checks (Hugging Face logic) or (GGUF logic) if not self.exclude_embeds: # Embedding layer - print("Reading embedding layer") + # print("Reading embedding layer") self.make_embedding(module.weight.detach().numpy()) else: # Exclude embedding layer from model @@ -1837,15 +1837,15 @@ def make_model(self, input_path): elif self.layer_id == self.num_layers and self.has_final_norm(module, model, model_name): # SkipLayerNorm after last decoder layer (MatMul --> SkipLayerNorm) print("Reading final norm") - print(self.layernorm_attrs["root_input"]) - self.make_layernorm(self.layer_id, module, skip=False, simple=self.layernorm_attrs["simple"], location="final_norm") - self.layernorm_attrs["root_input"] = self.layernorm_attrs["output_0"] - print(self.layernorm_attrs["root_input"]) + # print(self.layernorm_attrs["root_input"]) + self.make_layernorm(self.layer_id, module, skip=True, simple=self.layernorm_attrs["simple"], location="final_norm") + # self.layernorm_attrs["root_input"] = self.layernorm_attrs["output_0"] + # print(self.layernorm_attrs["root_input"]) elif (isinstance(module, torch.nn.Linear) and module.out_features == self.vocab_size) or (hasattr(model, "lm_head") and module == model.lm_head): # Checks (Hugging Face logic) or (GGUF logic) if not self.exclude_lm_head: # Language modeling head (SkipLayerNorm --> logits) - print(self.layernorm_attrs["root_input"]) + # print(self.layernorm_attrs["root_input"]) print("Reading LM head") self.make_lm_head(module) @@ -2688,11 +2688,11 @@ class ChatGLMModel(Model): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) # self.input_shapes["position_ids"] = [1] # Note: This is optional and only needed if you want position_ids to be an int instead of a 2D tensor - # self.layernorm_attrs["first_layernorm"] = False # Manually use Residuals to no SkipLayerNorms self.layernorm_attrs["simple"] = True # RMSNorm in ChatGLM self.rotemb_attrs["num_heads"] = self.num_attn_heads self.rotemb_attrs["partial_rotary_factor"] = 0.5 # Line 755 of modeling_chatglm.py check self.rotary_pos_emb declaration self.rotemb_attrs["rotary_embedding_dim"] = int(self.head_size * self.rotemb_attrs["partial_rotary_factor"]) + self.rotemb_attrs["interleaved"] = 1 self.mlp_attrs["use_proj"], self.mlp_attrs["use_fc"] = True, False self.attention_attrs["use_rotemb_in_attn"] = True self.attention_attrs["use_packed_matmul"] = True @@ -2783,11 +2783,12 @@ def make_mlp_proj(self, layer_id, mlp, root_input): # # root_input # | - # dense_h_to_4h + # dense_h_to_4h #Misnomer, it is increased to 2h instead of 4h # | # Activation # | # dense_4h_to_h + # # Make MatMul nodes up_basename = f"/model/layers.{layer_id}/mlp/dense_h_to_4h/MatMul" @@ -2802,37 +2803,17 @@ def make_mlp_proj(self, layer_id, mlp, root_input): def make_layer(self, layer_id, layer): - # Each GLM encoder is defined as follows all LayerNorms are RMSNorms: - # input_layernorm --> self_attention --> residual_add_pre_input_layernorm --> layernorm --> dense --> residual_add_pre_last_layernorm - #TODO: @amd-sudo-sh Add the conditional statement for different residual configuration. 
- root_input = self.layernorm_attrs["root_input"] - self.make_layernorm(layer_id, layer.input_layernorm, skip=False, simple=self.layernorm_attrs["simple"], location="input") + # Each GLM encoder is defined as follows all LayerNorms are RMSNorms (Residual and Norm order same as LLama Model): + self.make_layernorm(layer_id, layer.input_layernorm, skip=not self.layernorm_attrs["first_layernorm"], simple=self.layernorm_attrs["simple"], location="input") self.make_attention(layer_id, layer.self_attention, root_input=self.layernorm_attrs["output_0"]) - - residual_add_name_0 = f"/model/layers.{layer_id}/residual_add/Add_0" - residual_add_inputs_0 = [self.layernorm_attrs['skip_input'], root_input] - next_residual_input = self.layernorm_attrs['skip_input'] - self.make_add(residual_add_name_0, residual_add_inputs_0, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) - - self.layernorm_attrs["root_input"] = f"{residual_add_name_0}/output_0" - self.make_layernorm(layer_id, layer.input_layernorm, skip=False, simple=self.layernorm_attrs["simple"], location="post_attention") - - self.make_mlp(layer_id, layer.mlp, root_input=self.layernorm_attrs["output_0"]) #modifies the self.layernorm_attrs['skip_input'] - - residual_add_name_1 = f"/model/layers.{layer_id}/residual_add/Add_1" - residual_add_inputs_1 = [self.layernorm_attrs['skip_input'], next_residual_input] - self.make_add(residual_add_name_1, residual_add_inputs_1, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) + self.make_layernorm(layer_id, layer.post_attention_layernorm, skip=True, simple=self.layernorm_attrs["simple"], location="post_attention") + self.make_mlp(layer_id, layer.mlp, root_input=self.layernorm_attrs["output_0"]) - self.layernorm_attrs["root_input"] = f"{residual_add_name_1}/output_0" - + self.layernorm_attrs["first_layernorm"] = False if layer_id == self.num_layers - 1: # Norm after last decoder layer of model (last layer --> norm) self.layernorm_attrs["last_layernorm"] = True - # Assign output 0 of residual Add as skip input to next SkipLayerNorm - self.layernorm_attrs["skip_input"] = f"{residual_add_name_1}/output_0" - - def check_extra_options(kv_pairs): if "use_8bits_moe" in kv_pairs: From 1c90219df0dc16d68ec1050ab30c948826da6f51 Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Thu, 19 Sep 2024 17:40:21 +0000 Subject: [PATCH 04/17] refractor_codebase --- src/python/py/models/ChatGLM_modules.log | 128 ----------------------- src/python/py/models/builder.py | 56 +++++----- 2 files changed, 26 insertions(+), 158 deletions(-) delete mode 100644 src/python/py/models/ChatGLM_modules.log diff --git a/src/python/py/models/ChatGLM_modules.log b/src/python/py/models/ChatGLM_modules.log deleted file mode 100644 index 03732df1b..000000000 --- a/src/python/py/models/ChatGLM_modules.log +++ /dev/null @@ -1,128 +0,0 @@ -Valid precision + execution provider combinations are: FP32 CPU, FP32 CUDA, FP16 CUDA, FP16 DML, INT4 CPU, INT4 CUDA, INT4 DML -Extra options: {} -GroupQueryAttention (GQA) is used in this model. 
-########## -ChatGLMForConditionalGeneration( - (transformer): ChatGLMModel( - (embedding): Embedding( - (word_embeddings): Embedding(65024, 4096) - ) - (rotary_pos_emb): RotaryEmbedding() - (encoder): GLMTransformer( - (layers): ModuleList( - (0-27): 28 x GLMBlock( - (input_layernorm): RMSNorm() - (self_attention): SelfAttention( - (query_key_value): Linear(in_features=4096, out_features=4608, bias=True) - (core_attention): CoreAttention( - (attention_dropout): Dropout(p=0.0, inplace=False) - ) - (dense): Linear(in_features=4096, out_features=4096, bias=False) - ) - (post_attention_layernorm): RMSNorm() - (mlp): MLP( - (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False) - (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) - ) - ) - ) - (final_layernorm): RMSNorm() - ) - (output_layer): Linear(in_features=4096, out_features=65024, bias=False) - ) -) -########## -ChatGLMModel( - (embedding): Embedding( - (word_embeddings): Embedding(65024, 4096) - ) - (rotary_pos_emb): RotaryEmbedding() - (encoder): GLMTransformer( - (layers): ModuleList( - (0-27): 28 x GLMBlock( - (input_layernorm): RMSNorm() - (self_attention): SelfAttention( - (query_key_value): Linear(in_features=4096, out_features=4608, bias=True) - (core_attention): CoreAttention( - (attention_dropout): Dropout(p=0.0, inplace=False) - ) - (dense): Linear(in_features=4096, out_features=4096, bias=False) - ) - (post_attention_layernorm): RMSNorm() - (mlp): MLP( - (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False) - (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) - ) - ) - ) - (final_layernorm): RMSNorm() - ) - (output_layer): Linear(in_features=4096, out_features=65024, bias=False) -) -########## -Embedding( - (word_embeddings): Embedding(65024, 4096) -) -########## -Embedding(65024, 4096) -Reading embedding layer -########## -RotaryEmbedding() -########## -GLMTransformer( - (layers): ModuleList( - (0-27): 28 x GLMBlock( - (input_layernorm): RMSNorm() - (self_attention): SelfAttention( - (query_key_value): Linear(in_features=4096, out_features=4608, bias=True) - (core_attention): CoreAttention( - (attention_dropout): Dropout(p=0.0, inplace=False) - ) - (dense): Linear(in_features=4096, out_features=4096, bias=False) - ) - (post_attention_layernorm): RMSNorm() - (mlp): MLP( - (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False) - (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) - ) - ) - ) - (final_layernorm): RMSNorm() -) -########## -ModuleList( - (0-27): 28 x GLMBlock( - (input_layernorm): RMSNorm() - (self_attention): SelfAttention( - (query_key_value): Linear(in_features=4096, out_features=4608, bias=True) - (core_attention): CoreAttention( - (attention_dropout): Dropout(p=0.0, inplace=False) - ) - (dense): Linear(in_features=4096, out_features=4096, bias=False) - ) - (post_attention_layernorm): RMSNorm() - (mlp): MLP( - (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False) - (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) - ) - ) -) -########## -GLMBlock( - (input_layernorm): RMSNorm() - (self_attention): SelfAttention( - (query_key_value): Linear(in_features=4096, out_features=4608, bias=True) - (core_attention): CoreAttention( - (attention_dropout): Dropout(p=0.0, inplace=False) - ) - (dense): Linear(in_features=4096, out_features=4096, bias=False) - ) - (post_attention_layernorm): RMSNorm() - (mlp): MLP( - (dense_h_to_4h): 
Linear(in_features=4096, out_features=27392, bias=False) - (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False) - ) -) -Reading decoder layer 0 -['T_destination', '__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_allocate_memory', '_apply', '_backward_hooks', '_backward_pre_hooks', '_buffers', '_call_impl', '_compiled_call_impl', '_forward_hooks', '_forward_hooks_always_called', '_forward_hooks_with_kwargs', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_get_backward_hooks', '_get_backward_pre_hooks', '_get_name', '_is_full_backward_hook', '_is_hf_initialized', '_load_from_state_dict', '_load_state_dict_post_hooks', '_load_state_dict_pre_hooks', '_maybe_warn_non_full_backward_hook', '_modules', '_named_members', '_non_persistent_buffers_set', '_parameters', '_register_load_state_dict_pre_hook', '_register_state_dict_hook', '_replicate_for_data_parallel', '_save_to_state_dict', '_slow_forward', '_state_dict_hooks', '_state_dict_pre_hooks', '_version', '_wrapped_call_impl', 'add_module', 'apply', 'bfloat16', 'buffers', 'call_super_init', 'children', 'compile', 'core_attention', 'cpu', 'cuda', 'dense', 'double', 'dump_patches', 'eval', 'extra_repr', 'float', 'forward', 'get_buffer', 'get_extra_state', 'get_parameter', 'get_submodule', 'half', 'hidden_size_per_attention_head', 'ipu', 'layer_number', 'load_state_dict', 'modules', 'mtia', 'multi_query_attention', 'named_buffers', 'named_children', 'named_modules', 'named_parameters', 'num_attention_heads_per_partition', 'num_multi_query_groups_per_partition', 'parameters', 'projection_size', 'qkv_hidden_size', 'query_key_value', 'register_backward_hook', 'register_buffer', 'register_forward_hook', 'register_forward_pre_hook', 'register_full_backward_hook', 'register_full_backward_pre_hook', 'register_load_state_dict_post_hook', 'register_load_state_dict_pre_hook', 'register_module', 'register_parameter', 'register_state_dict_post_hook', 'register_state_dict_pre_hook', 'requires_grad_', 'set_extra_state', 'set_submodule', 'share_memory', 'state_dict', 'to', 'to_empty', 'train', 'training', 'type', 'xpu', 'zero_grad'] -True diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 8626e0986..89c531683 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -21,11 +21,11 @@ class Model: - def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): - self.context_length = config.max_position_embeddings - self.original_context_length = config.original_max_position_embeddings if hasattr(config, "original_max_position_embeddings") else config.rope_scaling["original_max_position_embeddings"] if hasattr(config, "rope_scaling") and hasattr(config.rope_scaling, "original_max_position_embeddings") else config.max_position_embeddings + def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): + self.context_length = config.max_position_embeddings if hasattr(config, "max_position_embeddings") else config.seq_length + self.original_context_length = config.original_max_position_embeddings if hasattr(config, "original_max_position_embeddings") else 
config.rope_scaling["original_max_position_embeddings"] if hasattr(config, "rope_scaling") and hasattr(config.rope_scaling, "original_max_position_embeddings") else config.max_position_embeddings if hasattr(config, "max_position_embeddings") else config.seq_length self.window_size = config.sliding_window if hasattr(config, "sliding_window") else -1 # default is -1 in GroupQueryAttention kernel - self.intermediate_size = config.intermediate_size + self.intermediate_size = config.intermediate_size if hasattr(config, "intermediate_size") else config.ffn_hidden_size self.hidden_size = config.hidden_size self.num_kv_heads = config.num_key_value_heads if hasattr(config, "num_key_value_heads") else config.num_attention_heads self.num_kv_heads = config.multi_query_group_num if hasattr(config, "multi_query_group_num") else self.num_kv_heads @@ -34,7 +34,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): # self.multi_query_group_num = config.multi_query_group_num if hasattr(config, "multi_query_group_num") else 1 # group_num as 1 is vanilla Multi-query attention https://arxiv.org/pdf/2305.13245 self.num_attn_heads = config.num_attention_heads self.head_size = config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads - self.num_layers = int(extra_options["num_hidden_layers"]) if "num_hidden_layers" in extra_options else config.num_hidden_layers + self.num_layers = int(extra_options["num_hidden_layers"]) if "num_hidden_layers" in extra_options else config.num_hidden_layers if hasattr(config, "num_hidden_layers") else config.num_layers self.vocab_size = config.vocab_size self.activation = config.hidden_activation if hasattr(config, "hidden_activation") and config.hidden_activation is not None else config.hidden_act @@ -1808,15 +1808,13 @@ def make_model(self, input_path): # Loop through model and map each module to ONNX/ORT ops self.layer_id = 0 - model_name = "ChatGLM" if "ChatGLM" in model.__class__.__name__ else "" for module in model.modules(): - # print("##########") - # print(module) + if isinstance(module, torch.nn.Embedding) or (hasattr(model, "embedding") and module == model.embedding): # Checks (Hugging Face logic) or (GGUF logic) if not self.exclude_embeds: # Embedding layer - # print("Reading embedding layer") + print("Reading embedding layer") self.make_embedding(module.weight.detach().numpy()) else: # Exclude embedding layer from model @@ -1834,30 +1832,25 @@ def make_model(self, input_path): self.make_layer(self.layer_id, module) self.layer_id += 1 - elif self.layer_id == self.num_layers and self.has_final_norm(module, model, model_name): + elif self.layer_id == self.num_layers and self.has_final_norm(module, model): # SkipLayerNorm after last decoder layer (MatMul --> SkipLayerNorm) print("Reading final norm") - # print(self.layernorm_attrs["root_input"]) self.make_layernorm(self.layer_id, module, skip=True, simple=self.layernorm_attrs["simple"], location="final_norm") - # self.layernorm_attrs["root_input"] = self.layernorm_attrs["output_0"] - # print(self.layernorm_attrs["root_input"]) + elif (isinstance(module, torch.nn.Linear) and module.out_features == self.vocab_size) or (hasattr(model, "lm_head") and module == model.lm_head): # Checks (Hugging Face logic) or (GGUF logic) if not self.exclude_lm_head: # Language modeling head (SkipLayerNorm --> logits) - # print(self.layernorm_attrs["root_input"]) print("Reading LM head") self.make_lm_head(module) del model - def has_final_norm(self, module, model, 
model_name): + def has_final_norm(self, module, model): # Hugging Face names hf_norm = hasattr(model, "model") and hasattr(model.model, "norm") and module == model.model.norm - if(model_name == "ChatGLM"): - hf_final_layernorm = hasattr(model, "transformer") and hasattr(model.transformer, "encoder") and hasattr(model.transformer.encoder, "final_layernorm") and module == model.transformer.encoder.final_layernorm - else: - hf_final_layernorm = hasattr(model, "model") and hasattr(model.model, "final_layernorm") and module == model.model.final_layernorm + + hf_final_layernorm = hasattr(model, "model") and hasattr(model.model, "final_layernorm") and module == model.model.final_layernorm # GGUF names gguf_final_norm = hasattr(model, "final_norm") and module == model.final_norm @@ -2697,7 +2690,15 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.attention_attrs["use_rotemb_in_attn"] = True self.attention_attrs["use_packed_matmul"] = True self.attention_attrs["op_type"] = "GroupQueryAttention" if self.multi_query_attention else self.attention_attrs["op_type"] - + + def has_final_norm(self, module, model): + # Hugging Face names + hf_norm = hasattr(model, "model") and hasattr(model.model, "norm") and module == model.model.norm + hf_final_layernorm = hasattr(model, "transformer") and hasattr(model.transformer, "encoder") and hasattr(model.transformer.encoder, "final_layernorm") and module == model.transformer.encoder.final_layernorm + # GGUF names + gguf_final_norm = hasattr(model, "final_norm") and module == model.final_norm + return hf_norm or hf_final_layernorm or gguf_final_norm + def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): super().make_rotary_embedding(rotemb, name, root_input, num_heads=self.rotemb_attrs["num_heads"], rotary_embedding_dim=self.rotemb_attrs["rotary_embedding_dim"], **kwargs) @@ -2715,11 +2716,6 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): multi_query_attention = self.attention_attrs["op_type"] == "GroupQueryAttention" multi_query_group_num = self.num_kv_heads if multi_query_attention else num_attention_heads - if multi_query_attention: - qkv_hidden_size = projection_size + 2 * hidden_size_per_attention_head * multi_query_group_num - else: - qkv_hidden_size = 3 * projection_size - # Reshape the QKV weight qkv_weight = attention.query_key_value.weight.T @@ -2783,7 +2779,7 @@ def make_mlp_proj(self, layer_id, mlp, root_input): # # root_input # | - # dense_h_to_4h #Misnomer, it is increased to 2h instead of 4h + # dense_h_to_4h #Misnomer, it is increased to 2h instead of 4h, therefore in swiglu the intermediate size is same # | # Activation # | @@ -2885,10 +2881,10 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid elif config.architectures[0] == "Qwen2ForCausalLM": onnx_model = QwenModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) elif config.architectures[0] == "ChatGLMModel": - #TODO: @amd-sudo-sh: Encapsulate the config parsing in a better way - config.max_position_embeddings = config.seq_length # Max sequence length a model can handle - config.intermediate_size = config.ffn_hidden_size # Size of feed-forward network's hidden layer - config.num_hidden_layers = config.num_layers + # #TODO: @amd-sudo-sh: Encapsulate the config parsing in a better way + # config.max_position_embeddings = config.seq_length # Max sequence length a model can handle + # config.intermediate_size = config.ffn_hidden_size # Size of feed-forward 
network's hidden layer + # config.num_hidden_layers = config.num_layers config.hidden_act = "swiglu" onnx_model = ChatGLMModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) else: From dfdbf4f60883c8bc98a09543eb9332c7aced42ff Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Thu, 19 Sep 2024 17:44:48 +0000 Subject: [PATCH 05/17] refractor_codebase --- src/python/py/models/builder.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 89c531683..c5f8f297e 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -2881,10 +2881,6 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid elif config.architectures[0] == "Qwen2ForCausalLM": onnx_model = QwenModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) elif config.architectures[0] == "ChatGLMModel": - # #TODO: @amd-sudo-sh: Encapsulate the config parsing in a better way - # config.max_position_embeddings = config.seq_length # Max sequence length a model can handle - # config.intermediate_size = config.ffn_hidden_size # Size of feed-forward network's hidden layer - # config.num_hidden_layers = config.num_layers config.hidden_act = "swiglu" onnx_model = ChatGLMModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) else: From b20e07b70b4f57b3ff796dfe1da0ab546af99c05 Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Thu, 19 Sep 2024 18:21:39 +0000 Subject: [PATCH 06/17] chatglm_support_for_model_builder --- src/python/py/models/builder.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index c5f8f297e..5e31c55f9 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -531,10 +531,8 @@ def make_gather(self, name, inputs, axis): self.make_value_info(output, TensorProto.INT64, shape=[]) def make_split(self, name, inputs, dtype, shape, axis, num_splits): - #TODO: @amd-sudo-sh: Currently it supports num_splits = 2 - # Splits the input tensor into num_splits based on the axis + # Splits the input tensor into num_splits based on the axis and shape outputs = [f"{name}/output_{i}" for i in range(num_splits)] - # split = [num_splits for i in range(num_splits)] self.make_node("Split", inputs=[inputs], outputs=outputs, name=name, axis=axis) for output in outputs: self.make_value_info(output, dtype, shape=shape) @@ -1836,7 +1834,6 @@ def make_model(self, input_path): # SkipLayerNorm after last decoder layer (MatMul --> SkipLayerNorm) print("Reading final norm") self.make_layernorm(self.layer_id, module, skip=True, simple=self.layernorm_attrs["simple"], location="final_norm") - elif (isinstance(module, torch.nn.Linear) and module.out_features == self.vocab_size) or (hasattr(model, "lm_head") and module == model.lm_head): # Checks (Hugging Face logic) or (GGUF logic) if not self.exclude_lm_head: @@ -1849,9 +1846,7 @@ def make_model(self, input_path): def has_final_norm(self, module, model): # Hugging Face names hf_norm = hasattr(model, "model") and hasattr(model.model, "norm") and module == model.model.norm - hf_final_layernorm = hasattr(model, "model") and hasattr(model.model, "final_layernorm") and module == model.model.final_layernorm - # GGUF names gguf_final_norm = hasattr(model, "final_norm") and module == model.final_norm return hf_norm or hf_final_layernorm or gguf_final_norm @@ -2708,11 +2703,8 @@ def make_attention(self, 
layer_id, attention, root_input, **kwargs): hidden_size = self.hidden_size num_attention_heads = self.num_attn_heads kv_channels = self.kv_channels - # head_size = self.head_size - projection_size = kv_channels * num_attention_heads hidden_size_per_attention_head = projection_size // num_attention_heads - multi_query_attention = self.attention_attrs["op_type"] == "GroupQueryAttention" multi_query_group_num = self.num_kv_heads if multi_query_attention else num_attention_heads From 0bdf8434af5157ab1a79a3a8a913b3d802c43132 Mon Sep 17 00:00:00 2001 From: Bowen Bao Date: Sun, 22 Sep 2024 17:07:18 +0000 Subject: [PATCH 07/17] complete chatglm3 --- src/python/py/models/builder.py | 262 +++++++++++++----------- src/python/py/models/quantized_model.py | 238 +++++++++++++++++---- 2 files changed, 344 insertions(+), 156 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 5e31c55f9..2cc70d1b6 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -1443,14 +1443,21 @@ def make_mlp(self, layer_id, mlp, root_input): raise NotImplementedError(f"The MLP layer type is not set.") def make_mlp_unpacked(self, layer_id, mlp, root_input): + packed_proj = getattr(mlp, "gate_up_proj", None) or getattr( + mlp, "dense_h_to_4h", None + ) mlp.gate_proj = torch.nn.Linear(in_features=self.hidden_size, out_features=self.intermediate_size) - mlp.gate_proj.weight = torch.nn.Parameter(mlp.gate_up_proj.weight[ : self.intermediate_size, :]) + mlp.gate_proj.weight = torch.nn.Parameter( + packed_proj.weight[: self.intermediate_size, :] + ) mlp.up_proj = torch.nn.Linear(in_features=self.hidden_size, out_features=self.intermediate_size) - mlp.up_proj.weight = torch.nn.Parameter(mlp.gate_up_proj.weight[self.intermediate_size :, :]) + mlp.up_proj.weight = torch.nn.Parameter( + packed_proj.weight[self.intermediate_size :, :] + ) # Delete original packed weights - del mlp.gate_up_proj + del packed_proj def make_mlp_proj(self, layer_id, mlp, root_input): # Make nodes for the MLP subgraph @@ -1480,8 +1487,11 @@ def make_mlp_proj(self, layer_id, mlp, root_input): self.make_mul(mul_name, mul_inputs, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) # Make output MatMul node + down_proj = getattr(mlp, "down_proj", None) or getattr( + mlp, "dense_4h_to_h", None + ) down_basename = f"/model/layers.{layer_id}/mlp/down_proj/MatMul" - down_name = self.make_matmul(mlp.down_proj, down_basename, f"{mul_name}/output_0") + down_name = self.make_matmul(down_proj, down_basename, f"{mul_name}/output_0") # Assign output 0 of previous MatMul as skip input to next SkipLayerNorm self.layernorm_attrs["skip_input"] = f"{down_name}/output_0" @@ -1679,7 +1689,7 @@ def make_swiglu(self, layer_id, root_input, activation, domain): # # root_input (GateProjMatMul) # / \ - # split/output_0 split/output_1 + # split/output_0 split/output_1 # / | | # ActFunc | | # \ | | @@ -1688,7 +1698,7 @@ def make_swiglu(self, layer_id, root_input, activation, domain): # \ | # Mul act_name = f"/model/layers.{layer_id}/mlp/act_fn" - + # Split the input into two parts along the last dimension # When using swiglu the MLP projects to 2 times the intermediate_size split_act_name = f"{act_name}/split" @@ -1713,12 +1723,11 @@ def make_swiglu(self, layer_id, root_input, activation, domain): return mul_act_name_1 - def make_activation(self, layer_id, root_input): - if self.activation in {"swiglu"}: - output_name = self.make_swiglu(layer_id, root_input, activation="Sigmoid", 
domain=None) - elif self.activation in {"silu", "swish"}: + # if self.activation in {"swiglu"}: + # output_name = self.make_swiglu(layer_id, root_input, activation="Sigmoid", domain=None) + if self.activation in {"silu", "swish", "swiglu"}: output_name = self.make_activation_with_mul(layer_id, root_input, activation="Sigmoid", domain=None) elif self.activation in {"gelu_new", "gelu_fast", "gelu_pytorch_tanh"}: output_name = self.make_gelu(layer_id, root_input, activation="FastGelu") @@ -1798,7 +1807,18 @@ def make_model(self, input_path): from onnxruntime_genai.models.quantized_model import QuantModel q_size = self.num_attn_heads * self.head_size kv_size = self.num_kv_heads * self.head_size - model = QuantModel.from_pretrained(self.quant_type, input_path, self.quant_attrs["bits"], self.quant_attrs["group_size"], self.quant_attrs["use_g_idx"], q_size, kv_size, self.intermediate_size, self.num_layers) + model = QuantModel.from_pretrained( + self.quant_type, + input_path, + self.quant_attrs["bits"], + self.quant_attrs["group_size"], + self.quant_attrs["use_g_idx"], + q_size, + kv_size, + self.intermediate_size, + self.num_layers, + self.model_type, + ) else: # Load PyTorch model extra_kwargs = {"num_hidden_layers": self.num_layers} if "num_hidden_layers" in self.extra_options else {} @@ -1829,7 +1849,7 @@ def make_model(self, input_path): print(f"Reading decoder layer {self.layer_id}") self.make_layer(self.layer_id, module) self.layer_id += 1 - + elif self.layer_id == self.num_layers and self.has_final_norm(module, model): # SkipLayerNorm after last decoder layer (MatMul --> SkipLayerNorm) print("Reading final norm") @@ -2685,7 +2705,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.attention_attrs["use_rotemb_in_attn"] = True self.attention_attrs["use_packed_matmul"] = True self.attention_attrs["op_type"] = "GroupQueryAttention" if self.multi_query_attention else self.attention_attrs["op_type"] - + def has_final_norm(self, module, model): # Hugging Face names hf_norm = hasattr(model, "model") and hasattr(model.model, "norm") and module == model.model.norm @@ -2696,111 +2716,119 @@ def has_final_norm(self, module, model): def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): super().make_rotary_embedding(rotemb, name, root_input, num_heads=self.rotemb_attrs["num_heads"], rotary_embedding_dim=self.rotemb_attrs["rotary_embedding_dim"], **kwargs) - - - def make_attention(self, layer_id, attention, root_input, **kwargs): - #Designed from SelfAttention function of medeling_chatglm.py - hidden_size = self.hidden_size - num_attention_heads = self.num_attn_heads - kv_channels = self.kv_channels - projection_size = kv_channels * num_attention_heads - hidden_size_per_attention_head = projection_size // num_attention_heads - multi_query_attention = self.attention_attrs["op_type"] == "GroupQueryAttention" - multi_query_group_num = self.num_kv_heads if multi_query_attention else num_attention_heads - - # Reshape the QKV weight - qkv_weight = attention.query_key_value.weight.T - - if multi_query_attention: - q_weight, k_weight, v_weight = qkv_weight.split( - [ - num_attention_heads * hidden_size_per_attention_head, - multi_query_group_num * hidden_size_per_attention_head, - multi_query_group_num * hidden_size_per_attention_head, - ], - dim=-1 - ) - else: - q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=-1) - - # Reshape the QKV bias if it exists - if attention.query_key_value.bias is not None: - qkv_bias = attention.query_key_value.bias - if 
multi_query_attention: - q_bias, k_bias, v_bias = qkv_bias.split( - [ - num_attention_heads * hidden_size_per_attention_head, - multi_query_group_num * hidden_size_per_attention_head, - multi_query_group_num * hidden_size_per_attention_head, - ] - ) - else: - q_bias, k_bias, v_bias = qkv_bias.chunk(3) - else: - q_bias = k_bias = v_bias = None - - # Create separate Q, K, V projections - attention.q_proj = torch.nn.Linear(hidden_size, num_attention_heads * hidden_size_per_attention_head, bias=q_bias is not None) - attention.q_proj.weight = torch.nn.Parameter(q_weight.T) - if q_bias is not None: - attention.q_proj.bias = torch.nn.Parameter(q_bias) - - kv_size = multi_query_group_num * hidden_size_per_attention_head - - attention.k_proj = torch.nn.Linear(hidden_size, kv_size, bias=k_bias is not None) - attention.k_proj.weight = torch.nn.Parameter(k_weight.T) - if k_bias is not None: - attention.k_proj.bias = torch.nn.Parameter(k_bias) - - attention.v_proj = torch.nn.Linear(hidden_size, kv_size, bias=v_bias is not None) - attention.v_proj.weight = torch.nn.Parameter(v_weight.T) - if v_bias is not None: - attention.v_proj.bias = torch.nn.Parameter(v_bias) - # Remove the original combined QKV projection - del attention.query_key_value - del qkv_weight - del qkv_bias - # Add dummy rotary_emb attribute - attention.rotary_emb = type("RotaryEmbedding", (object,), {'content':{}})() - - super().make_attention(layer_id, attention, root_input, **kwargs) + def make_attention(self, layer_id, attention, root_input, **kwargs): + if self.quant_type is None: + super().make_attention_unpacked(layer_id, attention, root_input, **kwargs) + return super().make_attention(layer_id, attention, root_input, **kwargs) + + # def make_attention(self, layer_id, attention, root_input, **kwargs): + # #Designed from SelfAttention function of medeling_chatglm.py + # hidden_size = self.hidden_size + # num_attention_heads = self.num_attn_heads + # kv_channels = self.kv_channels + # projection_size = kv_channels * num_attention_heads + # hidden_size_per_attention_head = projection_size // num_attention_heads + # multi_query_attention = self.attention_attrs["op_type"] == "GroupQueryAttention" + # multi_query_group_num = self.num_kv_heads if multi_query_attention else num_attention_heads + + # # Reshape the QKV weight + # qkv_weight = attention.query_key_value.weight.T + + # if multi_query_attention: + # q_weight, k_weight, v_weight = qkv_weight.split( + # [ + # num_attention_heads * hidden_size_per_attention_head, + # multi_query_group_num * hidden_size_per_attention_head, + # multi_query_group_num * hidden_size_per_attention_head, + # ], + # dim=-1 + # ) + # else: + # q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=-1) + + # # Reshape the QKV bias if it exists + # if attention.query_key_value.bias is not None: + # qkv_bias = attention.query_key_value.bias + # if multi_query_attention: + # q_bias, k_bias, v_bias = qkv_bias.split( + # [ + # num_attention_heads * hidden_size_per_attention_head, + # multi_query_group_num * hidden_size_per_attention_head, + # multi_query_group_num * hidden_size_per_attention_head, + # ] + # ) + # else: + # q_bias, k_bias, v_bias = qkv_bias.chunk(3) + # else: + # q_bias = k_bias = v_bias = None + + # # Create separate Q, K, V projections + # attention.q_proj = torch.nn.Linear(hidden_size, num_attention_heads * hidden_size_per_attention_head, bias=q_bias is not None) + # attention.q_proj.weight = torch.nn.Parameter(q_weight.T) + # if q_bias is not None: + # attention.q_proj.bias = 
torch.nn.Parameter(q_bias) + + # kv_size = multi_query_group_num * hidden_size_per_attention_head + + # attention.k_proj = torch.nn.Linear(hidden_size, kv_size, bias=k_bias is not None) + # attention.k_proj.weight = torch.nn.Parameter(k_weight.T) + # if k_bias is not None: + # attention.k_proj.bias = torch.nn.Parameter(k_bias) + + # attention.v_proj = torch.nn.Linear(hidden_size, kv_size, bias=v_bias is not None) + # attention.v_proj.weight = torch.nn.Parameter(v_weight.T) + # if v_bias is not None: + # attention.v_proj.bias = torch.nn.Parameter(v_bias) + + # # Remove the original combined QKV projection + # del attention.query_key_value + # del qkv_weight + # del qkv_bias + # # Add dummy rotary_emb attribute + # attention.rotary_emb = type("RotaryEmbedding", (object,), {'content':{}})() + + # super().make_attention(layer_id, attention, root_input, **kwargs) def make_mlp_proj(self, layer_id, mlp, root_input): - # Make nodes for the MLP subgraph - # - # root_input - # | - # dense_h_to_4h #Misnomer, it is increased to 2h instead of 4h, therefore in swiglu the intermediate size is same - # | - # Activation - # | - # dense_4h_to_h - # - # Make MatMul nodes - - up_basename = f"/model/layers.{layer_id}/mlp/dense_h_to_4h/MatMul" - up_name = self.make_matmul(mlp.dense_h_to_4h, up_basename, root_input) - # Make activation node(s) - act_fn_name = self.make_activation(layer_id, root_input=f"{up_name}/output_0") - - down_basename = f"/model/layers.{layer_id}/mlp/dense_4h_to_h/MatMul" - down_name = self.make_matmul(mlp.dense_4h_to_h, down_basename, f"{act_fn_name}/output_0") - # Assign output 0 of previous MatMul as skip input to next SkipLayerNorm - self.layernorm_attrs["skip_input"] = f"{down_name}/output_0" - - - def make_layer(self, layer_id, layer): - # Each GLM encoder is defined as follows all LayerNorms are RMSNorms (Residual and Norm order same as LLama Model): - self.make_layernorm(layer_id, layer.input_layernorm, skip=not self.layernorm_attrs["first_layernorm"], simple=self.layernorm_attrs["simple"], location="input") - self.make_attention(layer_id, layer.self_attention, root_input=self.layernorm_attrs["output_0"]) - self.make_layernorm(layer_id, layer.post_attention_layernorm, skip=True, simple=self.layernorm_attrs["simple"], location="post_attention") - self.make_mlp(layer_id, layer.mlp, root_input=self.layernorm_attrs["output_0"]) + if self.quant_type is None: + super().make_mlp_unpacked(layer_id, mlp, root_input) + super().make_mlp_proj(layer_id, mlp, root_input) - self.layernorm_attrs["first_layernorm"] = False - if layer_id == self.num_layers - 1: - # Norm after last decoder layer of model (last layer --> norm) - self.layernorm_attrs["last_layernorm"] = True + # def make_mlp_proj(self, layer_id, mlp, root_input): + # # Make nodes for the MLP subgraph + # # + # # root_input + # # | + # # dense_h_to_4h #Misnomer, it is increased to 2h instead of 4h, therefore in swiglu the intermediate size is same + # # | + # # Activation + # # | + # # dense_4h_to_h + # # + # # Make MatMul nodes + + # up_basename = f"/model/layers.{layer_id}/mlp/dense_h_to_4h/MatMul" + # up_name = self.make_matmul(mlp.dense_h_to_4h, up_basename, root_input) + # # Make activation node(s) + # act_fn_name = self.make_activation(layer_id, root_input=f"{up_name}/output_0") + + # down_basename = f"/model/layers.{layer_id}/mlp/dense_4h_to_h/MatMul" + # down_name = self.make_matmul(mlp.dense_4h_to_h, down_basename, f"{act_fn_name}/output_0") + # # Assign output 0 of previous MatMul as skip input to next SkipLayerNorm + # 
self.layernorm_attrs["skip_input"] = f"{down_name}/output_0" + + # def make_layer(self, layer_id, layer): + # # Each GLM encoder is defined as follows all LayerNorms are RMSNorms (Residual and Norm order same as LLama Model): + # self.make_layernorm(layer_id, layer.input_layernorm, skip=not self.layernorm_attrs["first_layernorm"], simple=self.layernorm_attrs["simple"], location="input") + # self.make_attention(layer_id, layer.self_attention, root_input=self.layernorm_attrs["output_0"]) + # self.make_layernorm(layer_id, layer.post_attention_layernorm, skip=True, simple=self.layernorm_attrs["simple"], location="post_attention") + # self.make_mlp(layer_id, layer.mlp, root_input=self.layernorm_attrs["output_0"]) + + # self.layernorm_attrs["first_layernorm"] = False + # if layer_id == self.num_layers - 1: + # # Norm after last decoder layer of model (last layer --> norm) + # self.layernorm_attrs["last_layernorm"] = True def check_extra_options(kv_pairs): @@ -2872,11 +2900,13 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid onnx_model = Phi3VModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) elif config.architectures[0] == "Qwen2ForCausalLM": onnx_model = QwenModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) - elif config.architectures[0] == "ChatGLMModel": + elif config.architectures[0] == "ChatGLMForConditionalGeneration": config.hidden_act = "swiglu" onnx_model = ChatGLMModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) else: - raise NotImplementedError(f"The {hf_name} model is not currently supported.") + raise NotImplementedError( + f"The {hf_name} model is not currently supported. Got {config}" + ) # Make ONNX model onnx_model.make_model(input_path) diff --git a/src/python/py/models/quantized_model.py b/src/python/py/models/quantized_model.py index f15f21cb9..f2717a3ee 100644 --- a/src/python/py/models/quantized_model.py +++ b/src/python/py/models/quantized_model.py @@ -32,6 +32,16 @@ def __init__(self, bits, group_size): self.bits = bits self.group_size = group_size + def set_properties(self, quant_type: str): + if quant_type == "awq": + self.out_features = self.scales.shape[1] + self.in_features = self.qweight.shape[0] + elif quant_type == "gptq": + self.out_features = self.qweight.shape[1] + self.in_features = self.q_proj.g_idx.shape[0] + else: + raise NotImplementedError(f"The {quant_type} quantization method is not recognized.") + def __str__(self): qweight = f"qweight = {self.qweight.shape}, {self.qweight}\n" scales = f"scales = {self.scales.shape}, {self.scales}\n" @@ -60,6 +70,21 @@ def __init__(self, bits, group_size): self.o_proj = QuantizedTensorModule(bits, group_size) self.rotary_emb = TensorModule() + def set_properties(self, quant_type: str): + self.q_proj.set_properties(quant_type) + self.k_proj.set_properties(quant_type) + self.v_proj.set_properties(quant_type) + self.o_proj.set_properties(quant_type) + +class QuantizedChatglm3Attention: + def __init__(self, bits, group_size): + self.query_key_value = QuantizedTensorModule(bits, group_size) + self.dense = QuantizedTensorModule(bits, group_size) + self.rotary_emb = TensorModule() + + def set_properties(self, quant_type: str): + self.query_key_value.set_properties(quant_type) + self.dense.set_properties(quant_type) class QuantizedMLP: def __init__(self, bits, group_size): @@ -69,6 +94,21 @@ def __init__(self, bits, group_size): self.fc1 = QuantizedTensorModule(bits, group_size) self.fc2 = 
QuantizedTensorModule(bits, group_size) + def set_properties(self, quant_type: str): + self.gate_proj.set_properties(quant_type) + self.up_proj.set_properties(quant_type) + self.down_proj.set_properties(quant_type) + self.fc1.set_properties(quant_type) + self.fc2.set_properties(quant_type) + +class QuantizedChatglm3MLP: + def __init__(self, bits, group_size): + self.dense_4h_to_h = QuantizedTensorModule(bits, group_size) + self.dense_h_to_4h = QuantizedTensorModule(bits, group_size) + + def set_properties(self, quant_type: str): + self.dense_4h_to_h.set_properties(quant_type) + self.dense_h_to_4h.set_properties(quant_type) class QuantizedDecoderLayer: def __init__(self, layer_id, bits, group_size): @@ -81,9 +121,27 @@ def __init__(self, layer_id, bits, group_size): def is_empty(self): return self.input_layernorm.weight is None + def set_properties(self, quant_type: str): + self.self_attn.set_properties(quant_type) + self.mlp.set_properties(quant_type) + +class QuantizedChatglm3EncoderLayer: + def __init__(self, layer_id, bits, group_size): + self.layer_id = layer_id + self.input_layernorm = TensorModule() + self.self_attention = QuantizedChatglm3Attention(bits, group_size) + self.post_attention_layernorm = TensorModule() + self.mlp = QuantizedChatglm3MLP(bits, group_size) + + def is_empty(self): + return self.input_layernorm.weight is None + + def set_properties(self, quant_type: str): + self.self_attention.set_properties(quant_type) + self.mlp.set_properties(quant_type) class QuantizedModel: - def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers): + def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type: str): self.quant_type = quant_type self.embedding = TensorModule() self.final_norm = TensorModule() @@ -91,34 +149,48 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in self.layers = {} self.num_layers = num_layers + q_layer_cls = QuantizedDecoderLayer + # if model_type == "ChatGLMForConditionalGeneration": + # q_layer_cls = QuantizedChatglm3EncoderLayer + # print(q_layer_cls) + layer_id = 0 for weight_file in os.listdir(input_path): if weight_file.endswith(".safetensors"): - module = self.layers.setdefault(layer_id, QuantizedDecoderLayer(layer_id, bits, group_size)) + module = self.layers.setdefault(layer_id, q_layer_cls(layer_id, bits, group_size)) weights = load_file(os.path.join(input_path, weight_file)) + for name, _ in weights.items(): + print(name) # Map weights to modules for name, tensor in weights.items(): + print(name) if tensor.dtype == torch.bfloat16: # Cast bfloat16 to float32 since NumPy does not support bfloat16 tensor = tensor.to(torch.float32) - - if name == "model.embed_tokens.weight": + if name == "model.embed_tokens.weight" or name == "transformer.embedding.word_embeddings.weight": self.embedding.weight = tensor - elif name == "model.norm.weight": + elif name == "model.norm.weight" or name == "transformer.encoder.final_layernorm.weight": self.final_norm.weight = tensor - elif name == "model.norm.bias": + elif name == "model.norm.bias" or name == "transformer.encoder.final_layernorm.bias": self.final_norm.bias = tensor - elif name == "lm_head.weight": + elif name == "lm_head.weight" or name == "transformer.output_layer.weight": self.lm_head.weight = tensor - elif name == "lm_head.bias": + elif name == "lm_head.bias" or name == "transformer.output_layer.bias": self.lm_head.bias = tensor + elif name == 
"transformer.rotary_pos_emb.inv_freq": + # transformer.rotary_pos_emb.inv_freq in ChatGLM3. + # Skip rotary embedding weights since they can be re-calculated when looping through the model + continue else: + if name.startswith("transformer.encoder"): + # Chatglm3, e.g., transformer.encoder.layers.0.input_layernorm.weight + name = name.replace("transformer.encoder", "model") curr_layer_id = int(name.split(".")[2]) if curr_layer_id != layer_id: # Switch layer module used layer_id = curr_layer_id - module = self.layers.setdefault(layer_id, QuantizedDecoderLayer(layer_id, bits, group_size)) + module = self.layers.setdefault(layer_id, q_layer_cls(layer_id, bits, group_size)) # Map weights and biases of norm, attention, and feed-forward network # Graph order is input_layernorm --> q_proj/k_proj/v_proj --> o_proj --> post_attention_layernorm --> gate_proj/up_proj --> down_proj @@ -177,27 +249,92 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in elif bool(re.match(r"^model.layers\.\d+\.self_attn.v_proj\.bias$", name)): # model.layers.layer_id.self_attn.v_proj.bias module.self_attn.v_proj.bias = tensor - elif bool(re.match(r"^model.layers\.\d+\.self_attn.o_proj\.qweight$", name)): + # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.qweight$", name)): + # # model.layers.layer_id.self_attention.query_key_value.qweight + # module.self_attention.query_key_value.qweight = tensor + # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.scales$", name)): + # # model.layers.layer_id.self_attention.query_key_value.scales + # module.self_attention.query_key_value.scales = tensor + # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.qzeros$", name)): + # # model.layers.layer_id.self_attention.query_key_value.qzeros + # module.self_attention.query_key_value.qzeros = tensor + # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.g_idx$", name)): + # # model.layers.layer_id.self_attention.query_key_value.g_idx + # module.self_attention.query_key_value.g_idx = tensor + # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.bias$", name)): + # # model.layers.layer_id.self_attention.query_key_value.bias + # module.self_attention.query_key_value.bias = tensor + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.qweight$", name)): # model.layers.layer_id.self_attn.o_proj.qweight + # model.layers.layer_id.self_attention.dense.qweight module.self_attn.o_proj.qweight = tensor - elif bool(re.match(r"^model.layers\.\d+\.self_attn.o_proj\.scales$", name)): + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.scales$", name)): # model.layers.layer_id.self_attn.o_proj.scales + # model.layers.layer_id.self_attention.dense.scales module.self_attn.o_proj.scales = tensor - elif bool(re.match(r"^model.layers\.\d+\.self_attn.o_proj\.qzeros$", name)): + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.qzeros$", name)): # model.layers.layer_id.self_attn.o_proj.qzeros + # model.layers.layer_id.self_attention.dense.qzeros module.self_attn.o_proj.qzeros = tensor - elif bool(re.match(r"^model.layers\.\d+\.self_attn.o_proj\.g_idx$", name)): + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.g_idx$", name)): # model.layers.layer_id.self_attn.o_proj.g_idx + # model.layers.layer_id.self_attention.dense.g_idx module.self_attn.o_proj.g_idx = tensor - elif 
bool(re.match(r"^model.layers\.\d+\.self_attn.o_proj\.bias$", name)): + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.bias$", name)): # model.layers.layer_id.self_attn.o_proj.bias + # model.layers.layer_id.self_attention.dense.bias module.self_attn.o_proj.bias = tensor + # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.qweight$", name)): + # # model.layers.layer_id.self_attention.dense.qweight + # module.self_attention.dense.qweight = tensor + # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.scales$", name)): + # # model.layers.layer_id.self_attention.dense.scales + # module.self_attention.dense.scales = tensor + # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.qzeros$", name)): + # # model.layers.layer_id.self_attention.dense.qzeros + # module.self_attention.dense.qzeros = tensor + # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.g_idx$", name)): + # # model.layers.layer_id.self_attention.dense.g_idx + # module.self_attention.dense.g_idx = tensor + # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.bias$", name)): + # # model.layers.layer_id.self_attention.dense.bias + # module.self_attention.dense.bias = tensor elif bool(re.match(r"^model.layers\.\d+\.post_attention_layernorm\.weight$", name)): # model.layers.layer_id.post_attention_layernorm.weight module.post_attention_layernorm.weight = tensor elif bool(re.match(r"^model.layers\.\d+\.post_attention_layernorm\.bias$", name)): # model.layers.layer_id.post_attention_layernorm.bias module.post_attention_layernorm.bias = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.qweight$", name)): + # # model.layers.layer_id.mlp.dense_4h_to_h.qweight + # module.mlp.dense_4h_to_h.qweight = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.scales$", name)): + # # model.layers.layer_id.mlp.dense_4h_to_h.scales + # module.mlp.dense_4h_to_h.scales = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.qzeros$", name)): + # # model.layers.layer_id.mlp.dense_4h_to_h.qzeros + # module.mlp.dense_4h_to_h.qzeros = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.g_idx$", name)): + # # model.layers.layer_id.mlp.dense_4h_to_h.g_idx + # module.mlp.dense_4h_to_h.g_idx = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.bias$", name)): + # # model.layers.layer_id.mlp.dense_4h_to_h.bias + # module.mlp.dense_4h_to_h.bias = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.qweight$", name)): + # # model.layers.layer_id.mlp.dense_h_to_4h.qweight + # module.mlp.dense_h_to_4h.qweight = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.scales$", name)): + # # model.layers.layer_id.mlp.dense_h_to_4h.scales + # module.mlp.dense_h_to_4h.scales = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.qzeros$", name)): + # # model.layers.layer_id.mlp.dense_h_to_4h.qzeros + # module.mlp.dense_h_to_4h.qzeros = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.g_idx$", name)): + # # model.layers.layer_id.mlp.dense_h_to_4h.g_idx + # module.mlp.dense_h_to_4h.g_idx = tensor + # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.bias$", name)): + # # model.layers.layer_id.mlp.dense_h_to_4h.bias + # module.mlp.dense_h_to_4h.bias = tensor elif bool(re.match(r"^model.layers\.\d+\.mlp.gate_proj\.qweight$", name)): # model.layers.layer_id.mlp.gate_proj.qweight 
module.mlp.gate_proj.qweight = tensor @@ -228,62 +365,81 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in elif bool(re.match(r"^model.layers\.\d+\.mlp.up_proj\.bias$", name)): # model.layers.layer_id.mlp.up_proj.bias module.mlp.up_proj.bias = tensor - elif bool(re.match(r"^model.layers\.\d+\.mlp.down_proj\.qweight$", name)): + elif bool(re.match(r"^model.layers\.\d+\.mlp.(down_proj|dense_4h_to_h)\.qweight$", name)): # model.layers.layer_id.mlp.down_proj.qweight + # model.layers.layer_id.mlp.dense_4h_to_h.qweight module.mlp.down_proj.qweight = tensor - elif bool(re.match(r"^model.layers\.\d+\.mlp.down_proj\.scales$", name)): + elif bool(re.match(r"^model.layers\.\d+\.mlp.(down_proj|dense_4h_to_h)\.scales$", name)): # model.layers.layer_id.mlp.down_proj.scales + # model.layers.layer_id.mlp.dense_4h_to_h.scales module.mlp.down_proj.scales = tensor - elif bool(re.match(r"^model.layers\.\d+\.mlp.down_proj\.qzeros$", name)): + elif bool(re.match(r"^model.layers\.\d+\.mlp.(down_proj|dense_4h_to_h)\.qzeros$", name)): # model.layers.layer_id.mlp.down_proj.qzeros + # model.layers.layer_id.mlp.dense_4h_to_h.qzeros module.mlp.down_proj.qzeros = tensor - elif bool(re.match(r"^model.layers\.\d+\.mlp.down_proj\.g_idx$", name)): + elif bool(re.match(r"^model.layers\.\d+\.mlp.(down_proj|dense_4h_to_h)\.g_idx$", name)): # model.layers.layer_id.mlp.down_proj.g_idx + # model.layers.layer_id.mlp.dense_4h_to_h.g_idx module.mlp.down_proj.g_idx = tensor - elif bool(re.match(r"^model.layers\.\d+\.mlp.down_proj\.bias$", name)): + elif bool(re.match(r"^model.layers\.\d+\.mlp.(down_proj|dense_4h_to_h)\.bias$", name)): # model.layers.layer_id.mlp.down_proj.bias + # model.layers.layer_id.mlp.dense_4h_to_h.bias module.mlp.down_proj.bias = tensor # Match against fused layers - elif bool(re.match(r"^model.layers\.\d+\.self_attn.qkv_proj\.qweight$", name)): + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.qweight$", name)): # model.layers.layer_id.self_attn.qkv_proj.qweight + # model.layers.layer_id.self_attention.query_key_value.qweight q_dim = q_size // (32 // bits) if quant_type == "awq" else q_size kv_dim = kv_size // (32 // bits) if quant_type == "awq" else kv_size module.self_attn.q_proj.qweight = tensor[:, : q_dim] module.self_attn.k_proj.qweight = tensor[:, q_dim : q_dim + kv_dim] module.self_attn.v_proj.qweight = tensor[:, q_dim + kv_dim :] - elif bool(re.match(r"^model.layers\.\d+\.self_attn.qkv_proj\.scales$", name)): + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.scales$", name)): # model.layers.layer_id.self_attn.qkv_proj.scales + # model.layers.layer_id.self_attention.query_key_value.scales module.self_attn.q_proj.scales = tensor[:, : q_size] module.self_attn.k_proj.scales = tensor[:, q_size : q_size + kv_size] module.self_attn.v_proj.scales = tensor[:, q_size + kv_size :] - elif bool(re.match(r"^model.layers\.\d+\.self_attn.qkv_proj\.qzeros$", name)): + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.qzeros$", name)): # model.layers.layer_id.self_attn.qkv_proj.qzeros + # model.layers.layer_id.self_attention.query_key_value.qzeros q_dim = q_size // (32 // bits) if quant_type in {"awq", "gptq"} else q_size kv_dim = kv_size // (32 // bits) if quant_type in {"awq", "gptq"} else kv_size module.self_attn.q_proj.qzeros = tensor[:, : q_dim] module.self_attn.k_proj.qzeros = tensor[:, q_dim : q_dim + kv_dim] module.self_attn.v_proj.qzeros = 
tensor[:, q_dim + kv_dim :] - elif bool(re.match(r"^model.layers\.\d+\.self_attn.qkv_proj\.g_idx$", name)): + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.g_idx$", name)): # model.layers.layer_id.self_attn.qkv_proj.g_ix + # model.layers.layer_id.self_attention.query_key_value.g_idx module.self_attn.q_proj.g_idx = tensor module.self_attn.k_proj.g_idx = tensor module.self_attn.v_proj.g_idx = tensor - elif bool(re.match(r"^model.layers\.\d+\.mlp.gate_up_proj\.qweight$", name)): + elif bool(re.match(r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.bias$", name)): + # model.layers.layer_id.self_attn.qkv_proj.bias + # model.layers.layer_id.self_attention.query_key_value.bias + module.self_attn.q_proj.bias = tensor[: q_size] + module.self_attn.k_proj.bias = tensor[q_size : q_size + kv_size] + module.self_attn.v_proj.bias = tensor[q_size + kv_size : ] + elif bool(re.match(r"^model.layers\.\d+\.mlp.(gate_up_proj|dense_h_to_4h)\.qweight$", name)): # model.layers.layer_id.mlp.gate_up_proj.qweight + # model.layers.layer_id.mlp.dense_h_to_4h.qweight intermediate_dim = intermediate_size // (32 // bits) if quant_type == "awq" else intermediate_size module.mlp.gate_proj.qweight = tensor[:, : intermediate_dim] module.mlp.up_proj.qweight = tensor[:, intermediate_dim :] - elif bool(re.match(r"^model.layers\.\d+\.mlp.gate_up_proj\.scales$", name)): + elif bool(re.match(r"^model.layers\.\d+\.mlp.(gate_up_proj|dense_h_to_4h)\.scales$", name)): # model.layers.layer_id.mlp.gate_up_proj.scales + # model.layers.layer_id.mlp.dense_h_to_4h.scales module.mlp.gate_proj.scales = tensor[:, : intermediate_size] module.mlp.up_proj.scales = tensor[:, intermediate_size :] - elif bool(re.match(r"^model.layers\.\d+\.mlp.gate_up_proj\.qzeros$", name)): + elif bool(re.match(r"^model.layers\.\d+\.mlp.(gate_up_proj|dense_h_to_4h)\.qzeros$", name)): # model.layers.layer_id.mlp.gate_up_proj.qzeros + # model.layers.layer_id.mlp.dense_h_to_4h.qzeros intermediate_dim = intermediate_size // (32 // bits) if quant_type in {"awq", "gptq"} else intermediate_size module.mlp.gate_proj.qzeros = tensor[:, : intermediate_dim] module.mlp.up_proj.qzeros = tensor[:, intermediate_dim :] - elif bool(re.match(r"^model.layers\.\d+\.mlp.gate_up_proj\.g_idx$", name)): + elif bool(re.match(r"^model.layers\.\d+\.mlp.(gate_up_proj|dense_h_to_4h)\.g_idx$", name)): # model.layers.layer_id.mlp.gate_up_proj.g_idx + # model.layers.layer_id.mlp.dense_h_to_4h.g_idx module.mlp.gate_proj.g_idx = tensor module.mlp.up_proj.g_idx = tensor else: @@ -295,7 +451,7 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in self.lm_head.weight = self.embedding.weight if self.lm_head.bias is not None: self.lm_head.bias = self.embedding.bias - + # Sort list of layers by layer id self.layers = list(self.layers.values()) self.layers.sort(key=lambda m: m.layer_id) @@ -459,7 +615,7 @@ def dequant_weight(self, module): scale_mat = scales[g_idx] scale_zeros_mat = scale_zeros[g_idx] qdq_weight_T = intweight * scale_mat - scale_zeros_mat.half() - + # Store unpacked result in `qweight` module.qweight = qdq_weight_T.T @@ -484,8 +640,8 @@ def pack_ort_format(self, module, intweight): Pack `scales`, `qzeros`, and `qweight` to ORT format """ if module.bits != 4: - raise NotImplementedError(f"{modue.bits}-bit quantization in ORT is not currently supported by this tool.") - + raise NotImplementedError(f"{module.bits}-bit quantization in ORT is not currently supported by this tool.") + 
intzeros_pt = module.qzeros.T if module.qzeros.dtype == module.scales.dtype else module.qzeros.T.byte() intweight_pt = intweight.byte() block_size = module.group_size @@ -518,8 +674,8 @@ def pack_ort_format(self, module, intweight): class AWQModel(QuantizedModel): - def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers): - super().__init__(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers) + def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type: str): + super().__init__(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type) # Unpack and repack all `QuantizedTensorModule` classes in model for i, layer in enumerate(self.layers): @@ -528,7 +684,9 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in print(f"Unpacking and repacking layer {i}") # Unpack and repack all `QuantizedTensorModule` classes in attention - for name, q_tensors in layer.self_attn.__dict__.items(): + self_attn = getattr(layer, "self_attn", None) or getattr(layer, "self_attention", None) + for name, q_tensors in self_attn.__dict__.items(): + print(name) if isinstance(q_tensors, QuantizedTensorModule) and q_tensors.qweight is not None: self.unpack(q_tensors) self.repack(q_tensors) @@ -585,8 +743,8 @@ def reverse_reorder_tensor(self, tensor, bits): class GPTQModel(QuantizedModel): - def __init__(self, quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers): - super().__init__(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers) + def __init__(self, quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers, model_type: str): + super().__init__(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type) # Unpack and repack all `QuantizedTensorModule` classes in model for i, layer in enumerate(self.layers): @@ -643,17 +801,17 @@ def __init__(self, module): class QuantModel: @staticmethod - def from_pretrained(quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers): + def from_pretrained(quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers, model_type:str): """ Unpack quantized weights in PyTorch models, store them in a standard format, and repack them into ONNX Runtime's format. Also performs any pre-processing and post-processing when unpacking the quantized weights. 
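        Example (illustrative only; the input path is hypothetical and the size
        values shown are those of a ChatGLM3-6B-style 4-bit checkpoint):

            model = QuantModel.from_pretrained(
                "gptq", input_path="/path/to/chatglm3-6b-gptq", bits=4,
                group_size=128, use_g_idx=False, q_size=4096, kv_size=256,
                intermediate_size=13696, num_layers=28,
                model_type="ChatGLMForConditionalGeneration",
            )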
""" if quant_type == "awq": - model = AWQModel(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers) + model = AWQModel(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type) elif quant_type == "gptq": - model = GPTQModel(quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers) + model = GPTQModel(quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers, model_type) else: raise NotImplementedError(f"The {quant_type} quantized model is not currently supported.") - return model \ No newline at end of file + return model From eb10e51904aebac2eb671d54e5dbf01979b7ae04 Mon Sep 17 00:00:00 2001 From: Bowen Bao Date: Mon, 23 Sep 2024 21:48:49 +0000 Subject: [PATCH 08/17] Cleanup --- src/python/py/models/builder.py | 145 ---------------------- src/python/py/models/quantized_model.py | 152 +++--------------------- 2 files changed, 15 insertions(+), 282 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 2cc70d1b6..ec019b324 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -1684,49 +1684,7 @@ def make_gelu(self, layer_id, root_input, activation): return gelu_name - def make_swiglu(self, layer_id, root_input, activation, domain): - # Make nodes for this activation subgraph - # - # root_input (GateProjMatMul) - # / \ - # split/output_0 split/output_1 - # / | | - # ActFunc | | - # \ | | - # Mul | - # \ | - # \ | - # Mul - act_name = f"/model/layers.{layer_id}/mlp/act_fn" - - # Split the input into two parts along the last dimension - # When using swiglu the MLP projects to 2 times the intermediate_size - split_act_name = f"{act_name}/split" - num_splits = 2 - self.make_split(split_act_name, root_input, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size], axis = -1, num_splits=num_splits) - split_act_out_name_0 = f"{split_act_name}/output_0" - split_act_out_name_1 = f"{split_act_name}/output_1" - - act_name = f"{split_act_name}/{activation}" - act_func_output = f"{act_name}/output_0" - self.make_node(activation, inputs=[split_act_out_name_0], outputs=[act_func_output], name=act_name, domain=domain) - self.make_value_info(act_func_output, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) - - mul_act_name_0 = f"{act_name}/Mul_0" - mul_act_inputs_0 = [split_act_out_name_0, act_func_output] - self.make_mul(mul_act_name_0, mul_act_inputs_0, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) - - mul_act_name_1 = f"{act_name}/Mul_1" - mul_0_output = f"{mul_act_name_0}/output_0" - mul_act_inputs_1 = [split_act_out_name_1, mul_0_output] - self.make_mul(mul_act_name_1, mul_act_inputs_1, dtype=self.io_dtype, shape=["batch_size", "sequence_length", self.intermediate_size]) - - return mul_act_name_1 - def make_activation(self, layer_id, root_input): - - # if self.activation in {"swiglu"}: - # output_name = self.make_swiglu(layer_id, root_input, activation="Sigmoid", domain=None) if self.activation in {"silu", "swish", "swiglu"}: output_name = self.make_activation_with_mul(layer_id, root_input, activation="Sigmoid", domain=None) elif self.activation in {"gelu_new", "gelu_fast", "gelu_pytorch_tanh"}: @@ -1817,7 +1775,6 @@ def make_model(self, input_path): kv_size, self.intermediate_size, self.num_layers, - self.model_type, ) else: # Load PyTorch model @@ -2722,114 
+2679,12 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): super().make_attention_unpacked(layer_id, attention, root_input, **kwargs) return super().make_attention(layer_id, attention, root_input, **kwargs) - # def make_attention(self, layer_id, attention, root_input, **kwargs): - # #Designed from SelfAttention function of medeling_chatglm.py - # hidden_size = self.hidden_size - # num_attention_heads = self.num_attn_heads - # kv_channels = self.kv_channels - # projection_size = kv_channels * num_attention_heads - # hidden_size_per_attention_head = projection_size // num_attention_heads - # multi_query_attention = self.attention_attrs["op_type"] == "GroupQueryAttention" - # multi_query_group_num = self.num_kv_heads if multi_query_attention else num_attention_heads - - # # Reshape the QKV weight - # qkv_weight = attention.query_key_value.weight.T - - # if multi_query_attention: - # q_weight, k_weight, v_weight = qkv_weight.split( - # [ - # num_attention_heads * hidden_size_per_attention_head, - # multi_query_group_num * hidden_size_per_attention_head, - # multi_query_group_num * hidden_size_per_attention_head, - # ], - # dim=-1 - # ) - # else: - # q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=-1) - - # # Reshape the QKV bias if it exists - # if attention.query_key_value.bias is not None: - # qkv_bias = attention.query_key_value.bias - # if multi_query_attention: - # q_bias, k_bias, v_bias = qkv_bias.split( - # [ - # num_attention_heads * hidden_size_per_attention_head, - # multi_query_group_num * hidden_size_per_attention_head, - # multi_query_group_num * hidden_size_per_attention_head, - # ] - # ) - # else: - # q_bias, k_bias, v_bias = qkv_bias.chunk(3) - # else: - # q_bias = k_bias = v_bias = None - - # # Create separate Q, K, V projections - # attention.q_proj = torch.nn.Linear(hidden_size, num_attention_heads * hidden_size_per_attention_head, bias=q_bias is not None) - # attention.q_proj.weight = torch.nn.Parameter(q_weight.T) - # if q_bias is not None: - # attention.q_proj.bias = torch.nn.Parameter(q_bias) - - # kv_size = multi_query_group_num * hidden_size_per_attention_head - - # attention.k_proj = torch.nn.Linear(hidden_size, kv_size, bias=k_bias is not None) - # attention.k_proj.weight = torch.nn.Parameter(k_weight.T) - # if k_bias is not None: - # attention.k_proj.bias = torch.nn.Parameter(k_bias) - - # attention.v_proj = torch.nn.Linear(hidden_size, kv_size, bias=v_bias is not None) - # attention.v_proj.weight = torch.nn.Parameter(v_weight.T) - # if v_bias is not None: - # attention.v_proj.bias = torch.nn.Parameter(v_bias) - - # # Remove the original combined QKV projection - # del attention.query_key_value - # del qkv_weight - # del qkv_bias - # # Add dummy rotary_emb attribute - # attention.rotary_emb = type("RotaryEmbedding", (object,), {'content':{}})() - - # super().make_attention(layer_id, attention, root_input, **kwargs) def make_mlp_proj(self, layer_id, mlp, root_input): if self.quant_type is None: super().make_mlp_unpacked(layer_id, mlp, root_input) super().make_mlp_proj(layer_id, mlp, root_input) - # def make_mlp_proj(self, layer_id, mlp, root_input): - # # Make nodes for the MLP subgraph - # # - # # root_input - # # | - # # dense_h_to_4h #Misnomer, it is increased to 2h instead of 4h, therefore in swiglu the intermediate size is same - # # | - # # Activation - # # | - # # dense_4h_to_h - # # - # # Make MatMul nodes - - # up_basename = f"/model/layers.{layer_id}/mlp/dense_h_to_4h/MatMul" - # up_name = self.make_matmul(mlp.dense_h_to_4h, 
up_basename, root_input) - # # Make activation node(s) - # act_fn_name = self.make_activation(layer_id, root_input=f"{up_name}/output_0") - - # down_basename = f"/model/layers.{layer_id}/mlp/dense_4h_to_h/MatMul" - # down_name = self.make_matmul(mlp.dense_4h_to_h, down_basename, f"{act_fn_name}/output_0") - # # Assign output 0 of previous MatMul as skip input to next SkipLayerNorm - # self.layernorm_attrs["skip_input"] = f"{down_name}/output_0" - - # def make_layer(self, layer_id, layer): - # # Each GLM encoder is defined as follows all LayerNorms are RMSNorms (Residual and Norm order same as LLama Model): - # self.make_layernorm(layer_id, layer.input_layernorm, skip=not self.layernorm_attrs["first_layernorm"], simple=self.layernorm_attrs["simple"], location="input") - # self.make_attention(layer_id, layer.self_attention, root_input=self.layernorm_attrs["output_0"]) - # self.make_layernorm(layer_id, layer.post_attention_layernorm, skip=True, simple=self.layernorm_attrs["simple"], location="post_attention") - # self.make_mlp(layer_id, layer.mlp, root_input=self.layernorm_attrs["output_0"]) - - # self.layernorm_attrs["first_layernorm"] = False - # if layer_id == self.num_layers - 1: - # # Norm after last decoder layer of model (last layer --> norm) - # self.layernorm_attrs["last_layernorm"] = True - def check_extra_options(kv_pairs): if "use_8bits_moe" in kv_pairs: diff --git a/src/python/py/models/quantized_model.py b/src/python/py/models/quantized_model.py index f2717a3ee..108212952 100644 --- a/src/python/py/models/quantized_model.py +++ b/src/python/py/models/quantized_model.py @@ -32,16 +32,6 @@ def __init__(self, bits, group_size): self.bits = bits self.group_size = group_size - def set_properties(self, quant_type: str): - if quant_type == "awq": - self.out_features = self.scales.shape[1] - self.in_features = self.qweight.shape[0] - elif quant_type == "gptq": - self.out_features = self.qweight.shape[1] - self.in_features = self.q_proj.g_idx.shape[0] - else: - raise NotImplementedError(f"The {quant_type} quantization method is not recognized.") - def __str__(self): qweight = f"qweight = {self.qweight.shape}, {self.qweight}\n" scales = f"scales = {self.scales.shape}, {self.scales}\n" @@ -70,21 +60,6 @@ def __init__(self, bits, group_size): self.o_proj = QuantizedTensorModule(bits, group_size) self.rotary_emb = TensorModule() - def set_properties(self, quant_type: str): - self.q_proj.set_properties(quant_type) - self.k_proj.set_properties(quant_type) - self.v_proj.set_properties(quant_type) - self.o_proj.set_properties(quant_type) - -class QuantizedChatglm3Attention: - def __init__(self, bits, group_size): - self.query_key_value = QuantizedTensorModule(bits, group_size) - self.dense = QuantizedTensorModule(bits, group_size) - self.rotary_emb = TensorModule() - - def set_properties(self, quant_type: str): - self.query_key_value.set_properties(quant_type) - self.dense.set_properties(quant_type) class QuantizedMLP: def __init__(self, bits, group_size): @@ -94,21 +69,6 @@ def __init__(self, bits, group_size): self.fc1 = QuantizedTensorModule(bits, group_size) self.fc2 = QuantizedTensorModule(bits, group_size) - def set_properties(self, quant_type: str): - self.gate_proj.set_properties(quant_type) - self.up_proj.set_properties(quant_type) - self.down_proj.set_properties(quant_type) - self.fc1.set_properties(quant_type) - self.fc2.set_properties(quant_type) - -class QuantizedChatglm3MLP: - def __init__(self, bits, group_size): - self.dense_4h_to_h = QuantizedTensorModule(bits, group_size) - 
self.dense_h_to_4h = QuantizedTensorModule(bits, group_size) - - def set_properties(self, quant_type: str): - self.dense_4h_to_h.set_properties(quant_type) - self.dense_h_to_4h.set_properties(quant_type) class QuantizedDecoderLayer: def __init__(self, layer_id, bits, group_size): @@ -121,27 +81,9 @@ def __init__(self, layer_id, bits, group_size): def is_empty(self): return self.input_layernorm.weight is None - def set_properties(self, quant_type: str): - self.self_attn.set_properties(quant_type) - self.mlp.set_properties(quant_type) - -class QuantizedChatglm3EncoderLayer: - def __init__(self, layer_id, bits, group_size): - self.layer_id = layer_id - self.input_layernorm = TensorModule() - self.self_attention = QuantizedChatglm3Attention(bits, group_size) - self.post_attention_layernorm = TensorModule() - self.mlp = QuantizedChatglm3MLP(bits, group_size) - - def is_empty(self): - return self.input_layernorm.weight is None - - def set_properties(self, quant_type: str): - self.self_attention.set_properties(quant_type) - self.mlp.set_properties(quant_type) class QuantizedModel: - def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type: str): + def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers): self.quant_type = quant_type self.embedding = TensorModule() self.final_norm = TensorModule() @@ -149,22 +91,16 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in self.layers = {} self.num_layers = num_layers - q_layer_cls = QuantizedDecoderLayer - # if model_type == "ChatGLMForConditionalGeneration": - # q_layer_cls = QuantizedChatglm3EncoderLayer - # print(q_layer_cls) - layer_id = 0 for weight_file in os.listdir(input_path): if weight_file.endswith(".safetensors"): - module = self.layers.setdefault(layer_id, q_layer_cls(layer_id, bits, group_size)) + module = self.layers.setdefault( + layer_id, QuantizedDecoderLayer(layer_id, bits, group_size) + ) weights = load_file(os.path.join(input_path, weight_file)) - for name, _ in weights.items(): - print(name) # Map weights to modules for name, tensor in weights.items(): - print(name) if tensor.dtype == torch.bfloat16: # Cast bfloat16 to float32 since NumPy does not support bfloat16 tensor = tensor.to(torch.float32) @@ -190,7 +126,10 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in if curr_layer_id != layer_id: # Switch layer module used layer_id = curr_layer_id - module = self.layers.setdefault(layer_id, q_layer_cls(layer_id, bits, group_size)) + module = self.layers.setdefault( + layer_id, + QuantizedDecoderLayer(layer_id, bits, group_size), + ) # Map weights and biases of norm, attention, and feed-forward network # Graph order is input_layernorm --> q_proj/k_proj/v_proj --> o_proj --> post_attention_layernorm --> gate_proj/up_proj --> down_proj @@ -249,21 +188,6 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in elif bool(re.match(r"^model.layers\.\d+\.self_attn.v_proj\.bias$", name)): # model.layers.layer_id.self_attn.v_proj.bias module.self_attn.v_proj.bias = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.qweight$", name)): - # # model.layers.layer_id.self_attention.query_key_value.qweight - # module.self_attention.query_key_value.qweight = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.scales$", name)): - # # 
model.layers.layer_id.self_attention.query_key_value.scales - # module.self_attention.query_key_value.scales = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.qzeros$", name)): - # # model.layers.layer_id.self_attention.query_key_value.qzeros - # module.self_attention.query_key_value.qzeros = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.g_idx$", name)): - # # model.layers.layer_id.self_attention.query_key_value.g_idx - # module.self_attention.query_key_value.g_idx = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention.query_key_value\.bias$", name)): - # # model.layers.layer_id.self_attention.query_key_value.bias - # module.self_attention.query_key_value.bias = tensor elif bool(re.match(r"^model.layers\.\d+\.(self_attn.o_proj|self_attention.dense)\.qweight$", name)): # model.layers.layer_id.self_attn.o_proj.qweight # model.layers.layer_id.self_attention.dense.qweight @@ -284,57 +208,12 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in # model.layers.layer_id.self_attn.o_proj.bias # model.layers.layer_id.self_attention.dense.bias module.self_attn.o_proj.bias = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.qweight$", name)): - # # model.layers.layer_id.self_attention.dense.qweight - # module.self_attention.dense.qweight = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.scales$", name)): - # # model.layers.layer_id.self_attention.dense.scales - # module.self_attention.dense.scales = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.qzeros$", name)): - # # model.layers.layer_id.self_attention.dense.qzeros - # module.self_attention.dense.qzeros = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.g_idx$", name)): - # # model.layers.layer_id.self_attention.dense.g_idx - # module.self_attention.dense.g_idx = tensor - # elif bool(re.match(r"^model.layers\.\d+\.self_attention\.dense\.bias$", name)): - # # model.layers.layer_id.self_attention.dense.bias - # module.self_attention.dense.bias = tensor elif bool(re.match(r"^model.layers\.\d+\.post_attention_layernorm\.weight$", name)): # model.layers.layer_id.post_attention_layernorm.weight module.post_attention_layernorm.weight = tensor elif bool(re.match(r"^model.layers\.\d+\.post_attention_layernorm\.bias$", name)): # model.layers.layer_id.post_attention_layernorm.bias module.post_attention_layernorm.bias = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.qweight$", name)): - # # model.layers.layer_id.mlp.dense_4h_to_h.qweight - # module.mlp.dense_4h_to_h.qweight = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.scales$", name)): - # # model.layers.layer_id.mlp.dense_4h_to_h.scales - # module.mlp.dense_4h_to_h.scales = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.qzeros$", name)): - # # model.layers.layer_id.mlp.dense_4h_to_h.qzeros - # module.mlp.dense_4h_to_h.qzeros = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.g_idx$", name)): - # # model.layers.layer_id.mlp.dense_4h_to_h.g_idx - # module.mlp.dense_4h_to_h.g_idx = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_4h_to_h\.bias$", name)): - # # model.layers.layer_id.mlp.dense_4h_to_h.bias - # module.mlp.dense_4h_to_h.bias = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.qweight$", name)): - # # 
model.layers.layer_id.mlp.dense_h_to_4h.qweight - # module.mlp.dense_h_to_4h.qweight = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.scales$", name)): - # # model.layers.layer_id.mlp.dense_h_to_4h.scales - # module.mlp.dense_h_to_4h.scales = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.qzeros$", name)): - # # model.layers.layer_id.mlp.dense_h_to_4h.qzeros - # module.mlp.dense_h_to_4h.qzeros = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.g_idx$", name)): - # # model.layers.layer_id.mlp.dense_h_to_4h.g_idx - # module.mlp.dense_h_to_4h.g_idx = tensor - # elif bool(re.match(r"^model.layers\.\d+\.mlp\.dense_h_to_4h\.bias$", name)): - # # model.layers.layer_id.mlp.dense_h_to_4h.bias - # module.mlp.dense_h_to_4h.bias = tensor elif bool(re.match(r"^model.layers\.\d+\.mlp.gate_proj\.qweight$", name)): # model.layers.layer_id.mlp.gate_proj.qweight module.mlp.gate_proj.qweight = tensor @@ -674,8 +553,8 @@ def pack_ort_format(self, module, intweight): class AWQModel(QuantizedModel): - def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type: str): - super().__init__(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type) + def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers): + super().__init__(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers) # Unpack and repack all `QuantizedTensorModule` classes in model for i, layer in enumerate(self.layers): @@ -686,7 +565,6 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in # Unpack and repack all `QuantizedTensorModule` classes in attention self_attn = getattr(layer, "self_attn", None) or getattr(layer, "self_attention", None) for name, q_tensors in self_attn.__dict__.items(): - print(name) if isinstance(q_tensors, QuantizedTensorModule) and q_tensors.qweight is not None: self.unpack(q_tensors) self.repack(q_tensors) @@ -743,8 +621,8 @@ def reverse_reorder_tensor(self, tensor, bits): class GPTQModel(QuantizedModel): - def __init__(self, quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers, model_type: str): - super().__init__(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type) + def __init__(self, quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers): + super().__init__(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers) # Unpack and repack all `QuantizedTensorModule` classes in model for i, layer in enumerate(self.layers): @@ -801,16 +679,16 @@ def __init__(self, module): class QuantModel: @staticmethod - def from_pretrained(quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers, model_type:str): + def from_pretrained(quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers): """ Unpack quantized weights in PyTorch models, store them in a standard format, and repack them into ONNX Runtime's format. Also performs any pre-processing and post-processing when unpacking the quantized weights. 
""" if quant_type == "awq": - model = AWQModel(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers, model_type) + model = AWQModel(quant_type, input_path, bits, group_size, q_size, kv_size, intermediate_size, num_layers) elif quant_type == "gptq": - model = GPTQModel(quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers, model_type) + model = GPTQModel(quant_type, input_path, bits, group_size, use_g_idx, q_size, kv_size, intermediate_size, num_layers) else: raise NotImplementedError(f"The {quant_type} quantized model is not currently supported.") From 938535cc3b4b6164bd884fe3483855c4ffaf94e1 Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Tue, 24 Sep 2024 00:47:10 +0000 Subject: [PATCH 09/17] minor_updates --- src/python/py/models/builder.py | 6 ++++++ src/python/py/models/quantized_model.py | 1 + 2 files changed, 7 insertions(+) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index ec019b324..448c2308a 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -1,5 +1,6 @@ # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. +# Modifications Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved # Licensed under the MIT License. See License.txt in the project root for # license information. # -------------------------------------------------------------------------- @@ -2677,6 +2678,8 @@ def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): def make_attention(self, layer_id, attention, root_input, **kwargs): if self.quant_type is None: super().make_attention_unpacked(layer_id, attention, root_input, **kwargs) + # Add dummy rotary_emb attribute + attention.rotary_emb = type("RotaryEmbedding", (object,), {'content':{}})() return super().make_attention(layer_id, attention, root_input, **kwargs) @@ -2685,6 +2688,9 @@ def make_mlp_proj(self, layer_id, mlp, root_input): super().make_mlp_unpacked(layer_id, mlp, root_input) super().make_mlp_proj(layer_id, mlp, root_input) + def make_layer(self, layer_id, layer): + layer.self_attn = layer.self_attn if hasattr(layer, 'self_attn') else layer.self_attention + super().make_layer(layer_id, layer) def check_extra_options(kv_pairs): if "use_8bits_moe" in kv_pairs: diff --git a/src/python/py/models/quantized_model.py b/src/python/py/models/quantized_model.py index 108212952..9afe5a6eb 100644 --- a/src/python/py/models/quantized_model.py +++ b/src/python/py/models/quantized_model.py @@ -1,5 +1,6 @@ # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. +# Modifications Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved # Licensed under the MIT License. See License.txt in the project root for # license information. 
# -------------------------------------------------------------------------- From 5e39727d3e874cc03586d8a6d4a5937a542c36f9 Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Tue, 24 Sep 2024 00:49:07 +0000 Subject: [PATCH 10/17] comment --- src/python/py/models/builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 448c2308a..30710d374 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -2761,7 +2761,7 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid onnx_model = Phi3VModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) elif config.architectures[0] == "Qwen2ForCausalLM": onnx_model = QwenModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) - elif config.architectures[0] == "ChatGLMForConditionalGeneration": + elif config.architectures[0] == "ChatGLMForConditionalGeneration" or config.architectures[0] == "ChatGLMModel": config.hidden_act = "swiglu" onnx_model = ChatGLMModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) else: From cfac49c5100639945ca972f19af2badde6df8778 Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Tue, 24 Sep 2024 01:39:16 +0000 Subject: [PATCH 11/17] correct_usernames --- src/python/py/models/builder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 30710d374..c84ce8d88 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -2762,6 +2762,7 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid elif config.architectures[0] == "Qwen2ForCausalLM": onnx_model = QwenModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) elif config.architectures[0] == "ChatGLMForConditionalGeneration" or config.architectures[0] == "ChatGLMModel": + # Quantized ChatGLM model has ChatGLMForConditionalGeneration as architecture whereas HF model as the latter config.hidden_act = "swiglu" onnx_model = ChatGLMModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) else: From e70dcfb6dbcf4cc039ce759efd5097133de56aac Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Tue, 24 Sep 2024 21:58:02 +0000 Subject: [PATCH 12/17] cleanup --- src/python/py/models/builder.py | 4 ++-- src/python/py/models/quantized_model.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index c84ce8d88..89163d544 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -1,9 +1,10 @@ # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. -# Modifications Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved # Licensed under the MIT License. See License.txt in the project root for # license information. # -------------------------------------------------------------------------- +# Modifications Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved + """ Run this script to create the desired ONNX model. 
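    Example (an assumed invocation for a ChatGLM3 checkpoint; the flag names
    follow the builder's usual command line and the model id and paths are
    placeholders, not part of this change):

        python3 builder.py -m THUDM/chatglm3-6b -o ./chatglm3-onnx -p int4 -e cpu -c ./cache_dir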
""" @@ -32,7 +33,6 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.num_kv_heads = config.multi_query_group_num if hasattr(config, "multi_query_group_num") else self.num_kv_heads self.kv_channels = config.kv_channels if hasattr(config, "kv_channels") else self.num_kv_heads self.multi_query_attention = config.multi_query_attention if hasattr(config, "multi_query_attention") else False - # self.multi_query_group_num = config.multi_query_group_num if hasattr(config, "multi_query_group_num") else 1 # group_num as 1 is vanilla Multi-query attention https://arxiv.org/pdf/2305.13245 self.num_attn_heads = config.num_attention_heads self.head_size = config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads self.num_layers = int(extra_options["num_hidden_layers"]) if "num_hidden_layers" in extra_options else config.num_hidden_layers if hasattr(config, "num_hidden_layers") else config.num_layers diff --git a/src/python/py/models/quantized_model.py b/src/python/py/models/quantized_model.py index 9afe5a6eb..1ee85322e 100644 --- a/src/python/py/models/quantized_model.py +++ b/src/python/py/models/quantized_model.py @@ -1,9 +1,9 @@ # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Modifications Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved -# Licensed under the MIT License. See License.txt in the project root for # license information. # -------------------------------------------------------------------------- +# Licensed under the MIT License. See License.txt in the project root for """ A set of Python classes to unpack the quantized weights and repack them in ONNX Runtime's standard format. From 44a5178b797b7a89ee853128170fdd5e39b0e142 Mon Sep 17 00:00:00 2001 From: amd-sudo-sh Date: Tue, 24 Sep 2024 22:35:31 +0000 Subject: [PATCH 13/17] fixed_license_headers --- src/python/py/models/builder.py | 1 - src/python/py/models/quantized_model.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 89163d544..89f89897a 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -4,7 +4,6 @@ # license information. # -------------------------------------------------------------------------- # Modifications Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved - """ Run this script to create the desired ONNX model. """ diff --git a/src/python/py/models/quantized_model.py b/src/python/py/models/quantized_model.py index 1ee85322e..2d15cb5fa 100644 --- a/src/python/py/models/quantized_model.py +++ b/src/python/py/models/quantized_model.py @@ -1,9 +1,9 @@ # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. -# Modifications Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved +# Licensed under the MIT License. See License.txt in the project root for # license information. # -------------------------------------------------------------------------- -# Licensed under the MIT License. See License.txt in the project root for +# Modifications Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved """ A set of Python classes to unpack the quantized weights and repack them in ONNX Runtime's standard format. 
From d8eb982686cf2ff285ad94de24a19434663fec38 Mon Sep 17 00:00:00 2001
From: amd-sudo-sh
Date: Tue, 24 Sep 2024 22:51:31 +0000
Subject: [PATCH 14/17] rm_unused_make_split

---
 src/python/py/models/builder.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index 89f89897a..b31f9425c 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -530,13 +530,6 @@ def make_gather(self, name, inputs, axis):
         self.make_node("Gather", inputs=inputs, outputs=[output], name=name, axis=axis)
         self.make_value_info(output, TensorProto.INT64, shape=[])

-    def make_split(self, name, inputs, dtype, shape, axis, num_splits):
-        # Splits the input tensor into num_splits based on the axis and shape
-        outputs = [f"{name}/output_{i}" for i in range(num_splits)]
-        self.make_node("Split", inputs=[inputs], outputs=outputs, name=name, axis=axis)
-        for output in outputs:
-            self.make_value_info(output, dtype, shape=shape)
-
     def make_reshape(self, name, inputs, dtype, shape):
         output = f"{name}/output_0"
         self.make_node("Reshape", inputs=inputs, outputs=[output], name=name)

From e116fa4ee47f041f6590276ffee78f518d221747 Mon Sep 17 00:00:00 2001
From: amd-sudo-sh
Date: Thu, 26 Sep 2024 17:17:36 +0000
Subject: [PATCH 15/17] refactor_config

---
 src/python/py/models/builder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index b31f9425c..02c0799a6 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -24,7 +24,7 @@ class Model:
     def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
         self.context_length = config.max_position_embeddings if hasattr(config, "max_position_embeddings") else config.seq_length
-        self.original_context_length = config.original_max_position_embeddings if hasattr(config, "original_max_position_embeddings") else config.rope_scaling["original_max_position_embeddings"] if hasattr(config, "rope_scaling") and hasattr(config.rope_scaling, "original_max_position_embeddings") else config.max_position_embeddings if hasattr(config, "max_position_embeddings") else config.seq_length
+        self.original_context_length = config.original_max_position_embeddings if hasattr(config, "original_max_position_embeddings") else config.rope_scaling["original_max_position_embeddings"] if hasattr(config, "rope_scaling") and hasattr(config.rope_scaling, "original_max_position_embeddings") else self.context_length
         self.window_size = config.sliding_window if hasattr(config, "sliding_window") else -1 # default is -1 in GroupQueryAttention kernel
         self.intermediate_size = config.intermediate_size if hasattr(config, "intermediate_size") else config.ffn_hidden_size
         self.hidden_size = config.hidden_size

From f76867300d35a9e3ee66a2779b965700ea256e73 Mon Sep 17 00:00:00 2001
From: amd-sudo-sh
Date: Thu, 26 Sep 2024 18:30:26 +0000
Subject: [PATCH 16/17] refactor_context_length_config

---
 src/python/py/models/builder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index 02c0799a6..0b6272bfc 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -23,7 +23,7 @@ class Model:
     def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
-        self.context_length = config.max_position_embeddings if hasattr(config, "max_position_embeddings") else config.seq_length
+        self.context_length = config.seq_length if hasattr(config, "seq_length") else config.max_position_embeddings
         self.original_context_length = config.original_max_position_embeddings if hasattr(config, "original_max_position_embeddings") else config.rope_scaling["original_max_position_embeddings"] if hasattr(config, "rope_scaling") and hasattr(config.rope_scaling, "original_max_position_embeddings") else self.context_length
         self.window_size = config.sliding_window if hasattr(config, "sliding_window") else -1 # default is -1 in GroupQueryAttention kernel
         self.intermediate_size = config.intermediate_size if hasattr(config, "intermediate_size") else config.ffn_hidden_size
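
PATCH 15 and PATCH 16 together make the context-length resolution order explicit: prefer ChatGLM's seq_length, fall back to max_position_embeddings, and let original_context_length default to whatever context_length resolved to. A small sketch of that fallback chain follows; resolve_context_length and the SimpleNamespace objects (with illustrative values) stand in for real Hugging Face configs.

# Hedged sketch of the context-length fallback from PATCH 15/16.
from types import SimpleNamespace

def resolve_context_length(config):
    # ChatGLM configs expose seq_length; most other configs expose max_position_embeddings
    return config.seq_length if hasattr(config, "seq_length") else config.max_position_embeddings

chatglm_like = SimpleNamespace(seq_length=8192)
llama_like = SimpleNamespace(max_position_embeddings=4096)
print(resolve_context_length(chatglm_like))  # 8192
print(resolve_context_length(llama_like))    # 4096
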
{self.layer_id}") self.make_layer(self.layer_id, module) self.layer_id += 1 - elif module.__class__.__name__.endswith("GLMBlock") and self.layer_id < self.num_layers: - print(f"Reading decoder layer {self.layer_id}") - self.make_layer(self.layer_id, module) - self.layer_id += 1 - elif self.layer_id == self.num_layers and self.has_final_norm(module, model): # SkipLayerNorm after last decoder layer (MatMul --> SkipLayerNorm) print("Reading final norm") self.make_layernorm(self.layer_id, module, skip=True, simple=self.layernorm_attrs["simple"], location="final_norm") + elif (isinstance(module, torch.nn.Linear) and module.out_features == self.vocab_size) or (hasattr(model, "lm_head") and module == model.lm_head): # Checks (Hugging Face logic) or (GGUF logic) if not self.exclude_lm_head: @@ -1814,12 +1807,13 @@ def make_model(self, input_path): del model def has_final_norm(self, module, model): - # Hugging Face names - hf_norm = hasattr(model, "model") and hasattr(model.model, "norm") and module == model.model.norm - hf_final_layernorm = hasattr(model, "model") and hasattr(model.model, "final_layernorm") and module == model.model.final_layernorm - # GGUF names - gguf_final_norm = hasattr(model, "final_norm") and module == model.final_norm - return hf_norm or hf_final_layernorm or gguf_final_norm + # Hugging Face names + hf_norm = hasattr(model, "model") and hasattr(model.model, "norm") and module == model.model.norm + hf_final_layernorm = hasattr(model, "model") and hasattr(model.model, "final_layernorm") and module == model.model.final_layernorm + hf_transformer_final_layernorm = hasattr(model, "transformer") and hasattr(model.transformer, "encoder") and hasattr(model.transformer.encoder, "final_layernorm") and module == model.transformer.encoder.final_layernorm + # GGUF names + gguf_final_norm = hasattr(model, "final_norm") and module == model.final_norm + return hf_norm or hf_final_layernorm or hf_transformer_final_layernorm or gguf_final_norm def make_preprocessing_nodes(self): self.make_attention_mask_reformatting() @@ -2645,24 +2639,12 @@ def make_layer(self, layer_id, layer): class ChatGLMModel(Model): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) - # self.input_shapes["position_ids"] = [1] # Note: This is optional and only needed if you want position_ids to be an int instead of a 2D tensor - self.layernorm_attrs["simple"] = True # RMSNorm in ChatGLM self.rotemb_attrs["num_heads"] = self.num_attn_heads self.rotemb_attrs["partial_rotary_factor"] = 0.5 # Line 755 of modeling_chatglm.py check self.rotary_pos_emb declaration self.rotemb_attrs["rotary_embedding_dim"] = int(self.head_size * self.rotemb_attrs["partial_rotary_factor"]) self.rotemb_attrs["interleaved"] = 1 - self.mlp_attrs["use_proj"], self.mlp_attrs["use_fc"] = True, False self.attention_attrs["use_rotemb_in_attn"] = True self.attention_attrs["use_packed_matmul"] = True - self.attention_attrs["op_type"] = "GroupQueryAttention" if self.multi_query_attention else self.attention_attrs["op_type"] - - def has_final_norm(self, module, model): - # Hugging Face names - hf_norm = hasattr(model, "model") and hasattr(model.model, "norm") and module == model.model.norm - hf_final_layernorm = hasattr(model, "transformer") and hasattr(model.transformer, "encoder") and hasattr(model.transformer.encoder, "final_layernorm") and module == model.transformer.encoder.final_layernorm - # GGUF names - gguf_final_norm = hasattr(model, 
"final_norm") and module == model.final_norm - return hf_norm or hf_final_layernorm or gguf_final_norm def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): super().make_rotary_embedding(rotemb, name, root_input, num_heads=self.rotemb_attrs["num_heads"], rotary_embedding_dim=self.rotemb_attrs["rotary_embedding_dim"], **kwargs) @@ -2758,9 +2740,7 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid config.hidden_act = "swiglu" onnx_model = ChatGLMModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options) else: - raise NotImplementedError( - f"The {hf_name} model is not currently supported. Got {config}" - ) + raise NotImplementedError(f"The {hf_name} model is not currently supported.") # Make ONNX model onnx_model.make_model(input_path)