diff --git a/src/python/py/models/README.md b/src/python/py/models/README.md
index 60d5ecf3e4..3af38c3894 100644
--- a/src/python/py/models/README.md
+++ b/src/python/py/models/README.md
@@ -20,6 +20,7 @@ This folder contains the model builder for quickly creating optimized and quanti
   - [Exclude Language Modeling Head](#exclude-language-modeling-head)
   - [Include Last Hidden States Output](#include-last-hidden-states-output)
   - [Enable Shared Embeddings](#enable-shared-embeddings)
+  - [Disable QKV Projections Fusion](#disable-qkv-projections-fusion)
   - [Enable CUDA Graph](#enable-cuda-graph)
   - [Use 8 Bits Quantization in QMoE](#use-8-bits-quantization-in-qmoe)
   - [Use QDQ Pattern for Quantization](#use-qdq-pattern-for-quantization)
@@ -253,6 +254,18 @@ python3 -m onnxruntime_genai.models.builder -m model_name -o path_to_output_fold
 python3 builder.py -m model_name -o path_to_output_folder -p fp16 -e cuda --extra_options shared_embeddings=true
 ```

+#### Disable QKV Projections Fusion
+
+This scenario is for when you want to keep the Q/K/V projections in the attention layers as separate MatMuls instead of fusing them into a single packed MatMul.
+
+```
+# From wheel:
+python3 -m onnxruntime_genai.models.builder -i path_to_local_folder_on_disk -o path_to_output_folder -p precision -e execution_provider -c cache_dir_to_store_temp_files --extra_options disable_qkv_fusion=true
+
+# From source:
+python3 builder.py -i path_to_local_folder_on_disk -o path_to_output_folder -p precision -e execution_provider -c cache_dir_to_store_temp_files --extra_options disable_qkv_fusion=true
+```
+
 #### Enable CUDA Graph

 This scenario is for when you want to enable CUDA graph for your ONNX model.
diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index 02635e4060..9cf463af91 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -61,6 +61,7 @@ def check_extra_options(kv_pairs, execution_provider):
         "use_cuda_bf16",
         "shared_embeddings",
         "hf_remote",
+        "disable_qkv_fusion",
     ]
     for key in bools:
         if key in kv_pairs:
diff --git a/src/python/py/models/builders/base.py b/src/python/py/models/builders/base.py
index 0bb545e501..205cdec652 100644
--- a/src/python/py/models/builders/base.py
+++ b/src/python/py/models/builders/base.py
@@ -490,11 +490,14 @@ def make_attention_init(self):

         # Some EPs don't support packed Q/K/V for GQA yet
         # Packed MatMul with LoRA/QLoRA is not currently supported
+        # use_packed_matmul can be overridden by the upstream quantization choice
+        # (e.g., when q_proj, k_proj, and v_proj have different quantization settings)
         self.attention_attrs["use_packed_matmul"] = (
             self.ep not in ["dml"]
             and not self.matmul_attrs["use_lora"]
             and not self.attention_attrs["q_norm"]
             and not self.attention_attrs["k_norm"]
+            and not self.extra_options.get("disable_qkv_fusion", False)
         )

         # Some EPs don't support fusing rotary embeddings inside GQA yet
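For context on what the fusion means, here is a minimal NumPy sketch; the shapes and variable names are illustrative assumptions, not the builder's actual tensors. Packing concatenates the Q/K/V weights so a single MatMul produces all three projections, while `disable_qkv_fusion=true` keeps three separate MatMuls.

```
# Minimal sketch of packed vs. separate Q/K/V projections (illustrative shapes only).
import numpy as np

hidden, q_dim, kv_dim = 64, 64, 16                      # assumed sizes, e.g. GQA with fewer KV heads
x = np.random.rand(2, hidden).astype(np.float32)        # (batch, hidden)
Wq = np.random.rand(hidden, q_dim).astype(np.float32)
Wk = np.random.rand(hidden, kv_dim).astype(np.float32)
Wv = np.random.rand(hidden, kv_dim).astype(np.float32)

# Fused path: one packed MatMul over the concatenated weights, then a split.
W_packed = np.concatenate([Wq, Wk, Wv], axis=1)         # (hidden, q_dim + 2 * kv_dim)
q, k, v = np.split(x @ W_packed, [q_dim, q_dim + kv_dim], axis=1)

# Unfused path (disable_qkv_fusion=true): three separate MatMuls, so each
# projection can carry its own quantization settings.
q2, k2, v2 = x @ Wq, x @ Wk, x @ Wv

assert np.allclose(q, q2) and np.allclose(k, k2) and np.allclose(v, v2)
```

Numerically the two paths match; the difference is in the exported graph. A packed MatMul uses one shared weight tensor, so it cannot represent per-projection quantization settings, which is why the builder skips the fusion when q_proj, k_proj, and v_proj are quantized differently or when `disable_qkv_fusion=true` is passed.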