70 changes: 38 additions & 32 deletions onnxruntime/python/tools/transformers/fusion_attention_clip.py
@@ -269,42 +269,48 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
         attention_last_node = reshape_qkv
 
         add_qk = ""
+        causal_mask_nodes_1 = None
+        causal_mask_nodes_2 = None
         if add_mask is not None:
             # 4D Add after Q x K'
             add_qk_nodes = self.model.match_parent_path(
                 add_mask,
                 [
                     "Where",
                     "Sub",
                     "Cast",
                     "Expand",
                     "Unsqueeze",
                     "Unsqueeze",
                     "Reshape",
                     "Reshape",
                     "Cast",
                 ],
                 [1, 2, 1, 0, 0, 0, 0, 0, 0],
             )
             if add_qk_nodes is not None:
                 add_qk = add_mask.input[1]
             else:
-                # Here we do not match the whole subgraph since it is very complex. Instead, we just check whether a key path
-                # of computing causal mask.
-                causal_mask_nodes_1 = self.model.match_parent_path(
-                    add_mask,
-                    ["Concat", "Expand", "Unsqueeze", "Unsqueeze", "Where", "Less"],
-                    [causal_mask_input_index, 0, 0, 0, 0, 0],
-                )
-                # If the model is exported with batch_size == 1, there is no Concat node
-                causal_mask_nodes_2 = self.model.match_parent_path(
-                    add_mask,
-                    ["Expand", "Unsqueeze", "Unsqueeze", "Where", "Less"],
-                    [causal_mask_input_index, 0, 0, 0, 0],
-                )
-
-                if causal_mask_nodes_1 is None and causal_mask_nodes_2 is None:
-                    logger.debug("fuse_attention: failed to match causal mask subgraph")
-                    return
+                if add_mask.input[1] == "attention_mask":
+                    add_qk = add_mask.input[1]
+                else:
+                    # Here we do not match the whole subgraph since it is very complex. Instead, we just check whether a key path
+                    # of computing causal mask.
+                    causal_mask_nodes_1 = self.model.match_parent_path(
+                        add_mask,
+                        ["Concat", "Expand", "Unsqueeze", "Unsqueeze", "Where", "Less"],
+                        [causal_mask_input_index, 0, 0, 0, 0, 0],
+                    )
+                    # If the model is exported with batch_size == 1, there is no Concat node
+                    causal_mask_nodes_2 = self.model.match_parent_path(
+                        add_mask,
+                        ["Expand", "Unsqueeze", "Unsqueeze", "Where", "Less"],
+                        [causal_mask_input_index, 0, 0, 0, 0],
+                    )
+
+                    if causal_mask_nodes_1 is None and causal_mask_nodes_2 is None:
+                        logger.debug("fuse_attention: failed to match causal mask subgraph")
+                        return
 
         new_node = self.create_attention_node(
             mask_index=None,
@@ -320,7 +326,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
             output=attention_last_node.output[0],
             add_qk_str=add_qk,
             scale=None,
-            causal=(add_mask is not None),
+            causal=(causal_mask_nodes_1 is not None) or (causal_mask_nodes_2 is not None),
         )
         if new_node is None:
             logger.debug("fuse_attention: failed to create fused node")
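Net effect of the change above: the fused Attention node is marked causal only when a causal-mask subgraph was actually matched, while a 4D additive mask (either the exporter's Where/Sub/Cast expansion pattern or a graph input literally named "attention_mask") is forwarded to the fused node through add_qk_str. A minimal standalone sketch of that decision table follows; classify_mask is a hypothetical helper, not part of the onnxruntime API, and the two booleans stand in for the match_parent_path results:

def classify_mask(mask_input_name, expand_path_matched, causal_path_matched):
    """Return (add_qk, causal) the way the updated fusion decides them."""
    if expand_path_matched:
        return mask_input_name, False  # additive mask built by the exporter's expansion pattern
    if mask_input_name == "attention_mask":
        return mask_input_name, False  # additive mask passed directly as a graph input
    if causal_path_matched:
        return "", True  # causal-mask subgraph: mark the fused node unidirectional
    return None, False  # unknown mask pattern: fusion is skipped


assert classify_mask("attention_mask", False, False) == ("attention_mask", False)
assert classify_mask("some_mask", False, True) == ("", True)
assert classify_mask("some_mask", False, False) == (None, False)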
Binary file not shown.
70 changes: 59 additions & 11 deletions onnxruntime/test/python/transformers/test_phi_vision.py
@@ -149,7 +149,7 @@ def __init__(self):
         self.attn = PhiVCLIPAttention()
         self.ln = torch.nn.LayerNorm(20, eps=1e-05)
 
-    def forward(self, x):
+    def forward(self, x, attention_mask=None):
         # SkipLayerNorm ------+
         #        |            |
         #    Attention        |
@@ -163,8 +163,7 @@ def forward(self, x):
         x = self.ln(x)
         residual = x
 
-        # Attention + MatMul
-        x = self.attn(x)
+        x = self.attn(x, attention_mask=attention_mask)
 
         # SkipLayerNorm
         x = residual + x
@@ -194,14 +193,31 @@ def verify_fusion(self, optimized_model, expected_model_filename):
         )
 
     def export(self, model, inputs):
-        torch.onnx.export(
-            model,
-            args=inputs,
-            f=os.path.join(os.path.dirname(__file__), "export.onnx"),
-            export_params=True,
-            opset_version=14,
-            do_constant_folding=True,
-        )
+        path = os.path.join(os.path.dirname(__file__), "export.onnx")
+
+        if len(inputs) == 2:
+            torch.onnx.export(
+                model,
+                args=inputs,
+                f=path,
+                export_params=True,
+                opset_version=14,
+                do_constant_folding=True,
+                input_names=["input", "attention_mask"],
+                dynamic_axes={
+                    "input": {0: "batch", 1: "seq"},
+                    "attention_mask": {0: "batch", 2: "seq", 3: "seq"},
+                },
+            )
+        else:
+            torch.onnx.export(
+                model,
+                args=inputs,
+                f=path,
+                export_params=True,
+                opset_version=14,
+                do_constant_folding=True,
+            )
 
     def tearDown(self):
         path = os.path.join(os.path.dirname(__file__), "export.onnx")
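With the two-input branch above, the exported graph should expose "input" and "attention_mask" with the declared symbolic axes. A quick inspection sketch, assuming the export.onnx path the test writes to:

import onnx

m = onnx.load("export.onnx")
for graph_input in m.graph.input:
    # dim_param holds a symbolic axis name; dim_value holds a fixed size.
    dims = [d.dim_param or d.dim_value for d in graph_input.type.tensor_type.shape.dim]
    print(graph_input.name, dims)
# Expected along the lines of:
#   input ['batch', 'seq', 20]
#   attention_mask ['batch', 1, 'seq', 'seq']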
@@ -249,6 +265,38 @@ def test_phi_vision_attention(self):
         )
         self.verify_fusion(optimized_model, "phi-3.5-v-instruct-vision-attention.onnx")
 
+    def test_phi_vision_attention_with_mask(self):
+        model = PhiVCLIPAttentionAndLayerNorm()
+
+        batch, seq_len, dim = 1, 2, 20
+        mask = torch.zeros(batch, 1, seq_len, seq_len)
+        mask[:, 1:] = float("-inf")
+
+        inputs = (torch.randn(batch, seq_len, dim), mask)
+        self.export(model, inputs)
+        original_model = onnx.load(os.path.join(os.path.dirname(__file__), "export.onnx"))
+        options = FusionOptions("clip")
+        optimized_model = optimize_model(
+            original_model,
+            model_type="clip",
+            num_heads=2,
+            hidden_size=20,
+            optimization_options=options,
+            opt_level=0,
+            use_gpu=True,
+        )
+        self.verify_fusion(optimized_model, "phi-4-v-instruct-vision-attention.onnx")
+
+        graph = optimized_model.model.graph
+        attention_node = next((n for n in graph.node if n.name == "Attention_0"), None)
+        self.assertIsNotNone(attention_node, "Could not find the Attention fused node")
+        attr_names = [attr.name for attr in attention_node.attribute]
+        self.assertNotIn(
+            "unidirectional",
+            attr_names,
+            f"The attention node should not have a 'unidirectional' attribute: {attr_names}",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
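The mask the new test feeds in is a 4D additive bias of shape (batch, 1, seq, seq): 0 keeps a query/key pair, -inf removes it, matching the convention the exported Add node applies to the raw attention scores. A hedged sketch of building such a mask; the position masked here is illustrative, not the test's exact slicing:

import torch


def make_additive_mask(batch: int, seq_len: int) -> torch.Tensor:
    # (batch, 1, seq, seq): broadcasts over heads; 0 = attend, -inf = masked out.
    mask = torch.zeros(batch, 1, seq_len, seq_len)
    mask[..., -1] = float("-inf")  # example: hide the last key position from every query
    return mask


print(make_additive_mask(1, 2))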