Commit ce64fb7

Merge branch 'main' into remove-optional-bias
2 parents 153b4e7 + f529292 commit ce64fb7

16 files changed: +249 additions, -197 deletions


VERSION

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-0.5.0
+0.5.2

noxfile.py

Lines changed: 1 addition & 1 deletion

@@ -42,7 +42,7 @@
     "packaging",
     "protobuf",
 )
-ONNX_IR = "onnx_ir==0.1.7"
+ONNX_IR = "onnx_ir==0.1.9"
 ONNX_IR_MAIN = "git+https://github.com/onnx/ir-py.git@main#egg=onnx_ir"

onnxscript/function_libs/torch_lib/ops/core.py

Lines changed: 31 additions & 15 deletions

@@ -925,16 +925,21 @@ def aten_atan(self: TFloat) -> TFloat:
     return op.Atan(self)
 
 
-@torch_op("aten::atan2")
+@torch_op("aten::atan2", trace_only=True)
 def aten_atan2(self: TFloat, other: TFloat) -> TFloat:
     """atan2(Tensor self, Tensor other) -> Tensor"""
 
     # self is y, and other is x on coordinate
     slope = op.Div(self, other)
     atan = op.Atan(slope)
+    zero = common_ops.constant(0.0, dtype=self.dtype)
+    pi = common_ops.constant(_MATH_PI, dtype=self.dtype)
 
-    second_third_quadrant = op.Where(self > 0.0, atan + _MATH_PI, atan - _MATH_PI)
-    result = op.Where(other < 0.0, second_third_quadrant, atan)
+    second_third_quadrant = op.Where(op.Greater(self, zero), atan + pi, atan - pi)
+    result = op.Where(op.Less(other, zero), second_third_quadrant, atan)
+
+    # Map NaN to 0 to match PyTorch behavior
+    result = op.Where(op.IsNaN(result), zero, result)
 
     return result
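
The new graph computes atan2 from Atan plus an explicit quadrant correction, and maps the NaN produced by the 0/0 case to 0 as torch.atan2 does. A minimal NumPy sketch of the same logic (NumPy and the sample points are only for illustration, not part of the change):

import numpy as np

def atan2_via_atan(y, x):
    # atan(y/x), then shift second/third-quadrant results by +/- pi,
    # and map NaN (the 0/0 case) to 0 to match torch.atan2.
    with np.errstate(divide="ignore", invalid="ignore"):
        atan = np.arctan(y / x)
    second_third_quadrant = np.where(y > 0.0, atan + np.pi, atan - np.pi)
    result = np.where(x < 0.0, second_third_quadrant, atan)
    return np.where(np.isnan(result), 0.0, result)

y = np.array([1.0, 1.0, -1.0, -1.0, 0.0])
x = np.array([1.0, -1.0, -1.0, 1.0, 0.0])
print(atan2_via_atan(y, x))  # ~[0.785, 2.356, -2.356, -0.785, 0.0]
print(np.arctan2(y, x))      # same values for these sample points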

@@ -7327,16 +7332,25 @@ def aten_repeat_interleave_self_int(
     self_rank = len(self.shape)
     pos_dim = (dim + self_rank) % self_rank
     unsqueezed = op.Unsqueeze(self, [pos_dim + 1])
-    tiles = [1] * (self_rank + 1)
-    tiles[pos_dim + 1] = repeats
-    tile_repeat = op.Constant(value=ir.tensor(tiles, dtype=INT64.dtype))
-    tiled = op.Tile(unsqueezed, tile_repeat)
+    if isinstance(repeats, int):
+        tiles = [1] * (self_rank + 1)
+        tiles[pos_dim + 1] = repeats
+        tile_repeat = op.Constant(value=ir.tensor(tiles, dtype=INT64.dtype))
+    else:
+        # repeats is a symbolic tensor
+        tile_repeat = op.Concat(
+            op.Constant(value=ir.tensor([1] * pos_dim, dtype=INT64.dtype)),
+            op.Reshape(repeats, op.Constant(value=ir.tensor([-1], dtype=INT64.dtype))),
+            op.Constant(value=ir.tensor([1] * (self_rank - pos_dim), dtype=INT64.dtype)),
+            axis=0,
+        )
+    tiled = op.Expand(unsqueezed, tile_repeat)
     if self_rank == 1:
         return op.Identity(tiled)
     final_shape = op.Concat(
         op.Shape(self, start=0, end=dim),
         op.Constant(value_ints=[-1]),
-        op.Shape(self, start=dim + 1),
+        op.Shape(self, start=pos_dim + 1),
         axis=0,
     )
     return op.Reshape(tiled, final_shape)
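
The Unsqueeze/Expand/Reshape pattern above is the standard way to express repeat_interleave with a known repeat count; when repeats is a symbolic tensor, the repeat vector is assembled at runtime with Concat instead of being baked into a Constant. A small NumPy sketch of the equivalent shape manipulation, using a hypothetical 2x2 input (illustration only):

import numpy as np

x = np.array([[1, 2], [3, 4]])                  # dim=1, repeats=3
unsqueezed = x[:, :, None]                      # Unsqueeze after dim -> (2, 2, 1)
tiled = np.broadcast_to(unsqueezed, (2, 2, 3))  # Expand along the inserted axis
out = tiled.reshape(2, -1)                      # fold dim and the new axis -> (2, 6)
assert (out == np.repeat(x, 3, axis=1)).all()   # [[1 1 1 2 2 2], [3 3 3 4 4 4]]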

@@ -7375,20 +7389,22 @@ def aten_repeat_interleave_Tensor(
     if dim is None:
         # flatten
         self = op.Reshape(self, [-1])
-        rk = 1
+        rank = 1
     else:
-        rk = len(self.shape)
+        rank = len(self.shape)
 
-    if rk > 2:
+    if rank > 2:
         shape_x0 = op.Shape(self, start=0, end=1)
         shape_x = op.Shape(self, start=1)
         self = op.Reshape(self, op.Concat(shape_x0, [-1], axis=0))
-    elif rk == 1:
+    elif rank == 1:
         shape_x = None
         self = op.Reshape(self, [-1, 1])
     else:
-        if rk != 2:
-            raise NotImplementedError(f"rank(self)={rk} not implemented for repeat_interleave")
+        if rank != 2:
+            raise NotImplementedError(
+                f"rank(self)={rank} not implemented for repeat_interleave"
+            )
         shape_x = None
 
     ci = op.CumSum(repeats, [0])

@@ -7401,7 +7417,7 @@ def aten_repeat_interleave_Tensor(
     )
     indices = op.Reshape(srows, [-1])
     values = op.GatherND(self, op.Unsqueeze(indices, [-1]))
-    if rk == 2:
+    if rank == 2:
         return values
     # shape_x is None at this stage.
     assert shape_x is None  # for mypy
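
Beyond the rk -> rank rename, the surrounding implementation turns a per-row repeat tensor into gather indices via a cumulative sum (op.CumSum followed by GatherND in the visible context). A NumPy sketch of that index construction; searchsorted stands in here for the comparison-based gather the ONNX graph builds, so this is illustrative rather than a literal transcription:

import numpy as np

x = np.array([[1, 2], [3, 4], [5, 6]])  # rank 2, repeat along dim 0
repeats = np.array([2, 1, 3])
ends = np.cumsum(repeats)               # [2, 3, 6]  (op.CumSum in the graph)
indices = np.searchsorted(ends, np.arange(ends[-1]), side="right")
out = x[indices]                        # row gather, as GatherND does in the graph
assert (out == np.repeat(x, repeats, axis=0)).all()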

onnxscript/function_libs/torch_lib/ops/nn.py

Lines changed: 72 additions & 5 deletions

@@ -1741,6 +1741,64 @@ def _attention_scale(query: TFloat) -> TFloat:
     return scale
 
 
+def _attention_repeat_kv_for_group_query(
+    query: TFloat, key: TFloat, value: TFloat
+) -> Tuple[TFloat, TFloat]:
+    """Expand key and value for group query attention.
+
+    repeat_interleave is applied on key and value to match the number of heads in query.
+
+    Args:
+        query: Tensor of shape [B, q_num_heads, q_S, E]
+        key: Tensor of shape [B, k_num_heads, kv_S, E]
+        value: Tensor of shape [B, v_num_heads, kv_S, E]
+
+    Returns:
+        Tuple of (expanded_key, expanded_value) where:
+            - expanded_key: Tensor of shape [B, q_num_heads, kv_S, E]
+            - expanded_value: Tensor of shape [B, q_num_heads, kv_S, E]
+    """
+
+    assert (
+        query.shape[1] > key.shape[1] == value.shape[1] and query.shape[1] % key.shape[1] == 0
+    ), (
+        "SDPA (GQA or MQA) requires q_num_heads > kv_num_heads & q_num_heads % kv_num_heads == 0"
+    )
+
+    # NOTE: QKV are expected to be 4D tensors
+
+    batch_size = op.Shape(query, start=0, end=1)  # [B]
+    q_num_heads = op.Shape(query, start=1, end=2)  # [Hq]
+    kv_num_heads = op.Shape(key, start=1, end=2)  # [Hk]
+    qk_head_size = op.Shape(key, start=3, end=4)  # [Dk]
+    v_head_size = op.Shape(value, start=3, end=4)  # [Dv]
+    new_kv_seq_len = op.Shape(key, start=2, end=3)  # [T]
+
+    interleave_dim = op.Div(q_num_heads, kv_num_heads)  # Hq / Hk
+    two = op.Constant(value_int=2)
+    k_unsqueezed = op.Unsqueeze(key, two)  # [B, Hk, 1, T, Dk]
+    v_unsqueezed = op.Unsqueeze(value, two)  # [B, Hv, 1, T, Dv]
+
+    k_expand_shape = op.Concat(
+        batch_size, kv_num_heads, interleave_dim, new_kv_seq_len, qk_head_size, axis=0
+    )
+    k_expand = op.Expand(k_unsqueezed, k_expand_shape)
+    v_expand_shape = op.Concat(
+        batch_size, kv_num_heads, interleave_dim, new_kv_seq_len, v_head_size, axis=0
+    )
+    v_expand = op.Expand(v_unsqueezed, v_expand_shape)
+
+    k_attention_shape = op.Concat(
+        batch_size, q_num_heads, new_kv_seq_len, qk_head_size, axis=0
+    )
+    v_attention_shape = op.Concat(batch_size, q_num_heads, new_kv_seq_len, v_head_size, axis=0)
+
+    expanded_key = op.Reshape(k_expand, k_attention_shape)
+    expanded_value = op.Reshape(v_expand, v_attention_shape)
+
+    return expanded_key, expanded_value
+
+
 @torch_op("aten::scaled_dot_product_attention", trace_only=True)
 def aten_scaled_dot_product_attention(
     query: TFloat,
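
The helper performs repeat_interleave on the head axis purely with shape ops: insert a new axis after the KV head axis, Expand it to the group size Hq/Hk, then Reshape so the groups fold back into the head dimension. A PyTorch sketch of the same shape arithmetic, with hypothetical sizes, used only to illustrate the transformation:

import torch

B, Hq, Hk, T, D = 2, 8, 2, 5, 16      # Hq % Hk == 0, group size 4
key = torch.randn(B, Hk, T, D)
group = Hq // Hk
k = key.unsqueeze(2)                  # [B, Hk, 1, T, D]    (Unsqueeze)
k = k.expand(B, Hk, group, T, D)      # [B, Hk, group, T, D] (Expand)
k = k.reshape(B, Hq, T, D)            # [B, Hq, T, D]        (Reshape)
assert torch.equal(k, key.repeat_interleave(group, dim=1))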

@@ -1772,8 +1830,8 @@ def aten_scaled_dot_product_attention(
         "is_causal and attn_mask cannot be set at the same time"
     )
 
-    assert not enable_gqa, (
-        "conversion of scaled_dot_product_attention not implemented if enable_gqa is True"
+    assert len(query.shape) == 4 and len(key.shape) == 4 and len(value.shape) == 4, (
+        "only 4D query, key, and value are supported"
     )
 
     # Reference: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html

@@ -1784,6 +1842,13 @@ def aten_scaled_dot_product_attention(
     if is_causal:
         attn_mask = _causal_attention_mask(query, key)
 
+    if enable_gqa:
+        key, value = _attention_repeat_kv_for_group_query(query, key, value)
+    else:
+        assert query.shape[1] == key.shape[1] == value.shape[1], (
+            "SDPA (MHA) requires q_num_heads = kv_num_heads"
+        )
+
     if attn_mask is None:
         return _aten_scaled_dot_product_attention_no_mask_onnx(
             query, key, value, scale, dropout_p

@@ -1981,9 +2046,8 @@ def aten_scaled_dot_product_attention_bool_mask(
     assert (not is_causal) or (is_causal and attn_mask is None), (
         "is_causal and attn_mask cannot be set at the same time"
     )
-
-    assert not enable_gqa, (
-        "conversion of scaled_dot_product_attention not implemented if enable_gqa is True"
+    assert len(query.shape) == 4 and len(key.shape) == 4 and len(value.shape) == 4, (
+        "only 4D query, key, and value are supported"
     )
 
     if scale is None:

@@ -1997,6 +2061,9 @@ def aten_scaled_dot_product_attention_bool_mask(
             query, key, value, attn_mask, scale, dropout_p
         )
 
+    if enable_gqa:
+        key, value = _attention_repeat_kv_for_group_query(query, key, value)
+
     if attn_mask is None:
         return _aten_scaled_dot_product_attention_no_mask_onnx(
             query, key, value, scale, dropout_p
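
With these changes, scaled_dot_product_attention calls that set enable_gqa=True are converted instead of being rejected, provided Q/K/V are 4D and the head counts satisfy the GQA constraint checked by the helper. A hypothetical call that now lands on the supported path (enable_gqa requires a recent PyTorch release):

import torch
import torch.nn.functional as F

B, T, D = 2, 5, 16
query = torch.randn(B, 8, T, D)  # 8 query heads
key = torch.randn(B, 2, T, D)    # 2 KV heads; 8 % 2 == 0, so GQA applies
value = torch.randn(B, 2, T, D)
out = F.scaled_dot_product_attention(query, key, value, enable_gqa=True)
print(out.shape)                 # torch.Size([2, 8, 5, 16])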

onnxscript/function_libs/torch_lib/ops/prims.py

Lines changed: 23 additions & 2 deletions

@@ -176,12 +176,33 @@ def prims_bitwise_xor(self: TensorType, other: TensorType) -> TensorType:
     raise NotImplementedError()
 
 
+@torch_op("prims::broadcast_in_dim", trace_only=True)
 def prims_broadcast_in_dim(
-    a: TensorType, shape: INT64, broadcast_dimensions: Sequence[int]
+    a: TensorType, shape: Sequence[INT64], broadcast_dimensions: Sequence[int]
 ) -> TensorType:
     """broadcast_in_dim(Tensor(a) a, SymInt[] shape, int[] broadcast_dimensions) -> Tensor(a)"""
 
-    raise NotImplementedError()
+    target_rank = len(shape)
+
+    if not broadcast_dimensions:
+        # Special case: no broadcast dimensions - all target dims should be 1
+        return op.Expand(a, common_ops.merge_dims(shape))
+
+    # Create base shape of all 1s
+    ones = [1] * target_rank
+
+    # For each broadcast dimension, we'll replace the 1 with the actual input dimension
+    # Since broadcast_dimensions is compile-time known, we can do this with individual operations
+    intermediate_shape = ones
+
+    for i, broadcast_dim in enumerate(broadcast_dimensions):
+        # Get the input dimension value
+        input_dim_value = op.Shape(a, start=i, end=i + 1)
+        intermediate_shape[broadcast_dim] = input_dim_value
+
+    # Reshape input to intermediate shape and expand to target
+    reshaped = op.Reshape(a, common_ops.merge_dims(intermediate_shape))
+    return op.Expand(reshaped, shape)
 
 
 def prims_cat(tensors: Sequence[TensorType], dim: int) -> TensorType:
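
broadcast_in_dim places each input axis at the target position named in broadcast_dimensions, fills the remaining target axes with size 1, and then broadcasts; the new lowering expresses exactly that as a Reshape followed by an Expand. A NumPy sketch of the semantics (illustration only):

import numpy as np

def broadcast_in_dim(a, shape, broadcast_dimensions):
    # Step 1: intermediate shape of 1s with each input axis placed at its
    # target position (mirrors the Reshape in the lowering).
    intermediate = [1] * len(shape)
    for i, d in enumerate(broadcast_dimensions):
        intermediate[d] = a.shape[i]
    # Step 2: broadcast to the full target shape (mirrors the Expand).
    return np.broadcast_to(a.reshape(intermediate), shape)

a = np.arange(3)  # shape (3,)
out = broadcast_in_dim(a, (2, 3, 4), broadcast_dimensions=[1])
print(out.shape)  # (2, 3, 4): `a` lands on axis 1 and is broadcast elsewhere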
