diff --git a/ivy/data_classes/array/layers.py b/ivy/data_classes/array/layers.py index 550a225b7e798..e9169b2ee0cbd 100644 --- a/ivy/data_classes/array/layers.py +++ b/ivy/data_classes/array/layers.py @@ -409,6 +409,12 @@ def multi_head_attention( in_proj_bias: Optional[Union[ivy.Array, ivy.NativeArray]] = None, out_proj_bias: Optional[Union[ivy.Array, ivy.NativeArray]] = None, is_causal: bool = False, + key_padding_mask: Optional[Union[ivy.Array, ivy.NativeArray]] = None, + bias_k: Optional[Union[ivy.Array, ivy.NativeArray]] = None, + bias_v: Optional[Union[ivy.Array, ivy.NativeArray]] = None, + static_k: Optional[Union[ivy.Array, ivy.NativeArray]] = None, + static_v: Optional[Union[ivy.Array, ivy.NativeArray]] = None, + add_zero_attn: bool = False, return_attention_weights: bool = False, average_attention_weights: bool = True, dropout: float = 0.0, @@ -430,6 +436,12 @@ def multi_head_attention( in_proj_bias=in_proj_bias, out_proj_bias=out_proj_bias, is_causal=is_causal, + key_padding_mask=key_padding_mask, + bias_k=bias_k, + bias_v=bias_v, + static_k=static_k, + static_v=static_v, + add_zero_attn=add_zero_attn, return_attention_weights=return_attention_weights, average_attention_weights=average_attention_weights, dropout=dropout, diff --git a/ivy/data_classes/container/layers.py b/ivy/data_classes/container/layers.py index b56ed5c84a9d6..d071b6fa7be89 100644 --- a/ivy/data_classes/container/layers.py +++ b/ivy/data_classes/container/layers.py @@ -1055,6 +1055,14 @@ def _static_multi_head_attention( Union[ivy.Array, ivy.NativeArray, ivy.Container] ] = None, is_causal: Union[bool, ivy.Container] = False, + key_padding_mask: Optional[ + Union[ivy.Array, ivy.NativeArray, ivy.Container] + ] = None, + bias_k: Optional[Union[ivy.Array, ivy.NativeArray, ivy.Container]] = None, + bias_v: Optional[Union[ivy.Array, ivy.NativeArray, ivy.Container]] = None, + static_k: Optional[Union[ivy.Array, ivy.NativeArray, ivy.Container]] = None, + static_v: Optional[Union[ivy.Array, ivy.NativeArray, ivy.Container]] = None, + add_zero_attn: Union[bool, ivy.Container] = False, return_attention_weights: Union[bool, ivy.Container] = False, average_attention_weights: Union[bool, ivy.Container] = True, dropout: Union[float, ivy.Container] = 0.0, @@ -1081,6 +1089,12 @@ def _static_multi_head_attention( in_proj_bias=in_proj_bias, out_proj_bias=out_proj_bias, is_causal=is_causal, + key_padding_mask=key_padding_mask, + bias_k=bias_k, + bias_v=bias_v, + static_k=static_k, + static_v=static_v, + add_zero_attn=add_zero_attn, return_attention_weights=return_attention_weights, average_attention_weights=average_attention_weights, dropout=dropout, @@ -1123,6 +1137,14 @@ def multi_head_attention( Union[ivy.Array, ivy.NativeArray, ivy.Container] ] = None, is_causal: Union[bool, ivy.Container] = False, + key_padding_mask: Optional[ + Union[ivy.Array, ivy.NativeArray, ivy.Container] + ] = None, + bias_k: Optional[Union[ivy.Array, ivy.NativeArray, ivy.Container]] = None, + bias_v: Optional[Union[ivy.Array, ivy.NativeArray, ivy.Container]] = None, + static_k: Optional[Union[ivy.Array, ivy.NativeArray, ivy.Container]] = None, + static_v: Optional[Union[ivy.Array, ivy.NativeArray, ivy.Container]] = None, + add_zero_attn: Union[bool, ivy.Container] = False, return_attention_weights: Union[bool, ivy.Container] = False, average_attention_weights: Union[bool, ivy.Container] = True, dropout: Union[float, ivy.Container] = 0.0, @@ -1148,6 +1170,12 @@ def multi_head_attention( in_proj_bias=in_proj_bias, out_proj_bias=out_proj_bias, 
is_causal=is_causal, + key_padding_mask=key_padding_mask, + bias_k=bias_k, + bias_v=bias_v, + static_k=static_k, + static_v=static_v, + add_zero_attn=add_zero_attn, return_attention_weights=return_attention_weights, average_attention_weights=average_attention_weights, dropout=dropout, diff --git a/ivy/functional/backends/torch/layers.py b/ivy/functional/backends/torch/layers.py index 8bb277fd2bb44..f447ebd3c151e 100644 --- a/ivy/functional/backends/torch/layers.py +++ b/ivy/functional/backends/torch/layers.py @@ -6,11 +6,129 @@ # local import ivy -from ivy.func_wrapper import with_unsupported_dtypes +from ivy.func_wrapper import with_unsupported_dtypes, with_supported_dtypes from . import backend_version from ivy.functional.ivy.layers import _handle_padding, _deconv_length +@with_supported_dtypes( + {"2.0.1 and below": ("float32", "float64", "complex")}, + backend_version, +) +def multi_head_attention( + query: torch.Tensor, + /, + *, + key: torch.Tensor = None, + value: torch.Tensor = None, + batch_first: bool = True, + num_heads: Optional[int] = 8, + scale: Optional[float] = None, + attention_mask: torch.Tensor = None, + in_proj_weights: torch.Tensor = None, + q_proj_weights: torch.Tensor = None, + k_proj_weights: torch.Tensor = None, + v_proj_weights: torch.Tensor = None, + out_proj_weights: torch.Tensor = None, + in_proj_bias: torch.Tensor = None, + out_proj_bias: torch.Tensor = None, + is_causal: Optional[bool] = False, + key_padding_mask: Optional[torch.Tensor] = None, + bias_k: Optional[torch.Tensor] = None, + bias_v: Optional[torch.Tensor] = None, + static_k: Optional[torch.Tensor] = None, + static_v: Optional[torch.Tensor] = None, + add_zero_attn: bool = False, + return_attention_weights: Optional[bool] = False, + average_attention_weights: Optional[bool] = True, + dropout: Optional[float] = 0.0, + training: Optional[bool] = False, + out: torch.Tensor = None, +) -> torch.Tensor: + if key is None and value is None: + key = value = query + emb_dim = _get_embed_dim( + in_proj_weights, + q_proj_weights, + k_proj_weights, + v_proj_weights, + query, + )[1] + num_dims = query.ndim + if num_dims == 3 and batch_first: + query, key, value = [torch.swapaxes(x, 0, 1) for x in [query, key, value]] + ret = torch.nn.functional.multi_head_attention_forward( + query, + key, + value, + emb_dim, + num_heads, + in_proj_weights, + in_proj_bias, + bias_k, + bias_v, + add_zero_attn, + dropout, + out_proj_weights, + out_proj_bias, + training=training, + key_padding_mask=key_padding_mask, + need_weights=return_attention_weights, + attn_mask=attention_mask, + use_separate_proj_weight=not ivy.exists(in_proj_weights), + q_proj_weight=q_proj_weights, + k_proj_weight=k_proj_weights, + v_proj_weight=v_proj_weights, + static_k=static_k, + static_v=static_v, + average_attn_weights=average_attention_weights, + is_causal=is_causal, + ) + ret = list(ret) if isinstance(ret, tuple) else [ret] + if num_dims == 3 and batch_first: + ret[0] = ret[0].swapaxes(0, 1) + if return_attention_weights: + return tuple(ret) + return ret[0] + + +multi_head_attention.partial_mixed_handler = ( + lambda *args, scale=None, out_proj_weights=None, is_causal=False, attention_mask=None, return_attention_weights=False, in_proj_weights=None, q_proj_weights=None, k_proj_weights=None, v_proj_weights=None, **kwargs: not ivy.exists( + scale + ) + and ivy.exists(out_proj_weights) + and (not is_causal or ivy.exists(attention_mask)) + and (not is_causal or not return_attention_weights) + and ( + ivy.exists(in_proj_weights) + or all( + [ivy.exists(x) 
for x in [q_proj_weights, k_proj_weights, v_proj_weights]] + ) + ) + and len( + set( + _get_embed_dim( + in_proj_weights, q_proj_weights, k_proj_weights, v_proj_weights, args[0] + ) + ) + ) + == 1 +) + + +def _get_embed_dim( + in_proj_weights, q_proj_weights, k_proj_weights, v_proj_weights, query +): + pre_embed_dim = query.shape[-1] + if ivy.exists(in_proj_weights): + embed_dim = in_proj_weights.shape[0] / 3 + elif all([ivy.exists(x) for x in [q_proj_weights, k_proj_weights, v_proj_weights]]): + embed_dim = q_proj_weights.shape[0] + else: + embed_dim = None + return pre_embed_dim, embed_dim + + @with_unsupported_dtypes( {"2.0.1 and below": ("float16", "bfloat16", "complex")}, backend_version, diff --git a/ivy/functional/frontends/torch/nn/functional/non_linear_activation_functions.py b/ivy/functional/frontends/torch/nn/functional/non_linear_activation_functions.py index 03d62ea8eacb8..317032daab47b 100644 --- a/ivy/functional/frontends/torch/nn/functional/non_linear_activation_functions.py +++ b/ivy/functional/frontends/torch/nn/functional/non_linear_activation_functions.py @@ -266,143 +266,36 @@ def multi_head_attention_forward( average_attn_weights=True, is_causal=False, ): - # q/k/v shape: (seq_len, batch_size, embed_dim) - seq_len, batch_size, embed_dim = query.shape + embed_dim = query.shape[-1] assert ( embed_dim == embed_dim_to_check ), f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}" - assert key.shape == value.shape - - head_dim = embed_dim // num_heads - assert head_dim * num_heads == embed_dim, "embed_dim needs to be divisible by heads" - scale = ivy.sqrt(head_dim) - - if use_separate_proj_weight: - assert key.shape[:2] == value.shape[:2], ( - f"key's sequence and batch dims {key.shape[:2]} do not match value's" - f" {value.shape[:2]}" - ) - else: - assert ( - key.shape == value.shape - ), f"key shape {key.shape} does not match value shape {value.shape}" - - if is_causal and key_padding_mask is None and not need_weights: - mask = ivy.tril(ivy.ones((seq_len, seq_len), dtype=query.dtype), k=0) - attn_mask = ivy.zeros((seq_len, seq_len), dtype=query.dtype) - attn_mask = ivy.where(mask == 0.0, float("-inf"), 0) - - if in_proj_bias is None: - q_bias, k_bias, v_bias = None, None, None - else: - q_bias, k_bias, v_bias = ivy.split(in_proj_bias, num_or_size_splits=3) - - if not use_separate_proj_weight: - q_proj_weight, k_proj_weight, v_proj_weight = ivy.split( - in_proj_weight, num_or_size_splits=3 - ) - - q = ivy.linear(query, q_proj_weight, bias=q_bias) - k = ivy.linear(key, k_proj_weight, bias=k_bias) - v = ivy.linear(value, v_proj_weight, bias=v_bias) - - if bias_k is not None and bias_v is not None: - assert static_k is None, "bias cannot be added to static key." - assert static_v is None, "bias cannot be added to static value." 
- k = ivy.concat([k, ivy.tile(bias_k, (1, batch_size, 1))]) - v = ivy.concat([v, ivy.tile(bias_v, (1, batch_size, 1))]) - if attn_mask is not None: - attn_mask = ivy.concat( - [attn_mask, ivy.zeros((attn_mask.shape[0], 1), dtype=attn_mask.dtype)], - axis=1, - ) - if key_padding_mask is not None: - key_padding_mask = ivy.concat( - [ - key_padding_mask, - ivy.zeros( - (key_padding_mask.shape[0], 1), dtype=key_padding_mask.dtype - ).bool(), - ], - axis=1, - ) - - q = ivy.swapaxes(q.reshape((q.shape[0], batch_size * num_heads, head_dim)), 0, 1) - - if static_k is None: - k = ivy.swapaxes( - k.reshape((k.shape[0], batch_size * num_heads, head_dim)), 0, 1 - ) - else: - assert static_k.shape[0] == batch_size * num_heads, ( - f"expecting static_k.shape[0] of {batch_size * num_heads}, but got" - f" {static_k.shape[0]}" - ) - assert ( - static_k.shape[2] == head_dim - ), f"expecting static_k.shape[2] of {head_dim}, but got {static_k.shape[2]}" - k = static_k - - if static_v is None: - v = ivy.swapaxes( - v.reshape((v.shape[0], batch_size * num_heads, head_dim)), 0, 1 - ) - else: - assert static_v.shape[0] == batch_size * num_heads, ( - f"expecting static_v.shape[0] of {batch_size * num_heads}, but got" - f" {static_v.shape[0]}" - ) - assert ( - static_v.shape[2] == head_dim - ), f"expecting static_v.shape[2] of {head_dim}, but got {static_v.shape[2]}" - v = static_v - - # TODO add_zero_attn doesn't work for all cases - # fix this and add test cases (by changing to add_zero_attn=st.booleans()) - if add_zero_attn: - zero_attn_shape = (batch_size * num_heads, 1, head_dim) - k = ivy.concat([k, ivy.zeros(zero_attn_shape, dtype=k.dtype)], axis=1) - v = ivy.concat([v, ivy.zeros(zero_attn_shape, dtype=v.dtype)], axis=1) - if attn_mask is not None: - attn_mask = ivy.pad(attn_mask, [(0, 0), (0, 1)]) - if key_padding_mask is not None: - key_padding_mask = ivy.pad(key_padding_mask, [(0, 0), (0, 1)]) - - src_len = k.shape[1] - attn_weights = ivy.matmul(q, ivy.swapaxes(k, 1, 2)) - assert list(attn_weights.shape) == [batch_size * num_heads, seq_len, src_len] - - attn_weights = attn_weights / scale - - if attn_mask is not None: - attn_mask = ivy.expand_dims(attn_mask, axis=0) - attn_weights += attn_mask - - if key_padding_mask is not None: - key_padding_mask = ivy.expand_dims( - ivy.expand_dims(key_padding_mask, axis=1), axis=2 - ) - attn_weights = attn_weights.reshape((batch_size, num_heads, seq_len, src_len)) - attn_weights = ivy.where(key_padding_mask < 0.0, float("-inf"), attn_weights) - attn_weights = attn_weights.reshape((batch_size * num_heads, seq_len, src_len)) - - attn_weights = ivy.softmax(attn_weights, axis=-1) - attn_weights = ivy.dropout(attn_weights, dropout_p, training=training) - - attn_output = ivy.matmul(attn_weights, v) - assert list(attn_output.shape) == [batch_size * num_heads, seq_len, head_dim] - attn_output = ivy.swapaxes(attn_output, 0, 1).reshape( - (seq_len, batch_size, embed_dim) + return ivy.multi_head_attention( + query, + key=key, + value=value, + batch_first=False, + num_heads=num_heads, + attention_mask=attn_mask, + in_proj_weights=in_proj_weight if not use_separate_proj_weight else None, + q_proj_weights=q_proj_weight, + k_proj_weights=k_proj_weight, + v_proj_weights=v_proj_weight, + out_proj_weights=out_proj_weight, + in_proj_bias=in_proj_bias, + out_proj_bias=out_proj_bias, + is_causal=is_causal and not (need_weights or key_padding_mask is not None), + key_padding_mask=key_padding_mask, + bias_k=bias_k, + bias_v=bias_v, + static_k=static_k, + static_v=static_v, + 
add_zero_attn=add_zero_attn, + return_attention_weights=need_weights, + average_attention_weights=average_attn_weights, + dropout=dropout_p, + training=training, ) - attn_output = ivy.linear(attn_output, out_proj_weight, bias=out_proj_bias) - - if need_weights: - attn_weights = attn_weights.reshape((batch_size, num_heads, seq_len, src_len)) - if average_attn_weights: - attn_weights = ivy.sum(attn_weights, axis=1) / num_heads - return (attn_output, attn_weights) - else: - return (attn_output,) @to_ivy_arrays_and_back diff --git a/ivy/functional/ivy/layers.py b/ivy/functional/ivy/layers.py index e9b942da7ebab..2a428fbfd91a5 100644 --- a/ivy/functional/ivy/layers.py +++ b/ivy/functional/ivy/layers.py @@ -705,7 +705,7 @@ def scaled_dot_product_attention( @handle_exceptions @handle_nestable @handle_out_argument -# @handle_array_like_without_promotion +@handle_partial_mixed_function @inputs_to_ivy_arrays @handle_array_function def multi_head_attention( @@ -714,6 +714,7 @@ def multi_head_attention( *, key: Optional[Union[ivy.Array, ivy.NativeArray]] = None, value: Optional[Union[ivy.Array, ivy.NativeArray]] = None, + batch_first: bool = True, num_heads: int = 8, scale: Optional[float] = None, attention_mask: Optional[Union[ivy.Array, ivy.NativeArray]] = None, @@ -725,6 +726,12 @@ def multi_head_attention( in_proj_bias: Optional[Union[ivy.Array, ivy.NativeArray]] = None, out_proj_bias: Optional[Union[ivy.Array, ivy.NativeArray]] = None, is_causal: bool = False, + key_padding_mask: Optional[Union[ivy.Array, ivy.NativeArray]] = None, + bias_k: Optional[Union[ivy.Array, ivy.NativeArray]] = None, + bias_v: Optional[Union[ivy.Array, ivy.NativeArray]] = None, + static_k: Optional[Union[ivy.Array, ivy.NativeArray]] = None, + static_v: Optional[Union[ivy.Array, ivy.NativeArray]] = None, + add_zero_attn: bool = False, return_attention_weights: bool = False, average_attention_weights: bool = True, dropout: float = 0.0, @@ -743,50 +750,69 @@ def multi_head_attention( value_dim)`. Then, the query and key tensors are dot-producted and scaled. These are softmaxed to obtain attention probabilities. The value tensors are then interpolated by these probabilities, then concatenated back to a single tensor. Finally, the - result tensor with the last dimension as value_dim can take an linear projection and + result tensor with the last dimension as value_dim can take a linear projection and return. Parameters ---------- query - query embeddings *[batch_shape,num_queries,query_dim]*. + The query embeddings. Shape: `(L, Q)` or `(N, L, Q)`, where L is the number of + queries, N is the batch size, Q is the query embedding dimension. key - key embeddings *[batch_shape,num_queries,key_dim]*. + The key embeddings. Shape: `(S, K)` or `(N, S, K)`, where S is the number of + keys, N is the batch size, K is the key embedding dimension. value - value embeddings *[batch_shape,num_queries,value_dim]*. + The value embeddings. Shape `(S, V)` or `(N, S, V)`, where S is the number of + keys, N is the batch size, V is the value embedding dimension. + batch_first + If False, `query`, `key` and `value` will have shapes `(L, N, Q)`, `(S, N, K)` + and `(S, N, V)` respectively (if batched). num_heads The number of attention heads to use. scale The value by which to scale the query-key similarity measure before softmax. attention_mask - The mask to apply to the query-key values. Default is ``None``. - *[batch_shape,num_queries,num_keys]*. + The mask to apply to the query-key values. Shape: `(L, S)` or + `(N*num_heads, L, S)`. 
in_proj_weights - The weights used to project query, key and value *[3*E, E]. + The weights used to project query, key and value. Shape: `(3*E, E')`, where E + is the new embedding dimension and E' is the input embedding dimension, i.e. + `E' = Q = K = V`. q_proj_weights - The weights used to project query if in_proj_weights is None *[new_E, E]. + The weights used to project query if `in_proj_weights` is None. Shape: `(E, Q)`. k_proj_weights - The weights used to project key if in_proj_weights is None *[new_E, E]. + The weights used to project key if `in_proj_weights` is None. Shape: `(E, K)`. v_proj_weights - The weights used to project value if in_proj_weights is None *[new_E, E]. + The weights used to project value if `in_proj_weights` is None. Shape: `(E, V)`. out_proj_weights - The weights used to project the output. + The weights used to project the attention output. Shape: `(O, E)`, where O is + the output embedding dimension. in_proj_bias - The bias used when projecting with query, key and value. + The bias used when projecting query, key and value. Shape: `(3*E,)`. out_proj_bias - The bias used when projecting the output. + The bias used when projecting the output. Shape: `(O,)`. is_causal - If True, Uses a causal attention mask and ignores provided attention_mask. + If True, use a causal attention mask and ignore the provided `attention_mask`. + key_padding_mask + A binary mask to apply to the key sequence. Shape: `(S,)` or `(N, S)`. + bias_k + An additional bias added to the key sequence. Shape: `(E,)`. + bias_v + An additional bias added to the value sequence. Shape: `(E,)`. + static_k + A static key to be used in the attention operators. Shape: `(N*num_heads, S, E//num_heads)`. + static_v + A static value to be used in the attention operators. Shape: `(N*num_heads, S, E//num_heads)`. + add_zero_attn + A boolean flag indicating whether to add a batch of zeros to key and value. return_attention_weights - If True, returns attention_weights alongside the output - as a tuple (output, attenion_weights). Defaults to `False`. + If True, return the attention weights alongside the attention output. average_attention_weights - If true, indicates that the returned ``attention_weights`` should be averaged - across heads. Otherwise, ``attention_weights`` are provided separately per head. - Note that this flag only has an effect when ``return_attention_weights=True``. - Default: ``True`` (i.e. average weights across heads) + If True, the returned attention weights will be averaged across heads. + Otherwise, the attention weights will be provided separately per head. + Note that this flag only has an effect when `return_attention_weights=True`. dropout - Specifies the dropout probablity, dropout is applied to attention_weights. + Specifies the dropout probability. Dropout is applied on the attention weights. training If True, dropout is used, otherwise dropout is not activated. out @@ -796,9 +822,11 @@ def multi_head_attention( Returns ------- ret - The output following application of multi-head attention. - *[batch_shape,num_queries,out_feat_dim]* if input is batched - otherwise *[num_queries, out_feat_dim] + The output following the application of multi-head attention. Either `output` + or `(output, attention_weights)`. `output` will have shape `(L, E)` if the + inputs were unbatched or `(N, L, E)` otherwise, and `attention_weights` will + have shape `(L, S)` or `(N, L, S)` respectively. If `batch_first` is False and + the inputs were batched, the `output` will have shape `(L, N, E)`. 
Both the description and the type hints above assumes an array input for simplicity, but this function is *nestable*, and therefore also accepts :class:`ivy.Container` @@ -814,8 +842,13 @@ def multi_head_attention( key = value = query if num_dims == 2: query, key, value = [ivy.expand_dims(x, axis=0) for x in [query, key, value]] + elif not batch_first: + query, key, value = [ivy.swapaxes(x, 0, 1) for x in [query, key, value]] + + # project query, key and value if ivy.exists(in_proj_weights): q, k, v = _in_projection(query, key, value, w=in_proj_weights, b=in_proj_bias) + emb_dim = int(in_proj_weights.shape[0] / 3) elif all([ivy.exists(x) for x in [q_proj_weights, k_proj_weights, v_proj_weights]]): if ivy.exists(in_proj_bias): b_q, b_k, b_v = ivy.split(in_proj_bias, num_or_size_splits=3) @@ -826,61 +859,130 @@ def multi_head_attention( ivy.linear(key, k_proj_weights, bias=b_k), ivy.linear(value, v_proj_weights, bias=b_v), ) + emb_dim = q_proj_weights.shape[0] else: q, k, v = query, key, value - batch_size, q_seq_length, emb_dim = q.shape[0], q.shape[1], q.shape[-1] - k_seq_length = k.shape[1] + if ivy.exists(out_proj_weights): + emb_dim = out_proj_weights.shape[-1] + else: + emb_dim = q.shape[-1] + + num_batches, num_queries = query.shape[:2] ivy.assertions.check_true( emb_dim % num_heads == 0, "features must be divisible by number of heads" ) - dims_per_head = emb_dim // num_heads - # isolate heads - q = q.reshape((batch_size, q_seq_length, num_heads, dims_per_head)).permute_dims( - (0, 2, 1, 3) - ) - k = k.reshape((batch_size, k_seq_length, num_heads, dims_per_head)).permute_dims( - (0, 2, 3, 1) - ) - v = v.reshape((batch_size, k_seq_length, num_heads, dims_per_head)).permute_dims( - (0, 2, 1, 3) - ) - # perform bmm - attn_scores = ivy.matmul(q, k) - # scale - scale = 1 / (dims_per_head**0.5) if not scale else scale + head_dim = emb_dim // num_heads + + # apply extra bias + if bias_k is not None and bias_v is not None: + ivy.assertions.check_true( + not (ivy.exists(static_k) or ivy.exists(static_v)), + "bias cannot be added to static key or value", + ) + k = ivy.concat([k, ivy.tile(bias_k, (num_batches, 1, 1))], axis=1) + v = ivy.concat([v, ivy.tile(bias_v, (num_batches, 1, 1))], axis=1) + + num_keys = k.shape[1] + + # reshape q, k, v for efficient matrix multiplication + q = ivy.swapaxes(q.reshape((num_queries, num_batches * num_heads, head_dim)), 0, 1) + if static_k is None: + k = ivy.swapaxes(k.reshape((num_keys, num_batches * num_heads, head_dim)), 0, 1) + else: + k = static_k + if static_v is None: + v = ivy.swapaxes(v.reshape((num_keys, num_batches * num_heads, head_dim)), 0, 1) + else: + v = static_v + + # add extra batch of zeros to k, v + if add_zero_attn: + zero_attn_shape = (num_batches * num_heads, 1, head_dim) + k = ivy.concat([k, ivy.zeros(zero_attn_shape, dtype=k.dtype)], axis=1) + v = ivy.concat([v, ivy.zeros(zero_attn_shape, dtype=v.dtype)], axis=1) + num_keys = k.shape[1] + + # get attention scores + attn_scores = ivy.matmul(q, ivy.swapaxes(k, 1, 2)) + scale = 1 / (head_dim**0.5) if not scale else scale attn_scores *= scale - # apply attention mask - if ivy.exists(attention_mask) or is_causal: + + # mask the attention scores + if ivy.exists(attention_mask): + assert attention_mask.dtype in [query.dtype, ivy.bool], ( + "was expecting attention_mask of type bool or the same as the input's, but" + f" got {attention_mask.dtype}" + ) if is_causal: - # create causal mask - attention_mask = ivy.tril(ivy.ones((q_seq_length, k_seq_length))) - attention_mask = 
attention_mask.astype("bool") - attn_scores = ivy.where(attention_mask, attn_scores, -ivy.inf) - # perform softmax + mask = ivy.triu(ivy.ones((num_queries, num_keys)), k=1) + attention_mask = ivy.where(mask, float("-inf"), 0) + elif ivy.is_bool_dtype(attention_mask): + attention_mask = ivy.where(attention_mask, float("-inf"), 0) + if attention_mask.ndim == 2: + attention_mask = ivy.tile(attention_mask, (num_batches * num_heads, 1, 1)) + if key_padding_mask is not None: + assert ivy.is_bool_dtype(key_padding_mask), ( + "was expecting key_padding_mask of type bool, but got" + f" {key_padding_mask.dtype}" + ) + key_padding_mask = ivy.where(key_padding_mask, float("-inf"), 0) + if num_dims == 2: + key_padding_mask = ivy.expand_dims(key_padding_mask, axis=0) + key_padding_mask = ivy.tile( + key_padding_mask, (num_batches * num_heads, num_queries, 1) + ) + if attention_mask is None: + attention_mask = key_padding_mask + else: + attention_mask += key_padding_mask + if ivy.exists(attention_mask): + if bias_k is not None and bias_v is not None and not is_causal: + attention_mask = ivy.pad(attention_mask, [(0, 0), (0, 0), (0, 1)]) + if add_zero_attn and not is_causal: + attention_mask = ivy.pad(attention_mask, [(0, 0), (0, 0), (0, 1)]) + attn_scores += attention_mask.astype(query.dtype) + + # get attention weights attn_weights = ivy.softmax(attn_scores, axis=-1) - # perform dropout attn_weights = ivy.dropout(attn_weights, dropout, training=training) - # bmm with values + + # get attention output attention_out = ivy.matmul(attn_weights, v) - attention_out = attention_out.permute_dims((0, 2, 1, 3)).reshape( - (batch_size, q_seq_length, -1) + attention_out = ivy.swapaxes(attention_out, 0, 1).reshape( + (num_batches, num_queries, emb_dim) ) - # proj out if out_proj_weight exists if ivy.exists(out_proj_weights): attention_out = ivy.linear(attention_out, out_proj_weights, bias=out_proj_bias) - # if input was unbatched, unbatchify the output + if num_dims == 2: attention_out = attention_out.squeeze(axis=0) + elif not batch_first: + attention_out = attention_out.swapaxes(0, 1) if return_attention_weights: + attn_weights = attn_weights.reshape( + (num_batches, num_heads, num_queries, num_keys) + ) if average_attention_weights: attn_weights = attn_weights.mean(axis=1) - if num_dims == 2: - attn_weights = attn_weights.squeeze(axis=0) + if num_dims == 2: + attn_weights = attn_weights.squeeze(axis=0) return attention_out, attn_weights else: return attention_out +multi_head_attention.mixed_backend_wrappers = { + "to_add": ( + "handle_backend_invalid", + "handle_out_argument", + "inputs_to_native_arrays", + "outputs_to_ivy_arrays", + "handle_device_shifting", + ), + "to_skip": ("inputs_to_ivy_arrays", "handle_partial_mixed_function"), +} + + # Convolutions # diff --git a/ivy_tests/test_ivy/test_frontends/test_torch/test_nn/test_functional/test_non_linear_activation_functions.py b/ivy_tests/test_ivy/test_frontends/test_torch/test_nn/test_functional/test_non_linear_activation_functions.py index 4c4614c137fc3..010ced1773e0f 100644 --- a/ivy_tests/test_ivy/test_frontends/test_torch/test_nn/test_functional/test_non_linear_activation_functions.py +++ b/ivy_tests/test_ivy/test_frontends/test_torch/test_nn/test_functional/test_non_linear_activation_functions.py @@ -1,11 +1,12 @@ # global import ivy from hypothesis import assume, strategies as st -import random # local import ivy_tests.test_ivy.helpers as helpers +from ivy.functional.backends.torch.layers import _get_embed_dim from ivy_tests.test_ivy.helpers import 
handle_frontend_test +from ivy_tests.test_ivy.test_functional.test_nn.test_layers import _mha_helper # --- Helpers --- # @@ -97,170 +98,6 @@ def _x_and_scaled_attention(draw, dtypes): return dtype, query, key, value, mask -@st.composite -def mha_forward_args(draw, dtypes): - dtype = draw(dtypes) - embed_dim = draw(helpers.ints(min_value=2, max_value=4)) - batch_size = draw(helpers.ints(min_value=1, max_value=2)) * 3 - seq_len = draw(helpers.ints(min_value=2, max_value=4)) - shape = ( - seq_len, - batch_size, - embed_dim, - ) - - heads = draw(helpers.ints(min_value=1, max_value=4)) - head_dim = embed_dim // heads - if head_dim * heads != embed_dim: - heads = 1 - head_dim = embed_dim - - if dtype[0] == "float32": - is_causal = False - else: - is_causal = draw(helpers.array_bools(size=1))[0] - - q = draw( - helpers.array_values(dtype=dtype[0], shape=shape, min_value=0.1, max_value=1) - ) - k = draw( - helpers.array_values(dtype=dtype[0], shape=shape, min_value=0.1, max_value=1) - ) - v = draw( - helpers.array_values(dtype=dtype[0], shape=shape, min_value=0.1, max_value=1) - ) - in_proj_weight = draw( - helpers.array_values( - dtype=dtype[0], - min_value=0.1, - max_value=1, - shape=(embed_dim * 3, embed_dim), - ) - ) - in_proj_bias = draw( - helpers.array_values( - dtype=dtype[0], - min_value=0.1, - max_value=1, - shape=(embed_dim * 3,), - ) - ) - - if random.randint(0, 1) == 0: - use_separate_proj_weight = True - q_proj_weight = draw( - helpers.array_values( - dtype=dtype[0], - min_value=0.1, - max_value=1, - shape=(embed_dim, embed_dim), - ) - ) - k_proj_weight = draw( - helpers.array_values( - dtype=dtype[0], - min_value=0.1, - max_value=1, - shape=(embed_dim, embed_dim), - ) - ) - v_proj_weight = draw( - helpers.array_values( - dtype=dtype[0], - min_value=0.1, - max_value=1, - shape=(embed_dim, embed_dim), - ) - ) - else: - use_separate_proj_weight = False - q_proj_weight = None - k_proj_weight = None - v_proj_weight = None - - out_proj_weight = draw( - helpers.array_values( - dtype=dtype[0], - min_value=0.1, - max_value=1, - shape=(embed_dim, embed_dim), - ) - ) - out_proj_bias = draw( - helpers.array_values( - dtype=dtype[0], - min_value=0.1, - max_value=1, - shape=(embed_dim,), - ) - ) - bias_k = random.choice( - [ - draw( - helpers.array_values( - dtype=dtype[0], - min_value=0.1, - max_value=1, - shape=(embed_dim,), - ) - ), - None, - ] - ) - bias_v = bias_k - - if bias_k is None: - static_k = random.choice( - [ - draw( - helpers.array_values( - dtype=dtype[0], - min_value=0.1, - max_value=1, - shape=(batch_size * heads, seq_len, head_dim), - ) - ), - None, - ] - ) - static_v = static_k - else: - static_k = None - static_v = None - - attn_mask = ivy.ones((seq_len, seq_len), dtype=dtype[0]) - key_padding_mask = random.choice( - [ - ivy.random_normal(shape=(seq_len, seq_len), dtype=dtype[0]) > 0, - None, - ] - ) - - return ( - dtype, - q, - k, - v, - heads, - use_separate_proj_weight, - embed_dim, - in_proj_weight, - in_proj_bias, - out_proj_weight, - out_proj_bias, - q_proj_weight, - k_proj_weight, - v_proj_weight, - bias_k, - bias_v, - static_k, - static_v, - attn_mask, - key_padding_mask, - is_causal, - ) - - # --- Main --- # # ------------ # @@ -852,14 +689,11 @@ def test_torch_mish( # multi_head_attention_forward @handle_frontend_test( fn_tree="torch.nn.functional.multi_head_attention_forward", - dtype_mha_args=mha_forward_args( - dtypes=helpers.get_dtypes("valid"), + dtype_mha_args=_mha_helper(same_pre_embed_dim=True, batch_second=True).filter( + lambda args: args[10] is not None + 
and (not args[22] or args[5] is not None) + and len(set(_get_embed_dim(*args[6:10], args[1]))) == 1 ), - add_zero_attn=st.just(False), - dropout_p=st.sampled_from([0.0, 0.1, 0.2]), - training=st.booleans(), - need_weights=st.booleans(), - average_attn_weights=st.booleans(), test_with_out=st.just(False), ) def test_torch_multi_head_attention_forward( @@ -869,11 +703,6 @@ def test_torch_multi_head_attention_forward( frontend, test_flags, dtype_mha_args, - add_zero_attn, - dropout_p, - training, - need_weights, - average_attn_weights, backend_fw, ): ( @@ -882,57 +711,69 @@ def test_torch_multi_head_attention_forward( k, v, heads, - use_separate_proj_weight, - embed_dim, + attn_mask, in_proj_weight, - in_proj_bias, - out_proj_weight, - out_proj_bias, q_proj_weight, k_proj_weight, v_proj_weight, + out_proj_weight, + in_proj_bias, + out_proj_bias, + key_padding_mask, bias_k, bias_v, static_k, static_v, - attn_mask, - key_padding_mask, + _, + add_zero_attn, + dropout_p, + training, is_causal, + need_weights, + average_attn_weights, + batch_first, ) = dtype_mha_args - + if k is None and v is None: + k = v = q + # re-order the dtypes to match the order of the frontend arguments, not the order + # of ivy.multi_head_attention's arguments given by _mha_helper + kwargs = { + "query": q, + "key": k, + "value": v, + "embed_dim_to_check": q.shape[-1], + "num_heads": heads, + "in_proj_weight": in_proj_weight, + "in_proj_bias": in_proj_bias, + "bias_k": bias_k, + "bias_v": bias_v, + "add_zero_attn": add_zero_attn, + "dropout_p": dropout_p, + "out_proj_weight": out_proj_weight, + "out_proj_bias": out_proj_bias, + "training": training, + "key_padding_mask": key_padding_mask, + "need_weights": need_weights, + "attn_mask": attn_mask, + "use_separate_proj_weight": in_proj_weight is None, + "q_proj_weight": q_proj_weight, + "k_proj_weight": k_proj_weight, + "v_proj_weight": v_proj_weight, + "static_k": static_k, + "static_v": static_v, + "average_attn_weights": average_attn_weights, + "is_causal": is_causal, + } helpers.test_frontend_function( - input_dtypes=dtype, + input_dtypes=[str(r.dtype) for r in kwargs.values() if ivy.is_array(r)], backend_to_test=backend_fw, frontend=frontend, test_flags=test_flags, fn_tree=fn_tree, + atol=1e-03, on_device=on_device, test_values=not training or dropout_p == 0.0, - query=q, - key=k, - value=v, - embed_dim_to_check=embed_dim, - num_heads=heads, - in_proj_weight=in_proj_weight, - in_proj_bias=in_proj_bias, - bias_k=bias_k, - bias_v=bias_v, - add_zero_attn=add_zero_attn, - dropout_p=dropout_p, - out_proj_weight=out_proj_weight, - out_proj_bias=out_proj_bias, - training=training, - key_padding_mask=key_padding_mask, - need_weights=need_weights, - attn_mask=attn_mask, - use_separate_proj_weight=use_separate_proj_weight, - q_proj_weight=q_proj_weight, - k_proj_weight=k_proj_weight, - v_proj_weight=v_proj_weight, - static_k=static_k, - static_v=static_v, - average_attn_weights=average_attn_weights, - is_causal=is_causal, + **kwargs, ) diff --git a/ivy_tests/test_ivy/test_functional/test_nn/test_layers.py b/ivy_tests/test_ivy/test_functional/test_nn/test_layers.py index cb1bd5af5df55..d68219a6323e3 100644 --- a/ivy_tests/test_ivy/test_functional/test_nn/test_layers.py +++ b/ivy_tests/test_ivy/test_functional/test_nn/test_layers.py @@ -68,56 +68,55 @@ def _dropout_helper(draw): @st.composite -def _mha_helper(draw): +def _mha_helper(draw, same_pre_embed_dim=False, batch_second=False): _qkv_same_dim = draw(st.booleans()) _self_attention = draw(st.booleans()) + _same_pre_embed_dim = 
_self_attention or same_pre_embed_dim or draw(st.booleans()) + batch_first = draw(st.booleans()) and not batch_second num_heads = draw(helpers.ints(min_value=1, max_value=3)) _embed_dim = draw(helpers.ints(min_value=4, max_value=16)) * num_heads + _batch_dim = draw(st.sampled_from([(), (1,)])) + _num_batches = _batch_dim[0] if len(_batch_dim) else 1 + dtype = draw(helpers.get_dtypes("valid", full=False)) _num_queries = draw(helpers.ints(min_value=2, max_value=8)) _num_keys = draw(helpers.ints(min_value=2, max_value=8)) - _batch_dim = draw(st.sampled_from([(), (1,)])) - dtype = draw(helpers.get_dtypes("float", full=False, prune_function=False)) - in_proj_bias = None in_proj_weights = None q_proj_weights = None k_proj_weights = None v_proj_weights = None - _mask_shape = ( - _num_queries, - _num_queries if _self_attention and _qkv_same_dim else _num_keys, - ) - if _qkv_same_dim: - _pre_embed_dim = draw(helpers.ints(min_value=4, max_value=16)) - _q_shape = _batch_dim + (_num_queries, _pre_embed_dim) - _kv_shape = _batch_dim + (_num_keys, _pre_embed_dim) + if _qkv_same_dim: + if _same_pre_embed_dim: + _pre_embed_dim = _embed_dim + else: + _pre_embed_dim = draw(helpers.ints(min_value=4, max_value=16)) q = draw( helpers.array_values( - shape=_q_shape, + shape=(*_batch_dim, _num_queries, _pre_embed_dim), dtype=dtype[0], - large_abs_safety_factor=7, - small_abs_safety_factor=7, - safety_factor_scale="linear", + max_value=1000, + min_value=-1000, + abs_smallest_val=1e-06, ) ) k = draw( helpers.array_values( - shape=_kv_shape, + shape=(*_batch_dim, _num_keys, _pre_embed_dim), dtype=dtype[0], - large_abs_safety_factor=7, - small_abs_safety_factor=7, - safety_factor_scale="linear", + max_value=1000, + min_value=-1000, + abs_smallest_val=1e-06, ) if not _self_attention else st.none() ) v = draw( helpers.array_values( - shape=_kv_shape, + shape=(*_batch_dim, _num_keys, _pre_embed_dim), dtype=dtype[0], - large_abs_safety_factor=7, - small_abs_safety_factor=7, - safety_factor_scale="linear", + max_value=1000, + min_value=-1000, + abs_smallest_val=1e-06, ) if not _self_attention else st.none() @@ -126,102 +125,191 @@ def _mha_helper(draw): helpers.array_values( dtype=dtype[0], shape=(3 * _embed_dim, _pre_embed_dim), - min_value=0, + min_value=-10, max_value=10, ) - if _pre_embed_dim != _embed_dim + if not _same_pre_embed_dim or draw(st.booleans()) else st.none() ) else: - _q_dim = draw(helpers.ints(min_value=2, max_value=8)) + if not same_pre_embed_dim: + _q_dim = draw(helpers.ints(min_value=2, max_value=8)) + else: + _q_dim = _embed_dim _k_dim = draw(helpers.ints(min_value=2, max_value=8)) _v_dim = draw(helpers.ints(min_value=2, max_value=8)) - _q_shape = _batch_dim + (_num_queries, _q_dim) - _k_shape = _batch_dim + (_num_keys, _k_dim) - _v_shape = _batch_dim + (_num_keys, _v_dim) q = draw( helpers.array_values( - shape=_q_shape, + shape=(*_batch_dim, _num_queries, _q_dim), dtype=dtype[0], - large_abs_safety_factor=7, - small_abs_safety_factor=7, - safety_factor_scale="linear", + max_value=1000, + min_value=-1000, + abs_smallest_val=1e-06, ) ) k = draw( helpers.array_values( - shape=_k_shape, + shape=(*_batch_dim, _num_keys, _k_dim), dtype=dtype[0], - large_abs_safety_factor=7, - small_abs_safety_factor=7, - safety_factor_scale="linear", + max_value=1000, + min_value=-1000, + abs_smallest_val=1e-06, ) ) v = draw( helpers.array_values( - shape=_v_shape, + shape=(*_batch_dim, _num_keys, _v_dim), dtype=dtype[0], - large_abs_safety_factor=7, - small_abs_safety_factor=7, - safety_factor_scale="linear", + 
max_value=1000, + min_value=-1000, + abs_smallest_val=1e-06, ) ) q_proj_weights = draw( helpers.array_values( dtype=dtype[0], shape=(_embed_dim, _q_dim), - min_value=0, - max_value=2, + min_value=-5, + max_value=5, ) ) k_proj_weights = draw( helpers.array_values( dtype=dtype[0], shape=(_embed_dim, _k_dim), - min_value=0, - max_value=2, + min_value=-5, + max_value=5, ) ) v_proj_weights = draw( helpers.array_values( dtype=dtype[0], shape=(_embed_dim, _v_dim), - min_value=0, - max_value=2, + min_value=-5, + max_value=5, ) ) - in_proj_bias = draw( - helpers.array_values( - dtype=dtype[0], shape=(3 * _embed_dim), min_value=0, max_value=10 + st.one_of( + helpers.array_values( + dtype=dtype[0], + shape=(3 * _embed_dim,), + min_value=-10, + max_value=10, + ), + st.none(), ) - | st.none() ) + _out_dim = draw(helpers.ints(min_value=4, max_value=16)) out_proj_weights = draw( - helpers.array_values( - dtype=dtype[0], - shape=(_out_dim, _embed_dim), - min_value=0, - max_value=2, + st.one_of( + helpers.array_values( + dtype=dtype[0], + shape=(_out_dim, _embed_dim), + min_value=-5, + max_value=5, + ), + st.none(), ) - | st.none() ) out_proj_bias = draw( - helpers.array_values( - dtype=dtype[0], shape=(_out_dim), min_value=0, max_value=10 + st.one_of( + helpers.array_values( + dtype=dtype[0], + shape=(_out_dim,), + min_value=-10, + max_value=10, + ), + st.none(), + ) + ) + + if _self_attention and _qkv_same_dim: + _num_keys = _num_queries + _static_shape = (_num_batches * num_heads, _num_keys, int(_embed_dim // num_heads)) + static_k = draw( + st.one_of( + helpers.array_values( + shape=_static_shape, + dtype=dtype[0], + max_value=1000, + min_value=-1000, + abs_smallest_val=1e-06, + ), + st.none(), + ) + ) + static_v = draw( + st.one_of( + helpers.array_values( + shape=_static_shape, + dtype=dtype[0], + max_value=1000, + min_value=-1000, + abs_smallest_val=1e-06, + ), + st.none(), ) - | st.none() ) + + _mask_shape = (_num_queries, _num_keys) + if len(_batch_dim) and draw(st.booleans()): + _mask_shape = (_num_batches * num_heads, *_mask_shape) attention_mask = draw( + st.one_of( + helpers.array_values( + dtype=draw(st.sampled_from(["bool", dtype[0]])), + allow_inf=True, + shape=_mask_shape, + ), + st.none(), + ) + ) + + key_padding_mask = draw( + st.one_of( + helpers.array_values( + dtype="bool", + shape=(*_batch_dim, _num_keys), + ), + st.none(), + ) + ) + + _extra_bias = ( + (not _qkv_same_dim or _pre_embed_dim == _embed_dim) + and static_k is None + and static_v is None + and draw(st.booleans()) + ) + bias_k = draw( helpers.array_values( - dtype="bool", - shape=_mask_shape, + dtype=dtype[0], shape=(_embed_dim,), min_value=-10, max_value=10 ) - | st.none() + if _extra_bias + else st.none() ) - return ( - dtype, + bias_v = draw( + helpers.array_values( + dtype=dtype[0], shape=(_embed_dim,), min_value=-10, max_value=10 + ) + if _extra_bias + else st.none() + ) + + scale = draw(st.one_of(st.floats(min_value=0.001), st.none())) + add_zero_attn = draw(st.booleans()) + dropout = draw(st.floats(min_value=0, max_value=0.99)) + training = draw(st.booleans()) + is_causal = draw(st.booleans()) + return_attention_weights = draw(st.booleans()) + average_attention_weights = draw(st.booleans()) + + if len(q.shape) == 3 and not batch_first: + q, k, v = [np.swapaxes(x, 0, 1) if x is not None else x for x in [q, k, v]] + + ret = ( q, k, v, @@ -234,7 +322,22 @@ def _mha_helper(draw): out_proj_weights, in_proj_bias, out_proj_bias, + key_padding_mask, + bias_k, + bias_v, + static_k, + static_v, + scale, + add_zero_attn, + 
dropout, + training, + is_causal, + return_attention_weights, + average_attention_weights, + batch_first, ) + ret_dtypes = [str(r.dtype) for r in ret if ivy.is_array(r)] + return ret_dtypes, *ret @st.composite @@ -1274,23 +1377,14 @@ def test_lstm_update(*, dtype_lstm, test_flags, backend_fw, fn_name, on_device): @handle_test( fn_tree="functional.ivy.multi_head_attention", dtype_mha=_mha_helper(), - scale=st.one_of(st.floats(), st.none()), - dropout=st.floats(min_value=0, max_value=0.99), - training=st.just(False), # st.booleans(), disabled until proper testing is used - is_causal=st.booleans(), - return_attention_weights=st.booleans(), - average_attention_weights=st.booleans(), - ground_truth_backend="jax", + ground_truth_backend="numpy", + # ToDo: fix the gradients and the container methods + test_gradients=st.just(False), + container_flags=st.just([False]), ) def test_multi_head_attention( *, dtype_mha, - scale, - dropout, - training, - is_causal, - return_attention_weights, - average_attention_weights, test_flags, backend_fw, fn_name, @@ -1310,6 +1404,19 @@ def test_multi_head_attention( out_proj_weights, in_proj_bias, out_proj_bias, + key_padding_mask, + bias_k, + bias_v, + static_k, + static_v, + scale, + add_zero_attn, + dropout, + training, + is_causal, + return_attention_weights, + average_attention_weights, + batch_first, ) = dtype_mha helpers.test_function( input_dtypes=dtype, @@ -1317,11 +1424,13 @@ def test_multi_head_attention( backend_to_test=backend_fw, fn_name=fn_name, on_device=on_device, + test_values=(dropout == 0), atol_=1e-02, rtol_=1e-02, query=q, key=k, value=v, + batch_first=batch_first, num_heads=num_heads, scale=scale, attention_mask=attention_mask, @@ -1333,6 +1442,12 @@ def test_multi_head_attention( in_proj_bias=in_proj_bias, out_proj_bias=out_proj_bias, is_causal=is_causal, + key_padding_mask=key_padding_mask, + bias_k=bias_k, + bias_v=bias_v, + static_k=static_k, + static_v=static_v, + add_zero_attn=add_zero_attn, return_attention_weights=return_attention_weights, average_attention_weights=average_attention_weights, dropout=dropout,
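
A minimal usage sketch of the extended `ivy.multi_head_attention` signature introduced in this patch (batched, batch-first inputs with `key_padding_mask`, `bias_k`/`bias_v`, and returned attention weights). Shapes follow the updated docstring: `query` is `(N, L, E)`, `key`/`value` are `(N, S, E)`, `in_proj_weights` is `(3*E, E)`, `out_proj_weights` is `(O, E)`. All dimensions and tensor contents below are illustrative and not part of the diff.

```python
import ivy

ivy.set_backend("torch")

N, L, S, E, num_heads = 2, 5, 7, 8, 2  # batch, queries, keys, embed dim, heads

query = ivy.random_normal(shape=(N, L, E))
key = ivy.random_normal(shape=(N, S, E))
value = ivy.random_normal(shape=(N, S, E))

in_proj_weights = ivy.random_normal(shape=(3 * E, E))
out_proj_weights = ivy.random_normal(shape=(E, E))

# Boolean key padding mask of shape (N, S); True entries are excluded from
# attention, matching the torch convention mirrored by this implementation.
key_padding_mask = ivy.concat(
    [ivy.zeros((N, S - 1), dtype="bool"), ivy.ones((N, 1), dtype="bool")], axis=1
)

out, weights = ivy.multi_head_attention(
    query,
    key=key,
    value=value,
    batch_first=True,                        # inputs/outputs are (N, L, E) / (N, S, E)
    num_heads=num_heads,
    in_proj_weights=in_proj_weights,
    out_proj_weights=out_proj_weights,
    key_padding_mask=key_padding_mask,
    bias_k=ivy.random_normal(shape=(E,)),    # appended to the key sequence
    bias_v=ivy.random_normal(shape=(E,)),    # appended to the value sequence
    add_zero_attn=False,
    return_attention_weights=True,
    average_attention_weights=True,
)
# out: (N, L, E); weights: (N, L, S + 1) here, since bias_k / bias_v append one
# extra key position (per the implementation above).
```

With the torch backend active, this call satisfies the new `partial_mixed_handler` condition (no custom `scale`, `out_proj_weights` present, matching embedding dimensions), so it is routed to `torch.nn.functional.multi_head_attention_forward`; otherwise the compositional ivy implementation is used.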
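
For reference, a short sketch of the sequence-first layout that the rewritten torch frontend `multi_head_attention_forward` now relies on (`batch_first=False`), reusing the same toy dimensions; again, the values are illustrative only.

```python
import ivy

ivy.set_backend("torch")

N, L, S, E = 2, 5, 7, 8

# Sequence-first inputs, as torch.nn.functional.multi_head_attention_forward
# expects: query (L, N, E), key/value (S, N, E). The frontend wrapper in this
# diff forwards them to ivy.multi_head_attention with batch_first=False.
query = ivy.random_normal(shape=(L, N, E))
key = ivy.random_normal(shape=(S, N, E))
value = ivy.random_normal(shape=(S, N, E))

out = ivy.multi_head_attention(
    query,
    key=key,
    value=value,
    batch_first=False,
    num_heads=2,
    in_proj_weights=ivy.random_normal(shape=(3 * E, E)),
    out_proj_weights=ivy.random_normal(shape=(E, E)),
)
# The output keeps the input layout: (L, N, E).
```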