jax-ml · copybara-service · Mar 9, 2026 · Mar 5, 2026 · gemini-code-assist · Mar 5, 2026
diff --git a/jax/_src/pallas/primitives.py b/jax/_src/pallas/primitives.py
@@ -528,7 +528,12 @@ def dot(a, b, trans_a: bool = False, trans_b: bool = False,
     precision = lax.Precision.HIGH if allow_tf32 else lax.Precision.HIGHEST
 
   dtype = jnp.promote_types(_handle_small(a.dtype), _handle_small(b.dtype))
-  out_dtype = jnp.int32 if jnp.issubdtype(dtype, jnp.integer) else jnp.float32
+  if jnp.issubdtype(dtype, jnp.integer):
+    out_dtype = jnp.int32
+  elif dtype == jnp.float64:
+    out_dtype = jnp.float64
+  else:
+    out_dtype = jnp.float32
   return lax.dot_general(
       a,
       b,

diff --git a/jax/_src/pallas/triton/lowering.py b/jax/_src/pallas/triton/lowering.py
@@ -2305,26 +2305,46 @@ def _dot_general_lowering(
       input_precision = None
 
     acc_dtype = out_aval.dtype
-    if acc_dtype != jnp.int32 and acc_dtype != jnp.float16:
+    if acc_dtype not in (jnp.int32, jnp.float16, jnp.float64):
       acc_dtype = jnp.float32
   else:
     raise NotImplementedError(f"Unsupported dot precision: {precision}.")
 
   a_type = ir.RankedTensorType(a.type)
   b_type = ir.RankedTensorType(b.type)
-  if len(a_type.shape) != len(b_type.shape) != 2:
+  if len(a_type.shape) != 2 or len(b_type.shape) != 2:
     raise ValueError("a and b must be 2D, but got:"
                      f" {a_type.shape} and {b_type.shape}")
-  if min(*b_type.shape) < 16:
-    raise ValueError("all dimensions of b must be >= 16 ")
+
+  m, k = a_type.shape
+  _, n = b_type.shape
+  if a_type.element_type == ir.F64Type.get():
+    # Triton's MMAv2 fp64 path uses the m8n8k4 PTX instruction but aggregates
+    # it with NumRegisters={m:2, n:1, k:4}, producing an effective m16n8k16
+    # per-warp tile.  Blocks smaller than these minimums cause repM/repN/repK
+    # to round to zero, corrupting the ValueTable and segfaulting the compiler.
+    #   M >= 16  (2 × instrM=8)
+    #   N >=  8  (1 × instrN=8)
+    #   K >= 16  (4 × instrK=4)
+    errors = []
+    if m < 16:
+      errors.append(f"M={m} < 16")
+    if n < 8:
+      errors.append(f"N={n} < 8")
+    if k < 16:
+      errors.append(f"K={k} < 16")
+    if errors:
+      raise ValueError(
+          f"float64 dot requires M>=16, N>=8, K>=16 per warp tile "
+          f"(Triton MMAv2 m8n8k4 layout); got {', '.join(errors)}"
+      )
+
   if a_type.element_type != b_type.element_type:
     raise ValueError(
         "a and b must have the same element type, but got:"
         f" {a_type.element_type} and {b_type.element_type}"
     )
 
-  m, _ = a_type.shape
-  _, n = b_type.shape
   assert acc_dtype is not None
   acc = _zeros(ir.RankedTensorType.get([m, n], _dtype_to_ir_type(acc_dtype)))
 

diff --git a/tests/pallas/triton_pallas_test.py b/tests/pallas/triton_pallas_test.py
@@ -47,9 +47,19 @@ def setUp(self):
       if not self.INTERPRET:
         self.skipTest("On CPU the test works only in interpret mode")
     elif jtu.test_device_matches(["gpu"]):
+      is_sm80_test = any(
+          getattr(self, "_testMethodName", "").startswith(prefix)
+          for prefix in (
+              "test_dot_f32_small_dimensions",
+              "test_dot_fp64_valid_dimensions",
+              "test_dot_fp64_invalid_dimensions",
+          )
+      )
+      min_compute = "8.0" if is_sm80_test else "9.0"
+
       if (jtu.test_device_matches(["cuda"]) and
-          not jtu.is_cuda_compute_capability_at_least("9.0")):
-        self.skipTest("Only works on GPU with capability >= sm90")
+          not jtu.is_cuda_compute_capability_at_least(min_compute)):
+        self.skipTest(f"Only works on GPU with capability >= sm{min_compute.replace('.', '')}")
       if plgpu is None:
         self.skipTest("plgpu not available on this platform")
     else:
@@ -485,6 +495,70 @@ def dot_kernel(x_ref, y_ref, o_ref):
                                 "Unsigned integer dtype.*not supported"):
       dot_kernel(x, y)
 
+  def test_dot_f32_small_dimensions(self):
+    """A float32 dot with M = N = 8 runs and matches the expected result."""
+    rows, inner, cols = 8, 16, 8
+
+    @functools.partial(
+        self.pallas_call,
+        out_shape=jax.ShapeDtypeStruct((rows, cols), jnp.float32),
+        compiler_params=plgpu.CompilerParams(num_warps=1),
+    )
+    def dot_kernel(lhs_ref, rhs_ref, out_ref):
+      out_ref[()] = pl.dot(lhs_ref[()], rhs_ref[()])
+
+    lhs = jnp.ones((rows, inner), dtype=jnp.float32)
+    rhs = jnp.ones((inner, cols), dtype=jnp.float32)
+    expected = jnp.full((rows, cols), inner, dtype=jnp.float32)
+    np.testing.assert_allclose(dot_kernel(lhs, rhs), expected)
+
+  def test_dot_fp64_valid_dimensions(self):
+    """A float64 dot at the minimum accepted tile (16x16 @ 16x8) is correct."""
+    if not jax.config.jax_enable_x64:
+      self.skipTest("x64 is disabled")
+
+    m, k, n = 16, 16, 8
+    dtype = jnp.float64
+
+    @functools.partial(
+        self.pallas_call,
+        out_shape=jax.ShapeDtypeStruct((m, n), dtype),
+        compiler_params=plgpu.CompilerParams(num_warps=1),
+    )
+    def dot_kernel(x_ref, y_ref, o_ref):
+      o_ref[()] = pl.dot(x_ref[()], y_ref[()])
+
+    x = jnp.arange(m * k).reshape(m, k).astype(dtype)
+    y = jnp.arange(k * n).reshape(k, n).astype(dtype)
+    expected = jnp.dot(x, y, precision=lax.Precision.HIGHEST)
+    np.testing.assert_allclose(dot_kernel(x, y), expected, atol=1e-5, rtol=1e-5)
+
+  def test_dot_fp64_invalid_dimensions(self):
+    """Each float64 tile dimension below its minimum is rejected by name."""
+    if not jax.config.jax_enable_x64:
+      self.skipTest("x64 is disabled")
+
+    undersized_cases = (
+        (8, 16, 16, "M=8 < 16"),
+        (16, 16, 4, "N=4 < 8"),
+        (16, 8, 16, "K=8 < 16"),
+    )
+    for m, k, n, err_msg in undersized_cases:
+      with self.subTest(f"m={m},k={k},n={n}"):
+        lhs = jnp.arange(m * k).reshape(m, k).astype(jnp.float64)
+        rhs = jnp.arange(k * n).reshape(k, n).astype(jnp.float64)
+
+        @functools.partial(
+            self.pallas_call,
+            out_shape=jax.ShapeDtypeStruct((m, n), jnp.float64),
+            compiler_params=plgpu.CompilerParams(num_warps=1),
+        )
+        def dot_kernel(x_ref, y_ref, o_ref):
+          o_ref[()] = pl.dot(x_ref[()], y_ref[()])
+
+        with self.assertRaisesRegex(ValueError, err_msg):
+          dot_kernel(lhs, rhs)
+
 
 @functools.partial(
     jax.jit, static_argnames=["bm", "bn", "gm", "bk", "interpret", "debug"]