[WIP] Sparse addmm on SparseXPU #2409
base: main
Changes from 15 commits
```diff
@@ -1,5 +1,14 @@
 #include <ATen/native/sparse/xpu/sycl/SparseTensorMathKernels.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/addmm.h>
+#endif
+
+#include <ATen/ExpandUtils.h>

 namespace at::native {

 using namespace at::sparse;

@@ -26,4 +35,68 @@ Tensor _sparse_sum_backward_xpu(
   return xpu::_sparse_sum_backward_kernel(grad_, input_, dims_to_sum);
 }

+Tensor& s_addmm_out_sparse_dense_xpu(Tensor& r_, const Tensor& t, const SparseTensor& sparse_, const Tensor& dense, const Scalar& beta, const Scalar& alpha) {
+  TORCH_CHECK(t.is_xpu(), "Expected all tensors to be on the same device. addmm: expected 'self' to be XPU, but got CPU");
+  TORCH_CHECK(r_.is_xpu(), "Expected all tensors to be on the same device. addmm: expected 'out' to be XPU, but got CPU");
+  TORCH_CHECK(sparse_.is_xpu(), "Expected all tensors to be on the same device. addmm: expected 'mat1' to be XPU, but got CPU");
+  TORCH_CHECK(dense.is_xpu(), "Expected all tensors to be on the same device. addmm: expected 'mat2' to be XPU, but got CPU");
+
+  // TORCH_CHECK(xpu::check_device({sparse_, r_, t, dense}));
```
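For context, the operation this PR implements follows the usual addmm contract, out = beta * t + alpha * (mat1 @ mat2), where mat1 is sparse and mat2 is dense. As an illustration only (plain Python, hypothetical function name, no ATen), the same result can be computed directly from the nonzeros of a COO-format mat1 without ever densifying it:

```python
def sparse_dense_addmm(t, indices, values, dense, beta=1.0, alpha=1.0):
    """Compute beta*t + alpha*(mat1 @ dense), where mat1 is given in COO
    form: `indices` is a list of (row, col) pairs and `values` holds the
    corresponding nonzeros. `t` is m x p, `dense` is k x p (lists of lists).
    """
    m, p = len(t), len(t[0])
    # Start from the scaled bias term beta * t.
    out = [[beta * t[i][j] for j in range(p)] for i in range(m)]
    # Each stored nonzero mat1[r][c] contributes alpha * v * dense[c] to row r.
    for (r, c), v in zip(indices, values):
        for j in range(p):
            out[r][j] += alpha * v * dense[c][j]
    return out
```

The work here is proportional to nnz * p rather than m * k * p, which is the efficiency a dedicated sparse kernel would preserve.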
Will `Tensor mat1_dense = sparse_._to_dense()` cause undefined behavior? I noticed that `dtype` and `masked` are optional for `sparse_to_dense`.
Copilot (AI), Nov 26, 2025:
Converting the sparse tensor to dense format defeats the purpose of sparse operations and may cause significant memory overhead for large sparse tensors. Consider implementing a proper sparse matrix multiplication kernel instead of this fallback approach.
Suggested change:

```diff
-  Tensor mat1_dense = sparse_._to_dense(std::nullopt, std::nullopt);
-  at::addmm_out(r_, t, mat1_dense, dense, beta, alpha);
+  // Use a proper sparse matrix multiplication kernel for XPU
+  xpu::addmm_out_sparse_dense_kernel(r_, t, sparse_, dense, beta, alpha);
```
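To make the memory-overhead concern concrete: densifying an n x n sparse matrix materializes all n * n entries regardless of sparsity, whereas COO storage scales with the number of nonzeros. A rough back-of-the-envelope sketch (assumed sizes: 4-byte fp32 values, 8-byte int64 indices, both assumptions, not measured from this PR):

```python
def densify_bytes(n, dtype_bytes=4):
    # Memory for the fully materialized n x n dense matrix.
    return n * n * dtype_bytes

def coo_bytes(nnz, dtype_bytes=4, index_bytes=8):
    # COO storage: one value plus a (row, col) index pair per nonzero.
    return nnz * (dtype_bytes + 2 * index_bytes)
```

For example, a 100,000 x 100,000 matrix with a million nonzeros needs about 40 GB once densified but only about 20 MB in COO form, which is why the fallback can fail for inputs that a direct kernel would handle easily.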
`xpu::check_device` can be found in `sparse/xpu/sycl/SparseTensorMathKernels.cpp`.