103 changes: 103 additions & 0 deletions src/ATen/native/sparse/xpu/SparseTensorMath.cpp
@@ -1,5 +1,15 @@
#include <ATen/native/sparse/xpu/sycl/SparseTensorMathKernels.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/addmm.h>
#include <ATen/ops/matmul.h>
#endif

#include <ATen/ExpandUtils.h>

namespace at::native {

using namespace at::sparse;
@@ -26,4 +36,97 @@ Tensor _sparse_sum_backward_xpu(
return xpu::_sparse_sum_backward_kernel(grad_, input_, dims_to_sum);
}

Tensor& s_addmm_out_sparse_dense_xpu(Tensor& r_, const Tensor& t, const SparseTensor& sparse_, const Tensor& dense, const Scalar& beta, const Scalar& alpha) {
TORCH_CHECK(t.is_xpu(), "Expected all tensors to be on the same device. addmm: expected 'self' to be XPU, but got CPU");
TORCH_CHECK(r_.is_xpu(), "Expected all tensors to be on the same device. addmm: expected 'out' to be XPU, but got CPU");
TORCH_CHECK(sparse_.is_xpu(), "Expected all tensors to be on the same device. addmm: expected 'mat1' to be XPU, but got CPU");
TORCH_CHECK(dense.is_xpu(), "Expected all tensors to be on the same device. addmm: expected 'mat2' to be XPU, but got CPU");

// TORCH_CHECK(xpu::check_device({sparse_, r_, t, dense}));
Contributor:
xpu::check_device can be found in sparse/xpu/sycl/SparseTensorMathKernels.cpp.

Copilot AI (Nov 26, 2025):
This commented-out device check should either be implemented or removed. If device validation is needed beyond the individual tensor checks above, uncomment it and ensure the function exists; otherwise, remove the dead code.

Suggested change:
-    // TORCH_CHECK(xpu::check_device({sparse_, r_, t, dense}));
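For reference, a minimal sketch of a device-consistency helper in the spirit of the commented-out check; the actual xpu::check_device in sparse/xpu/sycl/SparseTensorMathKernels.cpp may differ in name, signature, and behavior, and the helper name below is hypothetical:

#include <ATen/core/Tensor.h>
#include <c10/util/ArrayRef.h>

namespace {
// Returns true only if every tensor is an XPU tensor and all share one device index.
bool all_same_xpu_device(at::ArrayRef<at::Tensor> tensors) {
  if (tensors.empty()) {
    return true;
  }
  const auto device_index = tensors.front().get_device();
  for (const auto& t : tensors) {
    if (!t.is_xpu() || t.get_device() != device_index) {
      return false;
    }
  }
  return true;
}
} // namespace

// Usage mirroring the commented-out line above (helper name is hypothetical):
// TORCH_CHECK(all_same_xpu_device({sparse_, r_, t, dense}),
//             "addmm: expected all tensors to be on the same XPU device");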

TORCH_CHECK(dense.dim() == 2, "addmm: 2D tensor expected, got ", dense.dim(), "D tensor");
TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: expected first two dims to be sparse (indices has size 2 at first dim), but got ", sparse_.sparse_dim(), " sparse dims");
// no need to check dense_dim because dense_dim + sparse_dim = dim

Tensor mat1_dense = sparse_._to_dense(std::nullopt, std::nullopt);
Contributor:
Will Tensor mat1_dense = sparse_._to_dense() cause undefined behavior? I noticed that dtype and masked are optional arguments for sparse_to_dense.
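If I read the public API correctly, to_dense() takes optional dtype and masked_grad arguments that default to nullopt, so the explicit-nullopt call above should behave the same as the defaulted form; a minimal sketch under that assumption:

// Assumption: both calls resolve to the same conversion; the dtype stays the sparse
// tensor's dtype and masked_grad keeps its default.
at::Tensor a = sparse_.to_dense();                             // defaulted optionals
at::Tensor b = sparse_.to_dense(std::nullopt, std::nullopt);   // explicit optionals
// Under the stated assumption, a and b agree in dtype, shape, and values.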

at::addmm_out(r_, t, mat1_dense, dense, beta, alpha);
Comment on lines +51 to +52

Copilot AI (Nov 26, 2025):
Converting the sparse tensor to dense format defeats the purpose of sparse operations and may cause significant memory overhead for large sparse tensors. Consider implementing a proper sparse matrix multiplication kernel instead of this fallback approach.

Suggested change:
-    Tensor mat1_dense = sparse_._to_dense(std::nullopt, std::nullopt);
-    at::addmm_out(r_, t, mat1_dense, dense, beta, alpha);
+    // Use a proper sparse matrix multiplication kernel for XPU
+    xpu::addmm_out_sparse_dense_kernel(r_, t, sparse_, dense, beta, alpha);
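For a sense of what a dedicated kernel would compute without densifying mat1, here is a hedged, standalone reference loop over COO triples (plain C++ over flat arrays, not the torch-xpu-ops kernel; the function name and row-major layout are assumptions): each stored (row, col, val) entry contributes alpha * val * dense[col, :] to out[row, :], on top of beta * t.

#include <cstdint>
#include <vector>

// Reference semantics of addmm with a COO sparse mat1 (m x k) and dense mat2 (k x n):
//   out = beta * t + alpha * (mat1 @ mat2), touching only the nnz stored entries of mat1.
void coo_addmm_reference(
    int64_t m, int64_t n,
    const std::vector<int64_t>& rows,   // COO row indices, length nnz
    const std::vector<int64_t>& cols,   // COO column indices, length nnz
    const std::vector<float>& vals,     // COO values, length nnz
    const std::vector<float>& t,        // m x n, row-major ("self")
    const std::vector<float>& dense,    // k x n, row-major ("mat2")
    float beta, float alpha,
    std::vector<float>& out) {          // m x n, row-major
  out.resize(m * n);
  for (int64_t i = 0; i < m * n; ++i) {
    out[i] = beta * t[i];
  }
  for (size_t e = 0; e < vals.size(); ++e) {
    const int64_t r = rows[e];
    const int64_t c = cols[e];
    for (int64_t j = 0; j < n; ++j) {
      out[r * n + j] += alpha * vals[e] * dense[c * n + j];
    }
  }
}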

return r_;
}

Tensor s_addmm_sparse_dense_xpu(
const Tensor& t,
const SparseTensor& sparse,
const Tensor& dense,
const Scalar& beta,
const Scalar& alpha
) {
Tensor r = at::empty({0}, t.options());
s_addmm_out_sparse_dense_xpu(r, t, sparse, dense, beta, alpha);
return r;
}


Tensor& addmm_out_sparse_dense_xpu(
const Tensor& self,
const SparseTensor& mat1,
const Tensor& mat2,
const Scalar& beta,
const Scalar& alpha,
Tensor& result
) {
c10::MaybeOwned<Tensor> b_self = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm_out");
return s_addmm_out_sparse_dense_xpu(result, *b_self, mat1, mat2, beta, alpha);
}

Tensor addmm_sparse_dense_xpu(
const Tensor& self,
const SparseTensor& mat1,
const Tensor& mat2,
const Scalar& beta,
const Scalar& alpha
) {
c10::MaybeOwned<Tensor> b_self = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm_out");
return s_addmm_sparse_dense_xpu(*b_self, mat1, mat2, beta, alpha);
}

Tensor& s_addmm_sparse_dense_xpu_(
Tensor& t,
const SparseTensor& sparse,
const Tensor& dense,
const Scalar& beta,
const Scalar& alpha
) {
return s_addmm_out_sparse_dense_xpu(t, t, sparse, dense, beta, alpha);
}

Tensor sparse_sparse_matmul_xpu(const Tensor& mat1_, const Tensor& mat2_) {
TORCH_INTERNAL_ASSERT(mat1_.is_sparse());
TORCH_INTERNAL_ASSERT(mat2_.is_sparse());
TORCH_CHECK(mat1_.dim() == 2);
TORCH_CHECK(mat2_.dim() == 2);
TORCH_CHECK(mat1_.dense_dim() == 0, "sparse_mm: scalar values expected, mat1 got ", mat1_.dense_dim(), "D values");
TORCH_CHECK(mat2_.dense_dim() == 0, "sparse_mm: scalar values expected, mat2 got ", mat2_.dense_dim(), "D values");

TORCH_CHECK(
mat1_.size(1) == mat2_.size(0), "mat1 and mat2 shapes cannot be multiplied (",
mat1_.size(0), "x", mat1_.size(1), " and ", mat2_.size(0), "x", mat2_.size(1), ")");

TORCH_CHECK(mat1_.scalar_type() == mat2_.scalar_type(),
"mat1 dtype ", mat1_.scalar_type(), " does not match mat2 dtype ", mat2_.scalar_type());

// convert to dense
Tensor mat1_dense = mat1_._to_dense(std::nullopt, std::nullopt);
Tensor mat2_dense = mat2_._to_dense(std::nullopt, std::nullopt);

Tensor output_dense = at::matmul(mat1_dense, mat2_dense);
// convert back to sparse
Tensor output_sparse = output_dense._to_sparse(mat1_.layout());

return output_sparse;

// auto output = at::native::empty_like(mat1_);
// output.sparse_resize_and_clear_({mat1_.size(0), mat2_.size(1)}, mat1_.sparse_dim(), 0);
}

} // namespace at::native
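The sparse_sparse_matmul_xpu fallback above follows the same densify/multiply/re-sparsify pattern; below is a hedged standalone sketch of that round trip written against the public ATen API (to_dense/to_sparse rather than the internal _to_dense/_to_sparse, which may differ in arguments):

#include <ATen/ATen.h>

// Round-trip fallback: densify both COO operands, multiply, convert the result back to COO.
// This trades memory (two dense temporaries plus a dense product) for implementation simplicity.
at::Tensor sparse_mm_via_dense(const at::Tensor& a, const at::Tensor& b) {
  at::Tensor a_dense = a.to_dense();
  at::Tensor b_dense = b.to_dense();
  at::Tensor out_dense = at::matmul(a_dense, b_dense);
  return out_dense.to_sparse();  // defaults to the sparse COO layout
}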
5 changes: 1 addition & 4 deletions test/xpu/test_sparse_xpu.py
@@ -2073,10 +2073,7 @@ def test_shape(di, dj, dk, nnz):
@precisionOverride({torch.bfloat16: 5e-2, torch.float16: 5e-2})
@dtypes(torch.double, torch.cdouble, torch.bfloat16, torch.float16)
@dtypesIfMPS(torch.float32, torch.complex64, torch.bfloat16, torch.float16)
@skipXPUIf(
True,
"addmm sprase xpu not supported yet, see https://github.com/intel/torch-xpu-ops/issues/2211",
)
@skipXPUIf(False, "https://github.com/intel/torch-xpu-ops/issues/2211")
def test_sparse_addmm(self, device, dtype, coalesced):
if (dtype is torch.bfloat16 or dtype is torch.float16) and device.startswith(
"cuda"
36 changes: 36 additions & 0 deletions yaml/native/native_functions.yaml
@@ -9463,3 +9463,39 @@
variants: function, method

- func: inverse.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

- func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
structured_delegate: addmm.out
variants: function, method
dispatch:
SparseXPU: addmm_sparse_dense_xpu

- func: addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
SparseXPU: addmm_out_sparse_dense_xpu

- func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
structured_delegate: addmm.out
variants: method
dispatch:
# Warning! For whatever reason, the inplace sparse addmm is NON
# broadcasting
SparseXPU: s_addmm_sparse_dense_xpu_

- func: mm(Tensor self, Tensor mat2) -> Tensor
structured_delegate: mm.out
variants: function, method
dispatch:
SparseXPU: _sparse_mm
tags: core

- func: mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
SparseXPU: _sparse_mm_out

- func: _sparse_sparse_matmul(Tensor self, Tensor other) -> Tensor
dispatch:
SparseXPU: sparse_sparse_matmul_xpu
autogen: _sparse_sparse_matmul.out
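With these SparseXPU dispatch entries, a call such as the following should route to the new addmm_sparse_dense_xpu path; a hedged usage sketch (assumes an XPU build and device are available and that mat1 is a COO sparse tensor on that device):

#include <ATen/ATen.h>

at::Tensor sparse_addmm_on_xpu() {
  auto opts = at::TensorOptions().device(at::kXPU).dtype(at::kFloat);
  at::Tensor self = at::zeros({3, 4}, opts);
  at::Tensor mat1 = at::eye(3, opts).to_sparse();   // SparseXPU COO operand
  at::Tensor mat2 = at::rand({3, 4}, opts);
  // Dispatch keys contributed by mat1 select the SparseXPU kernel registered above.
  return at::addmm(self, mat1, mat2, /*beta=*/1, /*alpha=*/1);
}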