@@ -204,7 +204,9 @@ inline static void InferLabelShape(
     const std::vector<std::string>& op_labels,
     const std::vector<DDim>& inputs,
     LabelMap* labelshape,
-    std::vector<std::vector<int64_t>>* broadcast_shapes) {
+    std::vector<std::vector<int64_t>>* broadcast_shapes,
+    LabelMap* labeltype) {
+  LabelMap labelshape_copy = *labelshape;
   VLOG(5) << "Start InferLabelShape";
   for (size_t i = 0; i < op_labels.size(); ++i) {
     auto& op_str = op_labels[i];
@@ -233,6 +235,20 @@ inline static void InferLabelShape(
   }
   for (size_t i = 0; i < op_labels.size(); ++i) {
     for (auto& c : op_labels[i]) {
+      // Note: When broadcasting is involved, ensure the gradient is calculated
+      // with respect to the broadcasted shape. For example, in
+      // einsum("ij,ij->j", x(2,2), y(1,2)), y is broadcast to (2,2). The
+      // gradient calculation for x must use this broadcasted shape of y.
+      if (labelshape_copy.exist(c) && labelshape_copy[c] > (*labelshape)[c]) {
+        // Strict check: the smaller extent must be 1 and the label AO/BO-typed.
+        PADDLE_ENFORCE_EQ(
+            (*labelshape)[c] == 1 && ((*labeltype)[c] == LabelType::AO ||
+                                      (*labeltype)[c] == LabelType::BO),
+            true,
+            common::errors::InvalidArgument(
+                "Broadcast dims must be 1 for label: `%c`", c));
+        (*labelshape)[c] = labelshape_copy[c];
+      }
       (*broadcast_shapes)[i].push_back((*labelshape)[c]);
     }
   }
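
Note: the added check implements the standard broadcasting rule per einsum label: when the extent inferred from the current operands is smaller than the extent recorded earlier, the smaller extent must be exactly 1, and the larger (broadcast) extent wins. A minimal standalone sketch of that rule, not Paddle's code: `ReconcileLabelExtent` is a hypothetical name, std::invalid_argument stands in for PADDLE_ENFORCE_EQ, and the AO/BO label-type restriction is omitted.

#include <cstdint>
#include <iostream>
#include <stdexcept>

// Reconcile the extent recorded for a label in the forward pass with the
// extent re-inferred from the backward pass's operands. A mismatch is legal
// only when the smaller extent is 1, i.e. the label was broadcast.
int64_t ReconcileLabelExtent(int64_t recorded, int64_t reinferred) {
  if (recorded > reinferred) {
    if (reinferred != 1) {
      throw std::invalid_argument("Broadcast dims must be 1");
    }
    return recorded;  // restore the broadcast extent
  }
  return reinferred;
}

int main() {
  // einsum("ij,ij->j", x(2,2), y(1,2)): label 'i' is 2 in x but 1 in y, so y
  // is broadcast to (2,2) and the gradient w.r.t. x must use extent 2.
  std::cout << ReconcileLabelExtent(2, 1) << "\n";  // prints 2 (legal)
  std::cout << ReconcileLabelExtent(2, 2) << "\n";  // prints 2 (no broadcast)
  return 0;
}
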
@@ -282,7 +298,7 @@ inline static void ParseEinsumEquation(
   // split_string("->") -> [], we push back a "".
   if (op_labels.empty()) op_labels.emplace_back("");
   GlobalInfo(op_labels, *right, labeltype, all_labels);
-  InferLabelShape(op_labels, inputs, labelshape, broadcast_shapes);
+  InferLabelShape(op_labels, inputs, labelshape, broadcast_shapes, labeltype);
   VLOG(5) << "Einsum Infershape: right:" << *right;
   VLOG(5) << "Einsum Infershape: left :"
           << paddle::string::join_strings(op_labels, '\n');
@@ -603,6 +619,7 @@ DenseTensor TransposeToOutput(const Context& dev_ctx,
 template <typename T, typename Context>
 void EinsumKernelImpl(const Context& dev_ctx,
                       const std::vector<char>& forward_all_labels,
+                      const LabelMap& forward_label_shape,
                       const std::vector<const DenseTensor*>& inputs,
                       const std::string& equation,
                       DenseTensor* out,
@@ -629,6 +646,7 @@ void EinsumKernelImpl(const Context& dev_ctx,
   std::string right;
   if (!is_forward) {
     all_labels = forward_all_labels;
+    labelshape = forward_label_shape;
   }
   ParseEinsumEquation(equation,
                       input_dims,
@@ -680,15 +698,22 @@ void EinsumKernel(const Context& dev_ctx,
     }
   }
   std::vector<char> tmp;
+  LabelMap labelshape_holder;
   // for the sake of compatibility, we may load and run v2.3 EinsumOp. Output
   // may have nullptr and the cache.size() is not equal to inputs.size(). refer
   // to BuildPhiKernelContext for details.
   int diff = inputs.size() - cache.size();
   for (int i = 0; i < diff; ++i) {
     cache.push_back(nullptr);
   }
-  EinsumKernelImpl<T, Context>(
-      dev_ctx, tmp, inputs, equation, out, cache, /*forward=*/true);
+  EinsumKernelImpl<T, Context>(dev_ctx,
+                               tmp,
+                               labelshape_holder,
+                               inputs,
+                               equation,
+                               out,
+                               cache,
+                               /*forward=*/true);
 }

 template <typename T, typename Context>
@@ -697,13 +722,20 @@ void EinsumInferKernel(const Context& dev_ctx,
                        const std::string& equation,
                        DenseTensor* out) {
   std::vector<char> place_holder;
+  LabelMap labelshape_holder;
   std::vector<DenseTensor*> cache_tensor(
       inputs.size());  // set empty; TA, TB, TdC
   for (size_t i = 0; i < inputs.size(); ++i) {
     cache_tensor[i] = nullptr;
   }
-  EinsumKernelImpl<T, Context>(
-      dev_ctx, place_holder, inputs, equation, out, cache_tensor, true);
+  EinsumKernelImpl<T, Context>(dev_ctx,
+                               place_holder,
+                               labelshape_holder,
+                               inputs,
+                               equation,
+                               out,
+                               cache_tensor,
+                               true);
 }

 }  // namespace phi