From 5e643ca9fe0b99221486d0012ab84a89c3105038 Mon Sep 17 00:00:00 2001 From: skrBang Date: Wed, 24 Nov 2021 18:22:37 +0800 Subject: [PATCH 01/23] add hinge_embedding_loss --- python/paddle/fluid/layers/loss.py | 58 ++- .../unittests/test_hinge_embedding_loss.py | 366 ++++++++++++++++++ python/paddle/nn/functional/__init__.py | 2 + python/paddle/nn/functional/loss.py | 35 ++ python/paddle/nn/layer/__init__.py | 1 + python/paddle/nn/layer/loss.py | 20 + 6 files changed, 464 insertions(+), 18 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 3db4a894d1a07..4572910224c27 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -31,24 +31,12 @@ from paddle import _C_ops __all__ = [ - 'center_loss', - 'bpr_loss', - 'cross_entropy', - 'square_error_cost', - 'edit_distance', - 'warpctc', - 'nce', - 'hsigmoid', - 'sampled_softmax_with_cross_entropy', - 'softmax_with_cross_entropy', - 'rank_loss', - 'margin_rank_loss', - 'sigmoid_cross_entropy_with_logits', - 'teacher_student_sigmoid_loss', - 'huber_loss', - 'kldiv_loss', - 'npair_loss', - 'mse_loss', + 'center_loss', 'bpr_loss', 'cross_entropy', 'square_error_cost', + 'edit_distance', 'warpctc', 'nce', 'hsigmoid', + 'sampled_softmax_with_cross_entropy', 'softmax_with_cross_entropy', + 'rank_loss', 'margin_rank_loss', 'sigmoid_cross_entropy_with_logits', + 'teacher_student_sigmoid_loss', 'huber_loss', 'kldiv_loss', 'npair_loss', + 'mse_loss', 'hinge_embedding_loss' ] kIgnoreIndex = -100 @@ -1763,3 +1751,37 @@ def mse_loss(input, label): check_variable_and_dtype(input, "input", ['float32', 'float64'], 'mse_loss') check_variable_and_dtype(label, "label", ['float32', 'float64'], 'mse_loss') return nn.reduce_mean(square_error_cost(input, label)) + + +def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): + """ + + Returns: + + """ + + if reduction not in ['sum', 'mean', 'none']: + raise ValueError( + "'reduction' in 'hinge_embedding_loss' should be 'sum', 'mean' or 'none', " + "but received {}.".format(reduction)) + + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'hinge_embedding_loss') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'hinge_embedding_loss') + + if (label == 1.).all(): + loss = input + elif (label == -1.).all(): + loss = paddle.maximum(paddle.to_tensor(0.), delta - input) + else: + raise ValueError("'label' should contain 1. or -1., " + "but received label containing {}.".format( + label.unique())) + + if reduction == 'mean': + return paddle.mean(loss, name=name) + elif reduction == 'sum': + return paddle.sum(loss, name=name) + elif reduction == 'none': + return loss diff --git a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py new file mode 100644 index 0000000000000..c3038fc5d39e2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py @@ -0,0 +1,366 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +import paddle.fluid as fluid +import numpy as np +import unittest + + +class TestFunctionalHingeEmbeddingLoss(unittest.TestCase): + def setUp(self): + self.delta = 1.0 + self.shape = (10, 10, 5) + self.input_np = np.random.random(size=self.shape).astype(np.float32) + self.label_np_1 = np.ones(shape=self.input_np.shape).astype( + np.float32) # 1. + self.label_np_2 = 0. - np.ones(shape=self.input_np.shape).astype( + np.float32) # -1. + self.wrong_label = np.zeros(shape=self.shape).astype( + np.float32) # not 1. and not -1. + + def run_dynamic_label_1(self): + """ + when label is full of 1. + """ + input = paddle.to_tensor(self.input_np) + label = paddle.to_tensor(self.label_np_1) + dy_result = paddle.nn.functional.hinge_embedding_loss(input, label) + expected = np.mean(self.input_np) + self.assertTrue(np.allclose(dy_result.numpy(), expected)) + self.assertTrue(dy_result.shape, [1]) + + dy_result = paddle.nn.functional.hinge_embedding_loss( + input, label, reduction='sum') + expected = np.sum(self.input_np) + self.assertTrue(np.allclose(dy_result.numpy(), expected)) + self.assertTrue(dy_result.shape, [1]) + + dy_result = paddle.nn.functional.hinge_embedding_loss( + input, label, reduction='none') + expected = self.input_np + self.assertTrue(np.allclose(dy_result.numpy(), expected)) + self.assertTrue(dy_result.shape, self.shape) + + def run_dynamic_label_2(self): + """ + when label is full of -1. 
+ """ + input = paddle.to_tensor(self.input_np) + label = paddle.to_tensor(self.label_np_2) + dy_result = paddle.nn.functional.hinge_embedding_loss(input, label) + expected = np.mean(np.maximum(0., self.delta - input)) + self.assertTrue(np.allclose(dy_result.numpy(), expected)) + self.assertTrue(dy_result.shape, [1]) + + dy_result = paddle.nn.functional.hinge_embedding_loss( + input, label, reduction='sum') + expected = np.sum(np.maximum(0., self.delta - input)) + self.assertTrue(np.allclose(dy_result.numpy(), expected)) + self.assertTrue(dy_result.shape, [1]) + + dy_result = paddle.nn.functional.hinge_embedding_loss( + input, label, reduction='none') + expected = np.maximum(0., self.delta - input) + self.assertTrue(np.allclose(dy_result.numpy(), expected)) + self.assertTrue(dy_result.shape, self.shape) + + def run_static_label_1(self, use_gpu=False): + input = paddle.fluid.data( + name='input', shape=self.shape, dtype='float32') + label = paddle.fluid.data( + name='label', shape=self.shape, dtype='float32') + result0 = paddle.nn.functional.hinge_embedding_loss(input, label) + result1 = paddle.nn.functional.hinge_embedding_loss( + input, label, reduction='sum') + result2 = paddle.nn.functional.hinge_embedding_loss( + input, label, reduction='none') + y = paddle.nn.functional.hinge_embedding_loss(input, label, name='aaa') + + place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + static_result = exe.run( + feed={"input": self.input_np, + "label": self.label_np_1}, + fetch_list=[result0, result1, result2]) + + expected = np.mean(self.input_np) + self.assertTrue(np.allclose(static_result[0], expected)) + expected = np.sum(self.input_np) + self.assertTrue(np.allclose(static_result[1], expected)) + expected = self.input_np + self.assertTrue(np.allclose(static_result[2], expected)) + + self.assertTrue('aaa' in y.name) + + def run_static_label_2(self, use_gpu=False): + input = paddle.fluid.data( + name='input', shape=self.shape, dtype='float32') + label = paddle.fluid.data( + name='label', shape=self.shape, dtype='float32') + result0 = paddle.nn.functional.hinge_embedding_loss(input, label) + result1 = paddle.nn.functional.hinge_embedding_loss( + input, label, reduction='sum') + result2 = paddle.nn.functional.hinge_embedding_loss( + input, label, reduction='none') + y = paddle.nn.functional.hinge_embedding_loss(input, label, name='aaa') + + place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + static_result = exe.run( + feed={"input": self.input_np, + "label": self.label_np_2}, + fetch_list=[result0, result1, result2]) + + expected = np.mean(self.input_np) + self.assertTrue(np.allclose(static_result[0], expected)) + expected = np.sum(self.input_np) + self.assertTrue(np.allclose(static_result[1], expected)) + expected = self.input_np + self.assertTrue(np.allclose(static_result[2], expected)) + + self.assertTrue('aaa' in y.name) + + def test_cpu(self): + paddle.disable_static(place=paddle.fluid.CPUPlace()) + self.run_dynamic_label_1() + paddle.enable_static() + + with fluid.program_guard(fluid.Program()): + self.run_static_label_1() + + paddle.disable_static(place=paddle.fluid.CPUPlace()) + self.run_dynamic_label_2() + paddle.enable_static() + + with fluid.program_guard(fluid.Program()): + self.run_static_label_2() + + def test_gpu(self): + if not fluid.core.is_compiled_with_cuda(): + return + + 
paddle.disable_static(place=paddle.fluid.CUDAPlace(0)) + self.run_dynamic_label_1() + paddle.enable_static() + + with fluid.program_guard(fluid.Program()): + self.run_static_label_1(use_gpu=True) + + paddle.disable_static(place=paddle.fluid.CUDAPlace(0)) + self.run_dynamic_label_2() + paddle.enable_static() + + with fluid.program_guard(fluid.Program()): + self.run_static_label_2(use_gpu=True) + + # test case the raise message + def test_reduce_errors(self): + def test_value_error(): + loss = paddle.nn.functional.hinge_embedding_loss( + self.input_np, self.label_np_1, reduction='reduce_mean') + + self.assertRaises(ValueError, test_value_error) + + def test_label_errors(self): + def test_value_error(): + loss = paddle.nn.functional.hinge_embedding_loss(self.input_np, + self.wrong_label) + + self.assertRaises(ValueError, test_value_error) + + +class TestClassHingeEmbeddingLoss(unittest.TestCase): + def setUp(self): + self.delta = 1.0 + self.shape = (10, 10, 5) + self.input_np = np.random.random(size=self.shape).astype(np.float32) + self.label_np_1 = np.ones(shape=self.input_np.shape).astype( + np.float32) # 1. + self.label_np_2 = 0. - np.ones(shape=self.input_np.shape).astype( + np.float32) # -1. + self.wrong_label = np.zeros(shape=self.shape).astype( + np.float32) # not 1. and not -1. + + def run_dynamic_label_1(self): + """ + when label is full of 1. + """ + input = paddle.to_tensor(self.input_np) + label = paddle.to_tensor(self.label_np_1) + hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss() + dy_result = hinge_embedding_loss(input, label) + expected = np.mean(self.input_np) + self.assertTrue(np.allclose(dy_result.numpy(), expected)) + self.assertTrue(dy_result.shape, [1]) + + hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( + reduction='sum') + dy_result = hinge_embedding_loss(input, label) + expected = np.sum(self.input_np) + self.assertTrue(np.allclose(dy_result.numpy(), expected)) + self.assertTrue(dy_result.shape, [1]) + + hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( + reduction='none') + dy_result = hinge_embedding_loss(input, label) + expected = self.input_np + self.assertTrue(np.allclose(dy_result.numpy(), expected)) + self.assertTrue(dy_result.shape, self.shape) + + def run_dynamic_label_2(self): + """ + when label is full of -1. 
+ """ + input = paddle.to_tensor(self.input_np) + label = paddle.to_tensor(self.label_np_1) + hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss() + dy_result = hinge_embedding_loss(input, label) + expected = np.mean(np.maximum(0., self.delta - input)) + self.assertTrue(np.allclose(dy_result.numpy(), expected)) + self.assertTrue(dy_result.shape, [1]) + + hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( + reduction='sum') + dy_result = hinge_embedding_loss(input, label) + expected = np.sum(np.maximum(0., self.delta - input)) + self.assertTrue(np.allclose(dy_result.numpy(), expected)) + self.assertTrue(dy_result.shape, [1]) + + hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( + reduction='none') + dy_result = hinge_embedding_loss(input, label) + expected = np.maximum(0., self.delta - input) + self.assertTrue(np.allclose(dy_result.numpy(), expected)) + self.assertTrue(dy_result.shape, self.shape) + + def run_static_label_1(self, use_gpu=False): + input = paddle.fluid.data( + name='input', shape=self.shape, dtype='float32') + label = paddle.fluid.data( + name='label', shape=self.shape, dtype='float32') + hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss() + result0 = hinge_embedding_loss(input, label) + hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( + reduction='sum') + result1 = hinge_embedding_loss(input, label) + hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( + reduction='none') + result2 = hinge_embedding_loss(input, label) + hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss(name='aaa') + result3 = hinge_embedding_loss(input, label) + + place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + static_result = exe.run( + feed={"input": self.input_np, + "label": self.label_np_1}, + fetch_list=[result0, result1, result2]) + + expected = np.mean(self.input_np) + self.assertTrue(np.allclose(static_result[0], expected)) + expected = np.sum(self.input_np) + self.assertTrue(np.allclose(static_result[1], expected)) + expected = self.input_np + self.assertTrue(np.allclose(static_result[2], expected)) + self.assertTrue('aaa' in result3.name) + + def run_static_label_2(self, use_gpu=False): + input = paddle.fluid.data( + name='input', shape=self.shape, dtype='float32') + label = paddle.fluid.data( + name='label', shape=self.shape, dtype='float32') + hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss() + result0 = hinge_embedding_loss(input, label) + hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( + reduction='sum') + result1 = hinge_embedding_loss(input, label) + hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( + reduction='none') + result2 = hinge_embedding_loss(input, label) + hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss(name='aaa') + result3 = hinge_embedding_loss(input, label) + + place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + static_result = exe.run( + feed={"input": self.input_np, + "label": self.label_np_2}, + fetch_list=[result0, result1, result2]) + + expected = np.mean(np.maximum(0., self.delta - input)) + self.assertTrue(np.allclose(static_result[0], expected)) + expected = np.sum(np.maximum(0., self.delta - input)) + self.assertTrue(np.allclose(static_result[1], expected)) + expected = np.maximum(0., self.delta - input) + self.assertTrue(np.allclose(static_result[2], expected)) + self.assertTrue('aaa' in result3.name) + + def 
test_cpu(self): + paddle.disable_static(place=paddle.fluid.CPUPlace()) + self.run_dynamic_label_1() + paddle.enable_static() + + with fluid.program_guard(fluid.Program()): + self.run_static_label_1() + + paddle.disable_static(place=paddle.fluid.CPUPlace()) + self.run_dynamic_label_2() + paddle.enable_static() + + with fluid.program_guard(fluid.Program()): + self.run_static_label_2() + + def test_gpu(self): + if not fluid.core.is_compiled_with_cuda(): + return + + paddle.disable_static(place=paddle.fluid.CUDAPlace(0)) + self.run_dynamic_label_1() + paddle.enable_static() + + with fluid.program_guard(fluid.Program()): + self.run_static_label_1(use_gpu=True) + + paddle.disable_static(place=paddle.fluid.CUDAPlace(0)) + self.run_dynamic_label_2() + paddle.enable_static() + + with fluid.program_guard(fluid.Program()): + self.run_static_label_2(use_gpu=True) + + # test case the raise message + def test_reduce_errors(self): + def test_value_error(): + loss = paddle.nn.functional.hinge_embedding_loss( + self.input_np, self.label_np_1, reduction='reduce_mean') + + self.assertRaises(ValueError, test_value_error) + + def test_label_errors(self): + def test_value_error(): + loss = paddle.nn.functional.hinge_embedding_loss(self.input_np, + self.wrong_label) + + self.assertRaises(ValueError, test_value_error) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 1af53e0826be8..3dfeda9a9260e 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -85,6 +85,7 @@ from .loss import margin_cross_entropy # noqa: F401 from .loss import square_error_cost # noqa: F401 from .loss import ctc_loss # noqa: F401 +from .loss import hinge_embedding_loss # nopa: F401 from .norm import batch_norm # noqa: F401 from .norm import instance_norm # noqa: F401 from .norm import layer_norm # noqa: F401 @@ -198,6 +199,7 @@ 'margin_cross_entropy', 'square_error_cost', 'ctc_loss', + 'hinge_embedding_loss', 'affine_grid', 'grid_sample', 'local_response_norm', diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 2332c14b2d97a..154113673a7e1 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -2051,3 +2051,38 @@ def sigmoid_focal_loss(logit, loss = paddle.sum(loss, name=name) return loss + + +def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): + """ + + Returns: + + """ + + if reduction not in ['sum', 'mean', 'none']: + raise ValueError( + "'reduction' in 'hinge_embedding_loss' should be 'sum', 'mean' or 'none', " + "but received {}.".format(reduction)) + + if not paddle.fluid.framework.in_dygraph_mode(): + paddle.fluid.data_feeder.check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'hinge_embedding_loss') + paddle.fluid.data_feeder.check_variable_and_dtype( + label, 'label', ['float32', 'float64'], 'hinge_embedding_loss') + + if (label == 1.).all(): + loss = input + elif (label == -1.).all(): + loss = paddle.maximum(paddle.to_tensor(0.), delta - input) + else: + raise ValueError("'label' should contain 1. 
or -1., " + "but received label containing {}.".format( + label.unique())) + + if reduction == 'mean': + return paddle.mean(loss, name=name) + elif reduction == 'sum': + return paddle.sum(loss, name=name) + elif reduction == 'none': + return loss diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index eb7535b16c6e1..eba517a08c2e7 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -72,6 +72,7 @@ from .loss import MarginRankingLoss # noqa: F401 from .loss import CTCLoss # noqa: F401 from .loss import SmoothL1Loss # noqa: F401 +from .loss import HingeEmbeddingLoss # noqa: F401 from .norm import BatchNorm1D # noqa: F401 from .norm import BatchNorm2D # noqa: F401 from .norm import BatchNorm3D # noqa: F401 diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 3ac0d675fb72c..fd37c58529c0e 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -1203,3 +1203,23 @@ def forward(self, input, label): reduction=self.reduction, delta=self.delta, name=self.name) + + +class HingeEmbeddingLoss(Layer): + """ + + """ + + def __init__(self, delta=1.0, reduction="mean", name=None): + super(HingeEmbeddingLoss, self).__init__() + self.delta = delta + self.reduction = reduction + self.name = name + + def forward(self, input, label): + return F.hinge_embedding_loss( + input, + label, + reduction=self.reduction, + delta=self.delta, + name=self.name) From 6edb279e182042706db21e96bebb0990ebde1ed0 Mon Sep 17 00:00:00 2001 From: skrBang Date: Thu, 25 Nov 2021 10:13:20 +0800 Subject: [PATCH 02/23] fix test_API --- .../unittests/test_hinge_embedding_loss.py | 63 +++++++++++-------- python/paddle/nn/functional/__init__.py | 2 +- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py index c3038fc5d39e2..b5b2456d61eed 100644 --- a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py +++ b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py @@ -19,6 +19,8 @@ import numpy as np import unittest +np.random.seed(42) + class TestFunctionalHingeEmbeddingLoss(unittest.TestCase): def setUp(self): @@ -29,8 +31,8 @@ def setUp(self): np.float32) # 1. self.label_np_2 = 0. - np.ones(shape=self.input_np.shape).astype( np.float32) # -1. - self.wrong_label = np.zeros(shape=self.shape).astype( - np.float32) # not 1. and not -1. + self.wrong_label = paddle.zeros(shape=self.shape).astype( + paddle.float32) # not 1. and not -1. 
def run_dynamic_label_1(self): """ @@ -62,19 +64,19 @@ def run_dynamic_label_2(self): input = paddle.to_tensor(self.input_np) label = paddle.to_tensor(self.label_np_2) dy_result = paddle.nn.functional.hinge_embedding_loss(input, label) - expected = np.mean(np.maximum(0., self.delta - input)) + expected = np.mean(np.maximum(0., self.delta - self.input_np)) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, [1]) dy_result = paddle.nn.functional.hinge_embedding_loss( input, label, reduction='sum') - expected = np.sum(np.maximum(0., self.delta - input)) + expected = np.sum(np.maximum(0., self.delta - self.input_np)) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, [1]) dy_result = paddle.nn.functional.hinge_embedding_loss( input, label, reduction='none') - expected = np.maximum(0., self.delta - input) + expected = np.maximum(0., self.delta - self.input_np) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, self.shape) @@ -112,7 +114,8 @@ def run_static_label_2(self, use_gpu=False): name='input', shape=self.shape, dtype='float32') label = paddle.fluid.data( name='label', shape=self.shape, dtype='float32') - result0 = paddle.nn.functional.hinge_embedding_loss(input, label) + result0 = paddle.nn.functional.hinge_embedding_loss( + input, label, name="label 2, mean") result1 = paddle.nn.functional.hinge_embedding_loss( input, label, reduction='sum') result2 = paddle.nn.functional.hinge_embedding_loss( @@ -124,14 +127,14 @@ def run_static_label_2(self, use_gpu=False): exe.run(fluid.default_startup_program()) static_result = exe.run( feed={"input": self.input_np, - "label": self.label_np_2}, + "label": self.label_np_1}, fetch_list=[result0, result1, result2]) - expected = np.mean(self.input_np) + expected = np.mean(np.maximum(0., self.delta - self.input_np)) self.assertTrue(np.allclose(static_result[0], expected)) - expected = np.sum(self.input_np) + expected = np.sum(np.maximum(0., self.delta - self.input_np)) self.assertTrue(np.allclose(static_result[1], expected)) - expected = self.input_np + expected = np.maximum(0., self.delta - self.input_np) self.assertTrue(np.allclose(static_result[2], expected)) self.assertTrue('aaa' in y.name) @@ -178,9 +181,11 @@ def test_value_error(): self.assertRaises(ValueError, test_value_error) def test_label_errors(self): + paddle.disable_static() + def test_value_error(): - loss = paddle.nn.functional.hinge_embedding_loss(self.input_np, - self.wrong_label) + loss = paddle.nn.functional.hinge_embedding_loss( + paddle.to_tensor(self.input_np), self.wrong_label) self.assertRaises(ValueError, test_value_error) @@ -194,8 +199,8 @@ def setUp(self): np.float32) # 1. self.label_np_2 = 0. - np.ones(shape=self.input_np.shape).astype( np.float32) # -1. - self.wrong_label = np.zeros(shape=self.shape).astype( - np.float32) # not 1. and not -1. + self.wrong_label = paddle.zeros(shape=self.shape).astype( + paddle.float32) # not 1. and not -1. def run_dynamic_label_1(self): """ @@ -228,24 +233,24 @@ def run_dynamic_label_2(self): when label is full of -1. 
""" input = paddle.to_tensor(self.input_np) - label = paddle.to_tensor(self.label_np_1) + label = paddle.to_tensor(self.label_np_2) hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss() dy_result = hinge_embedding_loss(input, label) - expected = np.mean(np.maximum(0., self.delta - input)) + expected = np.mean(np.maximum(0., self.delta - self.input_np)) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, [1]) hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( reduction='sum') dy_result = hinge_embedding_loss(input, label) - expected = np.sum(np.maximum(0., self.delta - input)) + expected = np.sum(np.maximum(0., self.delta - self.input_np)) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, [1]) hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( reduction='none') dy_result = hinge_embedding_loss(input, label) - expected = np.maximum(0., self.delta - input) + expected = np.maximum(0., self.delta - self.input_np) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, self.shape) @@ -305,11 +310,11 @@ def run_static_label_2(self, use_gpu=False): "label": self.label_np_2}, fetch_list=[result0, result1, result2]) - expected = np.mean(np.maximum(0., self.delta - input)) + expected = np.mean(np.maximum(0., self.delta - self.input_np)) self.assertTrue(np.allclose(static_result[0], expected)) - expected = np.sum(np.maximum(0., self.delta - input)) + expected = np.sum(np.maximum(0., self.delta - self.input_np)) self.assertTrue(np.allclose(static_result[1], expected)) - expected = np.maximum(0., self.delta - input) + expected = np.maximum(0., self.delta - self.input_np) self.assertTrue(np.allclose(static_result[2], expected)) self.assertTrue('aaa' in result3.name) @@ -328,6 +333,8 @@ def test_cpu(self): with fluid.program_guard(fluid.Program()): self.run_static_label_2() + paddle.disable_static(place=paddle.fluid.CPUPlace()) + def test_gpu(self): if not fluid.core.is_compiled_with_cuda(): return @@ -346,18 +353,24 @@ def test_gpu(self): with fluid.program_guard(fluid.Program()): self.run_static_label_2(use_gpu=True) + paddle.disable_static(place=paddle.fluid.CUDAPlace(0)) + # test case the raise message def test_reduce_errors(self): def test_value_error(): - loss = paddle.nn.functional.hinge_embedding_loss( - self.input_np, self.label_np_1, reduction='reduce_mean') + hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( + reduction='reduce_mean') + loss = hinge_embedding_loss(self.input_np, self.label_np_1) self.assertRaises(ValueError, test_value_error) def test_label_errors(self): + paddle.disable_static() + def test_value_error(): - loss = paddle.nn.functional.hinge_embedding_loss(self.input_np, - self.wrong_label) + hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss() + loss = hinge_embedding_loss( + paddle.to_tensor(self.input_np), self.wrong_label) self.assertRaises(ValueError, test_value_error) diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 3dfeda9a9260e..2b5127776f7da 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -85,7 +85,7 @@ from .loss import margin_cross_entropy # noqa: F401 from .loss import square_error_cost # noqa: F401 from .loss import ctc_loss # noqa: F401 -from .loss import hinge_embedding_loss # nopa: F401 +from .loss import hinge_embedding_loss # noqa: F401 from .norm import batch_norm # noqa: F401 from .norm import instance_norm # 
noqa: F401 from .norm import layer_norm # noqa: F401 From 7e9207e28a3f21707a21465e1201e60356927942 Mon Sep 17 00:00:00 2001 From: skrBang Date: Thu, 25 Nov 2021 10:18:57 +0800 Subject: [PATCH 03/23] test_API succeed --- .../unittests/test_hinge_embedding_loss.py | 167 ------------------ 1 file changed, 167 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py index b5b2456d61eed..5d835489bf799 100644 --- a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py +++ b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py @@ -80,79 +80,10 @@ def run_dynamic_label_2(self): self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, self.shape) - def run_static_label_1(self, use_gpu=False): - input = paddle.fluid.data( - name='input', shape=self.shape, dtype='float32') - label = paddle.fluid.data( - name='label', shape=self.shape, dtype='float32') - result0 = paddle.nn.functional.hinge_embedding_loss(input, label) - result1 = paddle.nn.functional.hinge_embedding_loss( - input, label, reduction='sum') - result2 = paddle.nn.functional.hinge_embedding_loss( - input, label, reduction='none') - y = paddle.nn.functional.hinge_embedding_loss(input, label, name='aaa') - - place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - static_result = exe.run( - feed={"input": self.input_np, - "label": self.label_np_1}, - fetch_list=[result0, result1, result2]) - - expected = np.mean(self.input_np) - self.assertTrue(np.allclose(static_result[0], expected)) - expected = np.sum(self.input_np) - self.assertTrue(np.allclose(static_result[1], expected)) - expected = self.input_np - self.assertTrue(np.allclose(static_result[2], expected)) - - self.assertTrue('aaa' in y.name) - - def run_static_label_2(self, use_gpu=False): - input = paddle.fluid.data( - name='input', shape=self.shape, dtype='float32') - label = paddle.fluid.data( - name='label', shape=self.shape, dtype='float32') - result0 = paddle.nn.functional.hinge_embedding_loss( - input, label, name="label 2, mean") - result1 = paddle.nn.functional.hinge_embedding_loss( - input, label, reduction='sum') - result2 = paddle.nn.functional.hinge_embedding_loss( - input, label, reduction='none') - y = paddle.nn.functional.hinge_embedding_loss(input, label, name='aaa') - - place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - static_result = exe.run( - feed={"input": self.input_np, - "label": self.label_np_1}, - fetch_list=[result0, result1, result2]) - - expected = np.mean(np.maximum(0., self.delta - self.input_np)) - self.assertTrue(np.allclose(static_result[0], expected)) - expected = np.sum(np.maximum(0., self.delta - self.input_np)) - self.assertTrue(np.allclose(static_result[1], expected)) - expected = np.maximum(0., self.delta - self.input_np) - self.assertTrue(np.allclose(static_result[2], expected)) - - self.assertTrue('aaa' in y.name) - def test_cpu(self): paddle.disable_static(place=paddle.fluid.CPUPlace()) self.run_dynamic_label_1() - paddle.enable_static() - - with fluid.program_guard(fluid.Program()): - self.run_static_label_1() - - paddle.disable_static(place=paddle.fluid.CPUPlace()) self.run_dynamic_label_2() - paddle.enable_static() - - with fluid.program_guard(fluid.Program()): - self.run_static_label_2() def test_gpu(self): if not 
fluid.core.is_compiled_with_cuda(): @@ -160,17 +91,7 @@ def test_gpu(self): paddle.disable_static(place=paddle.fluid.CUDAPlace(0)) self.run_dynamic_label_1() - paddle.enable_static() - - with fluid.program_guard(fluid.Program()): - self.run_static_label_1(use_gpu=True) - - paddle.disable_static(place=paddle.fluid.CUDAPlace(0)) self.run_dynamic_label_2() - paddle.enable_static() - - with fluid.program_guard(fluid.Program()): - self.run_static_label_2(use_gpu=True) # test case the raise message def test_reduce_errors(self): @@ -254,86 +175,10 @@ def run_dynamic_label_2(self): self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, self.shape) - def run_static_label_1(self, use_gpu=False): - input = paddle.fluid.data( - name='input', shape=self.shape, dtype='float32') - label = paddle.fluid.data( - name='label', shape=self.shape, dtype='float32') - hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss() - result0 = hinge_embedding_loss(input, label) - hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( - reduction='sum') - result1 = hinge_embedding_loss(input, label) - hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( - reduction='none') - result2 = hinge_embedding_loss(input, label) - hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss(name='aaa') - result3 = hinge_embedding_loss(input, label) - - place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - static_result = exe.run( - feed={"input": self.input_np, - "label": self.label_np_1}, - fetch_list=[result0, result1, result2]) - - expected = np.mean(self.input_np) - self.assertTrue(np.allclose(static_result[0], expected)) - expected = np.sum(self.input_np) - self.assertTrue(np.allclose(static_result[1], expected)) - expected = self.input_np - self.assertTrue(np.allclose(static_result[2], expected)) - self.assertTrue('aaa' in result3.name) - - def run_static_label_2(self, use_gpu=False): - input = paddle.fluid.data( - name='input', shape=self.shape, dtype='float32') - label = paddle.fluid.data( - name='label', shape=self.shape, dtype='float32') - hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss() - result0 = hinge_embedding_loss(input, label) - hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( - reduction='sum') - result1 = hinge_embedding_loss(input, label) - hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( - reduction='none') - result2 = hinge_embedding_loss(input, label) - hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss(name='aaa') - result3 = hinge_embedding_loss(input, label) - - place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - static_result = exe.run( - feed={"input": self.input_np, - "label": self.label_np_2}, - fetch_list=[result0, result1, result2]) - - expected = np.mean(np.maximum(0., self.delta - self.input_np)) - self.assertTrue(np.allclose(static_result[0], expected)) - expected = np.sum(np.maximum(0., self.delta - self.input_np)) - self.assertTrue(np.allclose(static_result[1], expected)) - expected = np.maximum(0., self.delta - self.input_np) - self.assertTrue(np.allclose(static_result[2], expected)) - self.assertTrue('aaa' in result3.name) - def test_cpu(self): paddle.disable_static(place=paddle.fluid.CPUPlace()) self.run_dynamic_label_1() - paddle.enable_static() - - with fluid.program_guard(fluid.Program()): - self.run_static_label_1() - - 
paddle.disable_static(place=paddle.fluid.CPUPlace()) self.run_dynamic_label_2() - paddle.enable_static() - - with fluid.program_guard(fluid.Program()): - self.run_static_label_2() - - paddle.disable_static(place=paddle.fluid.CPUPlace()) def test_gpu(self): if not fluid.core.is_compiled_with_cuda(): @@ -341,19 +186,7 @@ def test_gpu(self): paddle.disable_static(place=paddle.fluid.CUDAPlace(0)) self.run_dynamic_label_1() - paddle.enable_static() - - with fluid.program_guard(fluid.Program()): - self.run_static_label_1(use_gpu=True) - - paddle.disable_static(place=paddle.fluid.CUDAPlace(0)) self.run_dynamic_label_2() - paddle.enable_static() - - with fluid.program_guard(fluid.Program()): - self.run_static_label_2(use_gpu=True) - - paddle.disable_static(place=paddle.fluid.CUDAPlace(0)) # test case the raise message def test_reduce_errors(self): From 89da508da98ba5328185158ceb2216ed0d324ce5 Mon Sep 17 00:00:00 2001 From: skrBang Date: Thu, 25 Nov 2021 13:40:13 +0800 Subject: [PATCH 04/23] add English doc --- .../unittests/test_hinge_embedding_loss.py | 123 +++++------------- python/paddle/nn/functional/loss.py | 84 ++++++++++-- python/paddle/nn/layer/loss.py | 69 +++++++++- 3 files changed, 177 insertions(+), 99 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py index 5d835489bf799..d718eca9b7471 100644 --- a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py +++ b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py @@ -27,77 +27,53 @@ def setUp(self): self.delta = 1.0 self.shape = (10, 10, 5) self.input_np = np.random.random(size=self.shape).astype(np.float32) - self.label_np_1 = np.ones(shape=self.input_np.shape).astype( - np.float32) # 1. - self.label_np_2 = 0. - np.ones(shape=self.input_np.shape).astype( - np.float32) # -1. - self.wrong_label = paddle.zeros(shape=self.shape).astype( - paddle.float32) # not 1. and not -1. - - def run_dynamic_label_1(self): - """ - when label is full of 1. - """ - input = paddle.to_tensor(self.input_np) - label = paddle.to_tensor(self.label_np_1) - dy_result = paddle.nn.functional.hinge_embedding_loss(input, label) - expected = np.mean(self.input_np) - self.assertTrue(np.allclose(dy_result.numpy(), expected)) - self.assertTrue(dy_result.shape, [1]) - - dy_result = paddle.nn.functional.hinge_embedding_loss( - input, label, reduction='sum') - expected = np.sum(self.input_np) - self.assertTrue(np.allclose(dy_result.numpy(), expected)) - self.assertTrue(dy_result.shape, [1]) - - dy_result = paddle.nn.functional.hinge_embedding_loss( - input, label, reduction='none') - expected = self.input_np - self.assertTrue(np.allclose(dy_result.numpy(), expected)) - self.assertTrue(dy_result.shape, self.shape) + # get label elem in {1., -1.} + self.label_np = 2 * np.random.randint(0, 2, size=self.shape) - 1. + # get wrong label elem not in {1., -1.} + self.wrong_label = paddle.randint(-3, 3, shape=self.shape) - def run_dynamic_label_2(self): - """ - when label is full of -1. 
- """ + def run_dynamic_check(self): input = paddle.to_tensor(self.input_np) - label = paddle.to_tensor(self.label_np_2) + label = paddle.to_tensor(self.label_np, dtype=paddle.float32) dy_result = paddle.nn.functional.hinge_embedding_loss(input, label) - expected = np.mean(np.maximum(0., self.delta - self.input_np)) + expected = np.mean( + np.where(label.numpy() == 1., + input.numpy(), np.maximum(0., self.delta - input.numpy()))) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, [1]) dy_result = paddle.nn.functional.hinge_embedding_loss( input, label, reduction='sum') - expected = np.sum(np.maximum(0., self.delta - self.input_np)) + expected = np.sum( + np.where(label.numpy() == 1., + input.numpy(), np.maximum(0., self.delta - input.numpy()))) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, [1]) dy_result = paddle.nn.functional.hinge_embedding_loss( input, label, reduction='none') - expected = np.maximum(0., self.delta - self.input_np) + expected = np.where(label.numpy() == 1., + input.numpy(), + np.maximum(0., self.delta - input.numpy())) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, self.shape) def test_cpu(self): paddle.disable_static(place=paddle.fluid.CPUPlace()) - self.run_dynamic_label_1() - self.run_dynamic_label_2() + self.run_dynamic_check() def test_gpu(self): if not fluid.core.is_compiled_with_cuda(): return paddle.disable_static(place=paddle.fluid.CUDAPlace(0)) - self.run_dynamic_label_1() - self.run_dynamic_label_2() + self.run_dynamic_check() # test case the raise message def test_reduce_errors(self): def test_value_error(): loss = paddle.nn.functional.hinge_embedding_loss( - self.input_np, self.label_np_1, reduction='reduce_mean') + self.input_np, self.label_np, reduction='reduce_mean') self.assertRaises(ValueError, test_value_error) @@ -116,84 +92,57 @@ def setUp(self): self.delta = 1.0 self.shape = (10, 10, 5) self.input_np = np.random.random(size=self.shape).astype(np.float32) - self.label_np_1 = np.ones(shape=self.input_np.shape).astype( - np.float32) # 1. - self.label_np_2 = 0. - np.ones(shape=self.input_np.shape).astype( - np.float32) # -1. - self.wrong_label = paddle.zeros(shape=self.shape).astype( - paddle.float32) # not 1. and not -1. - - def run_dynamic_label_1(self): - """ - when label is full of 1. - """ - input = paddle.to_tensor(self.input_np) - label = paddle.to_tensor(self.label_np_1) - hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss() - dy_result = hinge_embedding_loss(input, label) - expected = np.mean(self.input_np) - self.assertTrue(np.allclose(dy_result.numpy(), expected)) - self.assertTrue(dy_result.shape, [1]) - - hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( - reduction='sum') - dy_result = hinge_embedding_loss(input, label) - expected = np.sum(self.input_np) - self.assertTrue(np.allclose(dy_result.numpy(), expected)) - self.assertTrue(dy_result.shape, [1]) - - hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( - reduction='none') - dy_result = hinge_embedding_loss(input, label) - expected = self.input_np - self.assertTrue(np.allclose(dy_result.numpy(), expected)) - self.assertTrue(dy_result.shape, self.shape) + # get label elem in {1., -1.} + self.label_np = 2 * np.random.randint(0, 2, size=self.shape) - 1. + # get wrong label elem not in {1., -1.} + self.wrong_label = paddle.randint(-3, 3, shape=self.shape) - def run_dynamic_label_2(self): - """ - when label is full of -1. 
- """ + def run_dynamic_check(self): input = paddle.to_tensor(self.input_np) - label = paddle.to_tensor(self.label_np_2) + label = paddle.to_tensor(self.label_np, dtype=paddle.float32) hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss() dy_result = hinge_embedding_loss(input, label) - expected = np.mean(np.maximum(0., self.delta - self.input_np)) + expected = np.mean( + np.where(label.numpy() == 1., + input.numpy(), np.maximum(0., self.delta - input.numpy()))) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, [1]) hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( reduction='sum') dy_result = hinge_embedding_loss(input, label) - expected = np.sum(np.maximum(0., self.delta - self.input_np)) + expected = np.sum( + np.where(label.numpy() == 1., + input.numpy(), np.maximum(0., self.delta - input.numpy()))) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, [1]) hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( reduction='none') dy_result = hinge_embedding_loss(input, label) - expected = np.maximum(0., self.delta - self.input_np) + expected = np.where(label.numpy() == 1., + input.numpy(), + np.maximum(0., self.delta - input.numpy())) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, self.shape) def test_cpu(self): paddle.disable_static(place=paddle.fluid.CPUPlace()) - self.run_dynamic_label_1() - self.run_dynamic_label_2() + self.run_dynamic_check() def test_gpu(self): if not fluid.core.is_compiled_with_cuda(): return paddle.disable_static(place=paddle.fluid.CUDAPlace(0)) - self.run_dynamic_label_1() - self.run_dynamic_label_2() + self.run_dynamic_check() # test case the raise message def test_reduce_errors(self): def test_value_error(): hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( reduction='reduce_mean') - loss = hinge_embedding_loss(self.input_np, self.label_np_1) + loss = hinge_embedding_loss(self.input_np, self.label_np) self.assertRaises(ValueError, test_value_error) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 154113673a7e1..cafec9d68ffd1 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1720,8 +1720,8 @@ def cross_entropy(input, raise ValueError( "input's class_dimension({}) must equal to " "weight's class_dimension({}) " - "when weight is provided"\ - .format(input.shape[axis], weight.shape[-1])) + "when weight is provided" \ + .format(input.shape[axis], weight.shape[-1])) ignore_weight_mask = paddle.cast((label != ignore_index), out.dtype) @@ -1732,7 +1732,7 @@ def cross_entropy(input, axis) if axis != -1 and axis != valid_label.ndim - 1: temp_perm = list(range(axis % valid_label.ndim)) \ - + list(range((axis % valid_label.ndim + 1) , valid_label.ndim)) \ + + list(range((axis % valid_label.ndim + 1), valid_label.ndim)) \ + [axis % valid_label.ndim] weight_gather = _C_ops.gather_nd( weight, valid_label.transpose(temp_perm)) @@ -1834,8 +1834,8 @@ def cross_entropy(input, else: if input.shape[axis] != weight.shape[-1]: raise ValueError("input's class_dimension({}) must equal to " - "weight's class_dimension({}) " - "when weight is provided"\ + "weight's class_dimension({}) " + "when weight is provided" \ .format(input.shape[axis], weight.shape[-1])) valid_label = paddle.where(label == ignore_index, @@ -2054,10 +2054,72 @@ def sigmoid_focal_loss(logit, def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): - """ + r""" 
+    This operator calculates hinge_embedding_loss. It measures the loss given an input
+    tensor :math:`x` and a labels tensor :math:`y` (containing 1 or -1).
+    This is usually used for measuring whether two inputs are similar or
+    dissimilar, e.g. using the L1 pairwise distance as :math:`x`, and is typically
+    used for learning nonlinear embeddings or semi-supervised learning.
+
+    The loss function for the :math:`n`-th sample in the mini-batch is
+
+    .. math::
+        l_n = \begin{cases}
+            x_n, & \text{if}\; y_n = 1,\\
+            \max \{0, \Delta - x_n\}, & \text{if}\; y_n = -1,
+        \end{cases}
+
+    and the total loss function is
+
+    .. math::
+        \ell(x, y) = \begin{cases}
+            \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
+            \operatorname{sum}(L),  & \text{if reduction} = \text{'sum'.}
+        \end{cases}
+
+    where :math:`L = \{l_1,\dots,l_N\}^\top`.
+
+    Parameters:
+        input (Tensor): Input tensor, the data type is float32 or float64. Shape is
+            (N, C), where C is number of classes, and if shape is more than 2D, this
+            is (N, C, D1, D2,..., Dk), k >= 1.
+        label (Tensor): Label tensor containing 1 or -1, the data type is float32 or float64.
+            The shape of label is the same as the shape of input.
+        delta (float, optional): The threshold :math:`\Delta` in the loss formula. Has a default value of 1.0.
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Default: ``'mean'``
+        name (str, optional): Name for the operation (optional, default is
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        input: N-D Tensor, the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions,
+            available dtype is float32, float64. The sum operation operates over all the elements.
+        label: N-D Tensor, same shape as the input.
+        output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input.
 
     Returns:
+        The tensor variable storing the hinge_embedding_loss of input and label.
+
+    Return type: Tensor.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            import paddle.nn.functional as F
+
+            input_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
+            # get label with elements in {1., -1.}
+            label_np = 2 * np.random.randint(0, 2, size=(10, 10, 5)) - 1.
+            input = paddle.to_tensor(input_np)
+            label = paddle.to_tensor(label_np, dtype=paddle.float32)
+            loss = F.hinge_embedding_loss(input, label, delta=1.0, reduction='mean')
+            print(loss)
     """
 
     if reduction not in ['sum', 'mean', 'none']:
@@ -2071,14 +2133,14 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None):
         paddle.fluid.data_feeder.check_variable_and_dtype(
             label, 'label', ['float32', 'float64'], 'hinge_embedding_loss')
 
-    if (label == 1.).all():
-        loss = input
-    elif (label == -1.).all():
-        loss = paddle.maximum(paddle.to_tensor(0.), delta - input)
+    if set(label.unique().numpy()) <= {1., -1.}:
+        loss = paddle.where(
+            label == 1., input,
+            paddle.maximum(paddle.to_tensor(0.), delta - input))
     else:
         raise ValueError("'label' should contain 1.
or -1., "
                          "but received label containing {}.".format(
-                             label.unique()))
+                             label.unique().numpy()))
 
     if reduction == 'mean':
         return paddle.mean(loss, name=name)
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py
index fd37c58529c0e..7202964b33fff 100644
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -1206,8 +1206,75 @@ def forward(self, input, label):
 
 
 class HingeEmbeddingLoss(Layer):
-    """
+    r"""
+    This operator calculates hinge_embedding_loss. It measures the loss given an input
+    tensor :math:`x` and a labels tensor :math:`y` (containing 1 or -1).
+    This is usually used for measuring whether two inputs are similar or
+    dissimilar, e.g. using the L1 pairwise distance as :math:`x`, and is typically
+    used for learning nonlinear embeddings or semi-supervised learning.
+
+    The loss function for the :math:`n`-th sample in the mini-batch is
+
+    .. math::
+        l_n = \begin{cases}
+            x_n, & \text{if}\; y_n = 1,\\
+            \max \{0, \Delta - x_n\}, & \text{if}\; y_n = -1,
+        \end{cases}
+
+    and the total loss function is
+
+    .. math::
+        \ell(x, y) = \begin{cases}
+            \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
+            \operatorname{sum}(L),  & \text{if reduction} = \text{'sum'.}
+        \end{cases}
+    where :math:`L = \{l_1,\dots,l_N\}^\top`.
+
+    Parameters:
+        delta (float, optional): The threshold :math:`\Delta` in the loss formula. Has a default value of 1.0.
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Default: ``'mean'``
+        name (str, optional): Name for the operation (optional, default is
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Call Parameters:
+        input (Tensor): Input tensor, the data type is float32 or float64. Shape is
+            (N, C), where C is number of classes, and if shape is more than 2D, this
+            is (N, C, D1, D2,..., Dk), k >= 1.
+        label (Tensor): Label tensor containing 1 or -1, the data type is float32 or float64.
+            The shape of label is the same as the shape of input.
+
+    Shape:
+        input: N-D Tensor, the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions,
+            available dtype is float32, float64. The sum operation operates over all the elements.
+        label: N-D Tensor, same shape as the input.
+        output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input.
+
+    Returns:
+        The tensor variable storing the hinge_embedding_loss of input and label.
+
+    Return type: Tensor.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            import paddle.nn as nn
+
+            input_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
+            # get label with elements in {1., -1.}
+            label_np = 2 * np.random.randint(0, 2, size=(10, 10, 5)) - 1.
+ input = paddle.to_tensor(input_np) + label = paddle.to_tensor(label_np, dtype=paddle.float32) + hinge_embedding_loss = nn.HingeEmbeddingLoss(delta=1.0, reduction='mean') + loss = hinge_embedding_loss(input, label) + print(loss) """ def __init__(self, delta=1.0, reduction="mean", name=None): From 5ea300dec4a1a4653ef3d9aced3bec8a0a1db0d9 Mon Sep 17 00:00:00 2001 From: skrBang Date: Thu, 25 Nov 2021 17:04:12 +0800 Subject: [PATCH 05/23] fixed using of expired fluid api --- python/paddle/fluid/layers/loss.py | 238 ++++++++---------- .../unittests/test_hinge_embedding_loss.py | 15 +- python/paddle/nn/__init__.py | 4 +- python/paddle/nn/functional/loss.py | 20 +- python/paddle/nn/layer/loss.py | 12 +- 5 files changed, 134 insertions(+), 155 deletions(-) diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 4572910224c27..456c7b072bf3d 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -31,12 +31,24 @@ from paddle import _C_ops __all__ = [ - 'center_loss', 'bpr_loss', 'cross_entropy', 'square_error_cost', - 'edit_distance', 'warpctc', 'nce', 'hsigmoid', - 'sampled_softmax_with_cross_entropy', 'softmax_with_cross_entropy', - 'rank_loss', 'margin_rank_loss', 'sigmoid_cross_entropy_with_logits', - 'teacher_student_sigmoid_loss', 'huber_loss', 'kldiv_loss', 'npair_loss', - 'mse_loss', 'hinge_embedding_loss' + 'center_loss', + 'bpr_loss', + 'cross_entropy', + 'square_error_cost', + 'edit_distance', + 'warpctc', + 'nce', + 'hsigmoid', + 'sampled_softmax_with_cross_entropy', + 'softmax_with_cross_entropy', + 'rank_loss', + 'margin_rank_loss', + 'sigmoid_cross_entropy_with_logits', + 'teacher_student_sigmoid_loss', + 'huber_loss', + 'kldiv_loss', + 'npair_loss', + 'mse_loss', ] kIgnoreIndex = -100 @@ -52,14 +64,14 @@ def center_loss(input, :api_attr: Static Graph **Center loss Cost layer** - + This OP accepts input (deep features,the output of the last hidden layer) - and target label and return the center loss cost. The average of the - distances of each sample in the mini-batch from the center of the + and target label and return the center loss cost. The average of the + distances of each sample in the mini-batch from the center of the corresponding category is calculated as the center loss. - + For deep features, :math:`X`, and target labels, :math:`Y`, the equation is: - + .. math:: Out = \\frac{1}{2}(X - Y)^2 @@ -70,16 +82,16 @@ def center_loss(input, with shape[N x 1],where N is the batch size. Its dtype should be int32. num_classes (int): the number of classification categories. alpha (float|Variable): learning rate of centers. - param_attr (ParamAttr): Attribute initializer of centers. + param_attr (ParamAttr): Attribute initializer of centers. update_center (bool): whether to update value of center. Returns: - Variable: 2-D tensor with shape [N * 1] + Variable: 2-D tensor with shape [N * 1] Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle.fluid as fluid import paddle paddle.enable_static() @@ -479,28 +491,28 @@ def warpctc(input, Args: input (Variable): The unscaled probabilities of variable-length sequences, which is a 2-D Tensor with LoD information, or a 3-D Tensor without Lod - information. When it is a 2-D LodTensor, its shape is + information. When it is a 2-D LodTensor, its shape is `[Lp, num_classes + 1]`, where `Lp` is the sum of all input sequences' length and `num_classes` is the true number of classes. - (not including the blank label). 
When it is a 3-D Tensor, its shape + (not including the blank label). When it is a 3-D Tensor, its shape is `[max_logit_length, batch_size, num_classes + 1]`, where `max_logit_length` is the longest length of input logit sequence. The data type should be float32 or float64. label (Variable): The ground truth of variable-length sequence, which must be a 2-D Tensor with LoD information or a 3-D Tensor without - LoD information, needs to be consistent with the coressponding input. - When it is a 2-D LoDTensor, its shape is `[Lg, 1]`, where `Lg` is the sum - of all labels' length. When it is a 3-D Tensor, its shape is + LoD information, needs to be consistent with the coressponding input. + When it is a 2-D LoDTensor, its shape is `[Lg, 1]`, where `Lg` is the sum + of all labels' length. When it is a 3-D Tensor, its shape is `[batch_size, max_label_length]`, where `max_label_length` is the longest length of label sequence. Data type must be int32. blank (int, default 0): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the - half-opened interval `[0, num_classes + 1)`. The data type must be int32. + half-opened interval `[0, num_classes + 1)`. The data type must be int32. norm_by_times(bool, default false): Whether to normalize the gradients by the number of time-step, which is also the sequence's length. There is no need to normalize the gradients if warpctc layer was followed by a mean_op. - input_length(Variable): The length for each input sequence if it is + input_length(Variable): The length for each input sequence if it is of Tensor type, it should have shape `[batch_size]` and dtype int64. label_length(Variable): The length for each label sequence if it is of Tensor type, it should have shape `[batch_size]` and dtype int64. @@ -534,10 +546,10 @@ def warpctc(input, cost = fluid.layers.warpctc(input=logits, label=label) place = fluid.CPUPlace() x = fluid.create_lod_tensor( - np.random.rand(np.sum(seq_lens), class_num+1).astype("float32"), + np.random.rand(np.sum(seq_lens), class_num+1).astype("float32"), [seq_lens], place) y = fluid.create_lod_tensor( - np.random.randint(0, class_num, [np.sum(label_lens), 1]).astype("int32"), + np.random.randint(0, class_num, [np.sum(label_lens), 1]).astype("int32"), [label_lens], place) exe = fluid.Executor(place) output= exe.run(fluid.default_main_program(), @@ -650,7 +662,7 @@ def nce(input, ${comment} Args: - input (Tensor): Input tensor, 2-D tensor with shape [batch_size, dim], + input (Tensor): Input tensor, 2-D tensor with shape [batch_size, dim], and data type is float32 or float64. label (Tensor): Input label, 2-D tensor with shape [batch_size, num_true_class], and data type is int64. @@ -658,14 +670,14 @@ def nce(input, sample_weight (Tensor|None): A Tensor of shape [batch_size, 1] storing a weight for each sample. The default weight for each sample is 1.0. - param_attr (ParamAttr|None): To specify the weight parameter attribute. - Default: None, which means the default weight parameter property is + param_attr (ParamAttr|None): To specify the weight parameter attribute. + Default: None, which means the default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . - bias_attr (ParamAttr|None): To specify the bias parameter attribute. - Default: None, which means the default bias parameter property is + bias_attr (ParamAttr|None): To specify the bias parameter attribute. + Default: None, which means the default bias parameter property is used. 
See usage for details in :ref:`api_fluid_ParamAttr` . num_neg_samples (int): ${num_neg_samples_comment}. - name(str|None): For detailed information, please refer to + name(str|None): For detailed information, please refer to :ref:`api_guide_Name` . Usually name is no need to set and None by default. sampler (str, optional): The sampler used to sample class from negative classes. It can be 'uniform', 'log_uniform' or 'custom_dist'. @@ -675,7 +687,7 @@ def nce(input, custom_dist[i] is the probability of i-th class to be sampled. default: None. seed (int, optional): The seed used in sampler. Default 0, means no random seed. - is_sparse(bool, optional): The flag indicating whether to use sparse update, + is_sparse(bool, optional): The flag indicating whether to use sparse update, the weight@GRAD and bias@GRAD will be changed to SelectedRows. Default False. Returns: @@ -861,7 +873,7 @@ def hsigmoid(input, is_sparse=False): """ :api_attr: Static Graph - + The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity and speed up the model training, especially the training of language model. Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier. @@ -1026,50 +1038,50 @@ def sampled_softmax_with_cross_entropy(logits, """ **Sampled Softmax With Cross Entropy Operator.** - Cross entropy loss with sampled softmax is used as the output layer for + Cross entropy loss with sampled softmax is used as the output layer for larger output classes extensively. This operator samples a number of samples - for all examples, and computes the softmax normalized values for each - row of the sampled tensor, after which cross-entropy loss is computed. + for all examples, and computes the softmax normalized values for each + row of the sampled tensor, after which cross-entropy loss is computed. Because this operator performs a softmax on logits internally, it expects unscaled logits. This operator should not be used with the output of softmax operator since that would produce incorrect results. - + For examples with T true labels (T >= 1), we assume that each true label has a probability of 1/T. For each sample, S samples are generated using a log uniform distribution. True labels are concatenated with these samples to form T + S samples for each example. So, assume the shape of logits is - [N x K], the shape for samples is [N x (T+S)]. For each sampled label, a - probability is calculated, which corresponds to the Q(y|x) in + [N x K], the shape for samples is [N x (T+S)]. For each sampled label, a + probability is calculated, which corresponds to the Q(y|x) in [Jean et al., 2014](http://arxiv.org/abs/1412.2007). - - Logits are sampled according to the sampled labels. Then if - remove_accidental_hits is True, if a sample[i, j] accidentally hits true - labels, then the corresponding sampled_logits[i, j] is minus by 1e20 to + + Logits are sampled according to the sampled labels. Then if + remove_accidental_hits is True, if a sample[i, j] accidentally hits true + labels, then the corresponding sampled_logits[i, j] is minus by 1e20 to make its softmax result close to zero. Then sampled logits are subtracted by - logQ(y|x), these sampled logits and re-indexed labels are used to compute + logQ(y|x), these sampled logits and re-indexed labels are used to compute a softmax with cross entropy. Args: logits (Variable): The unscaled log probabilities, which is a 2-D tensor with shape [N x K]. 
N is the batch_size, and K is the class number. - label (Variable): The ground truth which is a 2-D tensor. Label is a - Tensor with shape [N x T], where T is the number of true - labels per example. - num_samples (int): The number for each example, num_samples should be + label (Variable): The ground truth which is a 2-D tensor. Label is a + Tensor with shape [N x T], where T is the number of true + labels per example. + num_samples (int): The number for each example, num_samples should be less than the number of class. num_true(int): The number of target classes per training example. - remove_accidental_hits (bool): A flag indicating whether to remove - accidental hits when sampling. If True and if a sample[i, j] - accidentally hits true labels, then the corresponding - sampled_logits[i, j] is minus by 1e20 to make its softmax result + remove_accidental_hits (bool): A flag indicating whether to remove + accidental hits when sampling. If True and if a sample[i, j] + accidentally hits true labels, then the corresponding + sampled_logits[i, j] is minus by 1e20 to make its softmax result close to zero. Default is True. use_customized_samples (bool): Whether to use custom samples and probabities to sample logits. customized_samples (Variable): User defined samples, which is a 2-D tensor - with shape [N, T + S]. S is the num_samples, and T is the number of true - labels per example. - customized_probabilities (Variable): User defined probabilities of samples, + with shape [N, T + S]. S is the num_samples, and T is the number of true + labels per example. + customized_probabilities (Variable): User defined probabilities of samples, a 2-D tensor which has the same shape with customized_samples. seed (int): The random seed for generating random number, which is used in the process of sampling. Default is 0. @@ -1156,17 +1168,17 @@ def softmax_with_cross_entropy(logits, axis=-1): r""" - This operator implements the cross entropy loss function with softmax. This function - combines the calculation of the softmax operation and the cross entropy loss function + This operator implements the cross entropy loss function with softmax. This function + combines the calculation of the softmax operation and the cross entropy loss function to provide a more numerically stable gradient. Because this operator performs a softmax on logits internally, it expects unscaled logits. This operator should not be used with the output of softmax operator since that would produce incorrect results. - When the attribute :attr:`soft_label` is set :attr:`False`, this operators - expects mutually exclusive hard labels, each sample in a batch is in exactly - one class with a probability of 1.0. Each sample in the batch will have a + When the attribute :attr:`soft_label` is set :attr:`False`, this operators + expects mutually exclusive hard labels, each sample in a batch is in exactly + one class with a probability of 1.0. Each sample in the batch will have a single label. The equation is as follows: @@ -1201,27 +1213,27 @@ def softmax_with_cross_entropy(logits, Args: logits (Tensor): A multi-dimension ``Tensor`` , and the data type is float32 or float64. The input tensor of unscaled log probabilities. label (Tensor): The ground truth ``Tensor`` , data type is the same - as the ``logits`` . If :attr:`soft_label` is set to :attr:`True`, - Label is a ``Tensor`` in the same shape with :attr:`logits`. - If :attr:`soft_label` is set to :attr:`True`, Label is a ``Tensor`` + as the ``logits`` . 
If :attr:`soft_label` is set to :attr:`True`, + Label is a ``Tensor`` in the same shape with :attr:`logits`. + If :attr:`soft_label` is set to :attr:`True`, Label is a ``Tensor`` in the same shape with :attr:`logits` expect shape in dimension :attr:`axis` as 1. soft_label (bool, optional): A flag to indicate whether to interpretant the given labels as soft labels. Default False. ignore_index (int, optional): Specifies a target value that is ignored and does not contribute to the input gradient. Only valid - if :attr:`soft_label` is set to :attr:`False`. + if :attr:`soft_label` is set to :attr:`False`. Default: kIgnoreIndex(-100). numeric_stable_mode (bool, optional): A flag to indicate whether to use a more numerically stable algorithm. Only valid - when :attr:`soft_label` is :attr:`False` - and GPU is used. When :attr:`soft_label` - is :attr:`True` or CPU is used, the + when :attr:`soft_label` is :attr:`False` + and GPU is used. When :attr:`soft_label` + is :attr:`True` or CPU is used, the algorithm is always numerically stable. Note that the speed may be slower when use stable algorithm. Default: True. return_softmax (bool, optional): A flag indicating whether to return the softmax along with the cross entropy loss. Default: False. - axis (int, optional): The index of dimension to perform softmax calculations. It + axis (int, optional): The index of dimension to perform softmax calculations. It should be in range :math:`[-1, rank - 1]`, while :math:`rank` is the rank of input :attr:`logits`. Default: -1. @@ -1294,15 +1306,15 @@ def softmax_with_cross_entropy(logits, def rank_loss(label, left, right, name=None): r""" - This operator implements the sort loss layer in the RankNet model. RankNet is a pairwise ranking model - with a training sample consisting of a pair of documents (A and B), The label (P) - indicates whether A is ranked higher than B or not. Please refer to more details: + This operator implements the sort loss layer in the RankNet model. RankNet is a pairwise ranking model + with a training sample consisting of a pair of documents (A and B), The label (P) + indicates whether A is ranked higher than B or not. Please refer to more details: `RankNet `_ Rank loss layer takes three inputs: left ( :math:`o_i` ), right ( :math:`o_j` ) and label ( :math:`P_{i,j}` ). The inputs respectively represent RankNet's output scores - for documents A and B and the value of label P. Rank loss layer takes batch inputs - with size batch_size (batch_size >= 1), P = {0, 1} or {0, 0.5, 1}, + for documents A and B and the value of label P. Rank loss layer takes batch inputs + with size batch_size (batch_size >= 1), P = {0, 1} or {0, 0.5, 1}, where 0.5 means that there is no information about the rank of the input pair. The following equation computes rank loss C_{i,j} from the inputs: @@ -1370,7 +1382,7 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None): left (Variable): Ranking score for left. Data type float32. right (Variable): Ranking score for right. Data type float32. margin (float): Indicates the given margin. - name(str|None): For detailed information, please refer to + name(str|None): For detailed information, please refer to :ref:`api_guide_Name` . Usually name is no need to set and None by default. Returns: @@ -1423,7 +1435,7 @@ def sigmoid_cross_entropy_with_logits(x, as log(p/(1-p)) The data type should be float32 or float64. label (Tensor): a 2-D tensor of the same type and shape as X. This input is a tensor of probabalistic labels for each logit. 
- ignore_index(int): Specifies a target value that is ignored and + ignore_index(int): Specifies a target value that is ignored and does not contribute to the input gradient. name(str|None): The default value is None. Normally there is no need for user to set this property. For more information, @@ -1442,7 +1454,7 @@ def sigmoid_cross_entropy_with_logits(x, input = paddle.rand(shape=[10], dtype='float32') label = paddle.rand(shape=[10], dtype='float32') - loss = paddle.fluid.layers.sigmoid_cross_entropy_with_logits(input, label, + loss = paddle.fluid.layers.sigmoid_cross_entropy_with_logits(input, label, ignore_index=-1, normalize=True) print(loss) """ @@ -1493,7 +1505,7 @@ def teacher_student_sigmoid_loss(input, Examples: .. code-block:: python - + import paddle.fluid as fluid import paddle paddle.enable_static() @@ -1520,7 +1532,7 @@ def teacher_student_sigmoid_loss(input, 'Label': [label]}, outputs={'Y': [out]}, attrs={"soft_max_lower_bound": float(soft_max_lower_bound), \ - "soft_max_up_bound": float(soft_max_up_bound)}) + "soft_max_up_bound": float(soft_max_up_bound)}) return out @@ -1611,22 +1623,22 @@ def kldiv_loss(x, target, reduction='mean', name=None): import paddle import paddle.fluid as fluid - + x = paddle.rand(shape=[3,4,2,2], dtype='float32') target = paddle.rand(shape=[3,4,2,2], dtype='float32') # 'batchmean' reduction, loss shape will be [1] loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='batchmean') print(loss.shape) # shape=[1] - + # 'mean' reduction, loss shape will be [1] loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='mean') print(loss.shape) # shape=[1] - + # 'sum' reduction, loss shape will be [1] loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='sum') print(loss.shape) # shape=[1] - + # 'none' reduction, loss shape is same with X shape loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='none') print(loss.shape) # shape=[3, 4, 2, 2] @@ -1652,42 +1664,42 @@ def kldiv_loss(x, target, reduction='mean', name=None): def npair_loss(anchor, positive, labels, l2_reg=0.002): - """ - + """ + Npair loss requires paired data. Npair loss has two parts: the first part is L2 regularizer on the embedding vector; the second part is cross entropy loss which takes the similarity matrix of anchor and positive as logits. - + For more information, please refer to: `Improved Deep Metric Learning with Multi class N pair Loss Objective `_ - + Args: - anchor(Tensor): embedding vector for the anchor image. shape=[batch_size, embedding_dims], + anchor(Tensor): embedding vector for the anchor image. shape=[batch_size, embedding_dims], the data type is float32 or float64. - positive(Tensor): embedding vector for the positive image. shape=[batch_size, embedding_dims], + positive(Tensor): embedding vector for the positive image. shape=[batch_size, embedding_dims], the data type is float32 or float64. labels(Tensor): 1-D tensor. shape=[batch_size], the data type is float32 or float64 or int64. l2_reg(float32): L2 regularization term on embedding vector, default: 0.002. - + Returns: A Tensor representing the npair loss, the data type is the same as anchor, the shape is [1]. - + Examples: .. 
code-block:: python - + import paddle - + DATATYPE = "float32" - + anchor = paddle.rand(shape=(18, 6), dtype=DATATYPE) positive = paddle.rand(shape=(18, 6), dtype=DATATYPE) labels = paddle.rand(shape=(18,), dtype=DATATYPE) - + npair_loss = paddle.nn.functional.npair_loss(anchor, positive, labels, l2_reg = 0.002) print(npair_loss) - + """ check_variable_and_dtype(anchor, 'anchor', ['float32', 'float64'], 'npair_loss') @@ -1726,10 +1738,10 @@ def mse_loss(input, label): The loss can be described as: .. math:: - + Out = MEAN((input - label)^2) - Parameters: + Parameters: input (Tensor): Input tensor, the data type should be float32. label (Tensor): Label tensor, the data type should be float32. @@ -1737,7 +1749,7 @@ def mse_loss(input, label): Tensor: The tensor storing the mean square error difference of input and label. Return type: Tensor. - + Examples: .. code-block:: python @@ -1751,37 +1763,3 @@ def mse_loss(input, label): check_variable_and_dtype(input, "input", ['float32', 'float64'], 'mse_loss') check_variable_and_dtype(label, "label", ['float32', 'float64'], 'mse_loss') return nn.reduce_mean(square_error_cost(input, label)) - - -def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): - """ - - Returns: - - """ - - if reduction not in ['sum', 'mean', 'none']: - raise ValueError( - "'reduction' in 'hinge_embedding_loss' should be 'sum', 'mean' or 'none', " - "but received {}.".format(reduction)) - - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'hinge_embedding_loss') - check_variable_and_dtype(label, 'label', ['float32', 'float64'], - 'hinge_embedding_loss') - - if (label == 1.).all(): - loss = input - elif (label == -1.).all(): - loss = paddle.maximum(paddle.to_tensor(0.), delta - input) - else: - raise ValueError("'label' should contain 1. or -1., " - "but received label containing {}.".format( - label.unique())) - - if reduction == 'mean': - return paddle.mean(loss, name=name) - elif reduction == 'sum': - return paddle.sum(loss, name=name) - elif reduction == 'none': - return loss diff --git a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py index d718eca9b7471..0e1a8cff750ea 100644 --- a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py +++ b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,7 +15,6 @@ from __future__ import print_function import paddle -import paddle.fluid as fluid import numpy as np import unittest @@ -59,14 +58,14 @@ def run_dynamic_check(self): self.assertTrue(dy_result.shape, self.shape) def test_cpu(self): - paddle.disable_static(place=paddle.fluid.CPUPlace()) + paddle.disable_static(place=paddle.CPUPlace()) self.run_dynamic_check() def test_gpu(self): - if not fluid.core.is_compiled_with_cuda(): + if not paddle.is_compiled_with_cuda(): return - paddle.disable_static(place=paddle.fluid.CUDAPlace(0)) + paddle.disable_static(place=paddle.CUDAPlace(0)) self.run_dynamic_check() # test case the raise message @@ -127,14 +126,14 @@ def run_dynamic_check(self): self.assertTrue(dy_result.shape, self.shape) def test_cpu(self): - paddle.disable_static(place=paddle.fluid.CPUPlace()) + paddle.disable_static(place=paddle.CPUPlace()) self.run_dynamic_check() def test_gpu(self): - if not fluid.core.is_compiled_with_cuda(): + if not paddle.is_compiled_with_cuda(): return - paddle.disable_static(place=paddle.fluid.CUDAPlace(0)) + paddle.disable_static(place=paddle.CUDAPlace(0)) self.run_dynamic_check() # test case the raise message diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 1abe74e9783dc..ff6641098462a 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -101,6 +101,7 @@ from .layer.loss import MarginRankingLoss # noqa: F401 from .layer.loss import CTCLoss # noqa: F401 from .layer.loss import SmoothL1Loss # noqa: F401 +from .layer.loss import HingeEmbeddingLoss # noqa: F401 from .layer.norm import BatchNorm # noqa: F401 from .layer.norm import SyncBatchNorm # noqa: F401 from .layer.norm import GroupNorm # noqa: F401 @@ -295,5 +296,6 @@ def weight_norm(*args): 'ELU', 'ReLU6', 'LayerDict', - 'ZeroPad2D' + 'ZeroPad2D', + 'HingeEmbeddingLoss' ] diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index cafec9d68ffd1..c998b00d9b989 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -2086,12 +2086,12 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): label (Tensor): Label tensor containing 1 or -1, the data type is float32 or float64. The shape of labelis the same as the shape of input. delta (float, optional): Has a default value of `1`. - reduction (string, optional): Specifies the reduction to apply to the output: - ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, - ``'mean'``: the sum of the output will be divided by the number of - elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` - and :attr:`reduce` are in the process of being deprecated, and in the meantime, - specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` + reduction (str, optional): Indicate how to average the loss by batch_size, + the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + If :attr:`reduction` is ``'none'``, the unreduced loss is returned; + If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; + If :attr:`reduction` is ``'sum'``, the summed loss is returned. + Default is ``'sum'``. Default: ``'mean'`` name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
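# A minimal NumPy sketch of the reduction semantics documented above for
# hinge_embedding_loss, assuming `label` holds only 1. and -1. as the
# implementation below requires; the helper name `hinge_embedding_loss_ref`
# is hypothetical and used only for illustration.
import numpy as np

def hinge_embedding_loss_ref(x, y, delta=1.0, reduction='mean'):
    # element-wise: x where y == 1., max(0, delta - x) where y == -1.
    loss = np.where(y == 1., x, np.maximum(0., delta - x))
    if reduction == 'mean':
        return loss.mean()
    if reduction == 'sum':
        return loss.sum()
    return loss  # reduction == 'none'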
@@ -2128,10 +2128,10 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): "but received {}.".format(reduction)) if not paddle.fluid.framework.in_dygraph_mode(): - paddle.fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'hinge_embedding_loss') - paddle.fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], 'hinge_embedding_loss') + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'hinge_embedding_loss') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'hinge_embedding_loss') if set(label.unique().numpy()) <= {1., -1.}: loss = paddle.where( diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 7202964b33fff..d656a025a3022 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -1233,12 +1233,12 @@ class HingeEmbeddingLoss(Layer): Parameters: delta (float, optional): Has a default value of `1`. - reduction (string, optional): Specifies the reduction to apply to the output: - ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, - ``'mean'``: the sum of the output will be divided by the number of - elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` - and :attr:`reduce` are in the process of being deprecated, and in the meantime, - specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` + reduction (str, optional): Indicate how to average the loss by batch_size, + the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + If :attr:`reduction` is ``'none'``, the unreduced loss is returned; + If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; + If :attr:`reduction` is ``'sum'``, the summed loss is returned. + Default is ``'sum'``. Default: ``'mean'`` name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. From c6bd8d48b6d1fc9c1df00d52c99b1b96726be714 Mon Sep 17 00:00:00 2001 From: skrBang Date: Thu, 25 Nov 2021 17:33:20 +0800 Subject: [PATCH 06/23] fix doc --- python/paddle/nn/functional/loss.py | 2 +- python/paddle/nn/layer/loss.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index c998b00d9b989..2288db9954420 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -2091,7 +2091,7 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`reduction` is ``'sum'``, the summed loss is returned. - Default is ``'sum'``. Default: ``'mean'`` + Default: ``'mean'`` name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index d656a025a3022..25330a16e9562 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -1238,7 +1238,7 @@ class HingeEmbeddingLoss(Layer): If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`reduction` is ``'sum'``, the summed loss is returned. - Default is ``'sum'``. 
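# A hedged usage sketch of the layer form whose `reduction` choices are
# described here, following the example shown earlier in this series
# (paddle.nn names as added by this patch; the tensor values are arbitrary
# illustrative data):
#
#     import paddle
#     import paddle.nn as nn
#
#     input = paddle.to_tensor([[1., -2.], [3., -1.]])
#     label = paddle.to_tensor([[1., -1.], [-1., 1.]])
#     mean_loss = nn.HingeEmbeddingLoss(delta=1.0, reduction='mean')(input, label)
#     per_elem  = nn.HingeEmbeddingLoss(delta=1.0, reduction='none')(input, label)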
Default: ``'mean'`` + Default: ``'mean'`` name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. From 341ba5b387e78f3249383da03b20ef934b2a2946 Mon Sep 17 00:00:00 2001 From: skrBang Date: Thu, 25 Nov 2021 18:25:12 +0800 Subject: [PATCH 07/23] fix doc and rm python/paddle/fluid/layers/loss.py --- python/paddle/fluid/layers/loss.py | 1765 --------------------------- python/paddle/nn/functional/loss.py | 5 +- python/paddle/nn/layer/loss.py | 5 +- 3 files changed, 8 insertions(+), 1767 deletions(-) delete mode 100644 python/paddle/fluid/layers/loss.py diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py deleted file mode 100644 index 456c7b072bf3d..0000000000000 --- a/python/paddle/fluid/layers/loss.py +++ /dev/null @@ -1,1765 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import numpy as np -from functools import partial, reduce -import paddle -from paddle.utils import deprecated -from . import nn -from .layer_function_generator import templatedoc -from ..layer_helper import LayerHelper -from ..framework import Variable, in_dygraph_mode, static_only -from .. import core -from ..data_feeder import check_variable_and_dtype, check_type -from ..param_attr import ParamAttr -from ..initializer import NumpyArrayInitializer, Constant -from .. import core -import warnings -from paddle import _C_ops - -__all__ = [ - 'center_loss', - 'bpr_loss', - 'cross_entropy', - 'square_error_cost', - 'edit_distance', - 'warpctc', - 'nce', - 'hsigmoid', - 'sampled_softmax_with_cross_entropy', - 'softmax_with_cross_entropy', - 'rank_loss', - 'margin_rank_loss', - 'sigmoid_cross_entropy_with_logits', - 'teacher_student_sigmoid_loss', - 'huber_loss', - 'kldiv_loss', - 'npair_loss', - 'mse_loss', -] - -kIgnoreIndex = -100 - - -def center_loss(input, - label, - num_classes, - alpha, - param_attr, - update_center=True): - r""" - :api_attr: Static Graph - - **Center loss Cost layer** - - This OP accepts input (deep features,the output of the last hidden layer) - and target label and return the center loss cost. The average of the - distances of each sample in the mini-batch from the center of the - corresponding category is calculated as the center loss. - - For deep features, :math:`X`, and target labels, :math:`Y`, the equation is: - - .. math:: - - Out = \\frac{1}{2}(X - Y)^2 - - Args: - input (Variable): a 2-D tensor with shape[N x M]. Its dtype should be float32 or float64. - label (Variable): the groud truth which is a 2-D tensor - with shape[N x 1],where N is the batch size. Its dtype should be int32. - num_classes (int): the number of classification categories. - alpha (float|Variable): learning rate of centers. - param_attr (ParamAttr): Attribute initializer of centers. - update_center (bool): whether to update value of center. 
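# With python/paddle/fluid/layers/loss.py deleted in this change,
# hinge_embedding_loss is reachable only through the paddle.nn namespace.
# A minimal sketch of the retained functional entry point, assuming the
# argument names documented above:
#
#     import paddle.nn.functional as F
#
#     out = F.hinge_embedding_loss(input, label, delta=1.0, reduction='mean')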
- - Returns: - Variable: 2-D tensor with shape [N * 1] - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - paddle.enable_static() - - input = fluid.data(name='x',shape=[20,30],dtype='float32') - label = fluid.data(name='y',shape=[20,1],dtype='int64') - num_classes = 1000 - alpha = 0.01 - param_attr = fluid.initializer.Xavier(uniform=False) - center_loss=fluid.layers.center_loss(input=input, - label=label, - num_classes=1000, - alpha=alpha, - param_attr=fluid.initializer.Xavier(uniform=False), - update_center=True) - """ - helper = LayerHelper('center_loss', **locals()) - dtype = helper.input_dtype() - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'center_loss') - check_variable_and_dtype(label, 'label', ['int32', 'int64'], 'center_loss') - - centers_shape = [num_classes, input.shape[1]] - centers_param = helper.create_parameter( - attr=param_attr, shape=centers_shape, dtype=dtype) - centers_param.stop_gradient = True - - if isinstance(alpha, Variable): - alpha_param = alpha - check_variable_and_dtype(alpha, 'alpha', ['float32', 'float64'], - 'center_loss') - else: - assert isinstance(alpha, float) - alpha_param = helper.create_variable( - name="centerloss_alpha", - shape=[1], - dtype="float32", - type=core.VarDesc.VarType.LOD_TENSOR, - persistable=True, - stop_gradient=True, - initializer=Constant(alpha)) - - centersdiff = helper.create_variable_for_type_inference(dtype=input.dtype) - loss = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op( - type='center_loss', - inputs={ - 'X': [input], - 'Label': [label], - 'Centers': [centers_param], - 'CenterUpdateRate': [alpha_param] - }, - outputs={ - 'SampleCenterDiff': [centersdiff], - 'Loss': [loss], - 'CentersOut': [centers_param] - }, - attrs={'cluster_num': num_classes, - 'need_update': update_center}) - return loss - - -def bpr_loss(input, label, name=None): - r""" - - **Bayesian Personalized Ranking Loss Operator** - - This operator belongs to pairwise ranking loss. Label is the desired item. - The loss at a given point in one session is defined as: - - .. math:: - Y[i] = 1/(N[i] - 1) * \sum_j{\log(\sigma(X[i, Label[i]]-X[i, j]))} - - Learn more details by reading paper . - - Args: - input (Variable|list): a 2-D tensor with shape [N x D], where N is the - batch size and D is the number of positive classes and negative classes - This input is not probability but logits. - label (Variable|list): the ground truth which is a 2-D tensor. `label` - is a tensor with shape [N x 1]. - name (str|None): A name for this layer(optional). If set None, the - layer will be named automatically. Default: None. - Returns: - A 2-D tensor with shape [N x 1], the bpr loss. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle - - paddle.enable_static() - - neg_size = 10 - label = fluid.data( - name="label", shape=[3, 1], dtype="int64") - predict = fluid.data( - name="predict", shape=[3, neg_size + 1], dtype="float32") - cost = fluid.layers.bpr_loss(input=predict, label=label) - """ - helper = LayerHelper('bpr_loss', **locals()) - out = helper.create_variable_for_type_inference(dtype=input.dtype) - check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], - 'bpr_loss') - helper.append_op( - type='bpr_loss', - inputs={'X': [input], - 'Label': [label]}, - outputs={'Y': [out]}) - return out - - -def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): - r""" - :alias_main: paddle.nn.functional.cross_entropy - :alias: paddle.nn.functional.cross_entropy,paddle.nn.functional.loss.cross_entropy - :old_api: paddle.fluid.layers.cross_entropy - - This operator computes the cross entropy between input and label. It - supports both hard-label and and soft-label cross entropy computation. - - 1. Hard-label cross entropy: if soft_label=False, :math:`label[i_1, i_2, ..., i_k]` - is the hard label of each sample. - - .. math:: - - output[i_1, i_2, ..., i_k]=-log(input[i_1, i_2, ..., i_k, j]), label[i_1, i_2, ..., i_k] = j, j != ignore\_index - - 2. Soft-label cross entropy: if soft_label=True, :math:`label[i_1, i_2, ..., i_k, j]` - is the soft label of each sample corresponding to the j-th class. - - .. math:: - - output[i_1, i_2, ..., i_k]= -\sum_{j}label[i_1,i_2,...,i_k,j]*log(input[i_1, i_2, ..., i_k,j]) - - Args: - input (Variable): a multidimensional Tensor with shape - :math:`[N_1, N_2, ..., N_k, D]`, where the last dimension D is - the class number. The data type should be float32 or float64. - label (Variable): label value corresponding to input. If - soft_label=False, the dimension of label should be :math:`[N_1, N_2, ..., N_k]` - or :math:`[N_1, N_2, ..., N_k, 1]` , and its data type should be int64, - and the value must be inside [0, D). If soft_label=True, the shape, - data type of label should be the same with input, and the sum of - soft label value of each sample should be 1. - soft_label (bool): indicate whether label is soft. Default False, meaning that - the label is hard. If soft_label=True, the label is soft. - ignore_index (int): specify an ignorable label value. The ignored label would be - omitted when computing. If it is a negative integer, no label would - be ignored. Only valid when soft_label=False. Default -100. - - Returns: - A Variable holding Tensor representing the cross entropy, whose data type is the same with input. - If soft_label=False, the shape of output is the same with label. - If soft_label=True, the shape of output is :math:`[N_1, N_2, ..., N_k, 1]` . - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - class_num = 7 - x = fluid.data(name='x', shape=[None, 3, 10], dtype='float32') - label = fluid.data(name='label', shape=[None, 1], dtype='int64') - predict = fluid.layers.fc(input=x, size=class_num, act='softmax') - cost = fluid.layers.cross_entropy(input=predict, label=label) - """ - if not soft_label: - return cross_entropy2(input, label, ignore_index) - - if in_dygraph_mode(): - return _C_ops.cross_entropy(input, label, "soft_label", soft_label, - "ignore_index", ignore_index) - - inputs = {'X': [input], 'Label': [label]} - attrs = {"soft_label": soft_label, "ignore_index": ignore_index} - - check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], - 'cross_entropy') - helper = LayerHelper('cross_entropy', **locals()) - out = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op( - type='cross_entropy', inputs=inputs, outputs={'Y': [out]}, attrs=attrs) - return out - - -def cross_entropy2(input, label, ignore_index=kIgnoreIndex): - if in_dygraph_mode(): - loss, _, _ = _C_ops.cross_entropy2(input, label, 'ignore_index', - ignore_index) - return loss - - inputs = {'X': [input], 'Label': [label]} - attrs = {'ignore_index': ignore_index} - check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], - 'cross_entropy2') - helper = LayerHelper('cross_entropy2', **locals()) - out = helper.create_variable_for_type_inference(dtype=input.dtype) - xshape = helper.create_variable_for_type_inference(dtype=input.dtype) - match_x = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op( - type='cross_entropy2', - inputs=inputs, - outputs={'Y': [out], - 'MatchX': [match_x], - 'XShape': [xshape]}, - attrs=attrs) - return out - - -def square_error_cost(input, label): - r""" - - This op accepts input predictions and target label and returns the - squared error cost. - - For predictions label, and target label, the equation is: - - .. math:: - - Out = (input - label)^2 - - Parameters: - input (Tensor): Input tensor, the data type should be float32. - label (Tensor): Label tensor, the data type should be float32. - - Returns: - The tensor storing the element-wise squared error \ - difference between input and label. - - Return type: Tensor. - - Examples: - - .. 
code-block:: python - - import paddle - input = paddle.to_tensor([1.1, 1.9]) - label = paddle.to_tensor([1.0, 2.0]) - output = paddle.nn.functional.square_error_cost(input, label) - print(output) - # [0.01, 0.01] - - """ - if in_dygraph_mode(): - minus_out = _C_ops.elementwise_sub(input, label) - square_out = _C_ops.square(minus_out) - return square_out - - check_variable_and_dtype(input, "input", ['float32', 'float64'], - 'square_error_cost') - check_variable_and_dtype(label, "label", ['float32', 'float64'], - 'square_error_cost') - helper = LayerHelper('square_error_cost', **locals()) - minus_out = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op( - type='elementwise_sub', - inputs={'X': [input], - 'Y': [label]}, - outputs={'Out': [minus_out]}) - - square_out = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op( - type='square', inputs={'X': [minus_out]}, - outputs={'Out': [square_out]}) - return square_out - - -def edit_distance(input, - label, - normalized=True, - ignored_tokens=None, - input_length=None, - label_length=None): - """ - This op computes the edit distances, also called Levenshtein distance, between a batch of - hypothesis strings and their references. It measures how dissimilar two strings are by counting - the minimum number of operations to transform one string into another. - The operations include insertion, deletion, and substitution. - - For example, given hypothesis string A = "kitten" and reference - B = "sitting", A will be transformed into B - at least after two substitutions and one insertion: - - "kitten" -> "sitten" -> "sittin" -> "sitting" - - So the edit distance between A and B is 3. - - The input is a Tensor, the input_length and label_length should be supported. - - The `batch_size` of labels should be same as `input`. - - The output include the edit distance value between every pair of input and related label, and the number of sequence. - If Attr(normalized) is true, - the edit distance value will be divided by the length of label. - - Parameters: - input(Tensor): The input tensor, its rank should be equal to 2 and its data type should be int64. - label(Tensor): The label tensor, its rank should be equal to 2 and its data type should be int64. - normalized(bool, default True): Indicated whether to normalize the edit distance. - ignored_tokens(list, default None): Tokens that will be removed before - calculating edit distance. - input_length(Tensor): The length for each sequence in `input` if it's of Tensor type, it should have shape `(batch_size, )` and its data type should be int64. - label_length(Tensor): The length for each sequence in `label` if it's of Tensor type, it should have shape `(batch_size, )` and its data type should be int64. - NOTE: To be avoid unexpected result, the value of every elements in input_length and label_length should be equal to the value of the second dimension of input and label. For example, The input: [[1,2,3,4],[5,6,7,8],[9,10,11,12]], the shape of input is [3,4] and the input_length should be [4,4,4] - NOTE: This Api is different from fluid.metrics.EditDistance - - Returns: - Tuple: - - distance(Tensor): edit distance result, its data type is float32, and its shape is (batch_size, 1). - sequence_num(Tensor): sequence number, its data type is float32, and its shape is (1,). - - Examples: - .. 
code-block:: python - - import paddle - import paddle.nn.functional as F - - input = paddle.to_tensor([[1,2,3],[4,5,6],[4,4,4],[1,1,1]], dtype='int64') - label = paddle.to_tensor([[1,3,4,1],[4,5,8,1],[7,7,7,1],[1,1,1,1]], dtype='int64') - input_len = paddle.to_tensor([3,3,3,3], dtype='int64') - label_len = paddle.to_tensor([4,4,4,4], dtype='int64') - - distance, sequence_num = F.loss.edit_distance(input=input, label=label, input_length=input_len, label_length=label_len, normalized=False) - - # print(distance) - # [[3.] - # [2.] - # [4.] - # [1.]] - # if set normalized to True - # [[0.75] - # [0.5 ] - # [1. ] - # [0.25] - # - # print(sequence_num) - # [4] - - """ - check_variable_and_dtype(input, 'input', ['int64'], 'edit_distance') - check_variable_and_dtype(label, 'label', ['int64'], 'edit_distance') - helper = LayerHelper("edit_distance", **locals()) - - # remove some tokens from input and labels - if ignored_tokens is not None and len(ignored_tokens) > 0: - erased_input = helper.create_variable_for_type_inference(dtype="int64") - erased_label = helper.create_variable_for_type_inference(dtype="int64") - - helper.append_op( - type="sequence_erase", - inputs={"X": [input]}, - outputs={"Out": [erased_input]}, - attrs={"tokens": ignored_tokens}) - input = erased_input - - helper.append_op( - type="sequence_erase", - inputs={"X": [label]}, - outputs={"Out": [erased_label]}, - attrs={"tokens": ignored_tokens}) - label = erased_label - - this_inputs = {"Hyps": [input], "Refs": [label]} - if input_length is not None and label_length is not None: - this_inputs['HypsLength'] = [input_length] - this_inputs['RefsLength'] = [label_length] - - # edit distance op - edit_distance_out = helper.create_variable_for_type_inference(dtype="int64") - sequence_num = helper.create_variable_for_type_inference(dtype="int64") - helper.append_op( - type="edit_distance", - inputs=this_inputs, - outputs={"Out": [edit_distance_out], - "SequenceNum": [sequence_num]}, - attrs={"normalized": normalized}) - - return edit_distance_out, sequence_num - - -def warpctc(input, - label, - blank=0, - norm_by_times=False, - input_length=None, - label_length=None): - """ - An operator integrating the open source Warp-CTC library - (https://github.com/baidu-research/warp-ctc) - to compute Connectionist Temporal Classification (CTC) loss. - It can be aliased as softmax with CTC, since a native softmax activation is - interated to the Warp-CTC library to normalize values for each row of the - input tensor. - - Args: - input (Variable): The unscaled probabilities of variable-length sequences, - which is a 2-D Tensor with LoD information, or a 3-D Tensor without Lod - information. When it is a 2-D LodTensor, its shape is - `[Lp, num_classes + 1]`, where `Lp` is the sum of all input - sequences' length and `num_classes` is the true number of classes. - (not including the blank label). When it is a 3-D Tensor, its shape - is `[max_logit_length, batch_size, num_classes + 1]`, - where `max_logit_length` is the longest length of - input logit sequence. The data type should be float32 or float64. - label (Variable): The ground truth of variable-length sequence, - which must be a 2-D Tensor with LoD information or a 3-D Tensor without - LoD information, needs to be consistent with the coressponding input. - When it is a 2-D LoDTensor, its shape is `[Lg, 1]`, where `Lg` is the sum - of all labels' length. When it is a 3-D Tensor, its shape is - `[batch_size, max_label_length]`, where `max_label_length` is the longest - length of label sequence. 
Data type must be int32. - blank (int, default 0): The blank label index of Connectionist - Temporal Classification (CTC) loss, which is in the - half-opened interval `[0, num_classes + 1)`. The data type must be int32. - norm_by_times(bool, default false): Whether to normalize the gradients - by the number of time-step, which is also the sequence's length. - There is no need to normalize the gradients if warpctc layer was - followed by a mean_op. - input_length(Variable): The length for each input sequence if it is - of Tensor type, it should have shape `[batch_size]` and dtype int64. - label_length(Variable): The length for each label sequence if it is - of Tensor type, it should have shape `[batch_size]` and dtype int64. - - Returns: - Variable: The Connectionist Temporal Classification (CTC) loss, - which is a 2-D Tensor with the shape `[batch_size, 1]`. - The date type is the same as input. - - Examples: - - .. code-block:: python - - # using LoDTensor - import paddle - import paddle.fluid as fluid - import numpy as np - - # lengths of logit sequences - seq_lens = [2,6] - # lengths of label sequences - label_lens = [2,3] - # class num - class_num = 5 - - paddle.enable_static() - logits = fluid.data(name='logits',shape=[None, class_num+1], - dtype='float32',lod_level=1) - label = fluid.data(name='label', shape=[None, 1], - dtype='int32', lod_level=1) - cost = fluid.layers.warpctc(input=logits, label=label) - place = fluid.CPUPlace() - x = fluid.create_lod_tensor( - np.random.rand(np.sum(seq_lens), class_num+1).astype("float32"), - [seq_lens], place) - y = fluid.create_lod_tensor( - np.random.randint(0, class_num, [np.sum(label_lens), 1]).astype("int32"), - [label_lens], place) - exe = fluid.Executor(place) - output= exe.run(fluid.default_main_program(), - feed={"logits": x,"label": y}, - fetch_list=[cost.name]) - print(output) - - .. code-block:: python - - # using Tensor - import paddle - import paddle.fluid as fluid - import numpy as np - - # length of the longest logit sequence - max_seq_length = 5 - #length of the longest label sequence - max_label_length = 3 - # number of logit sequences - batch_size = 16 - # class num - class_num = 5 - paddle.enable_static() - logits = fluid.data(name='logits', - shape=[max_seq_length, batch_size, class_num+1], - dtype='float32') - logits_length = fluid.data(name='logits_length', shape=[None], - dtype='int64') - label = fluid.data(name='label', shape=[batch_size, max_label_length], - dtype='int32') - label_length = fluid.data(name='labels_length', shape=[None], - dtype='int64') - cost = fluid.layers.warpctc(input=logits, label=label, - input_length=logits_length, - label_length=label_length) - place = fluid.CPUPlace() - x = np.random.rand(max_seq_length, batch_size, class_num+1).astype("float32") - y = np.random.randint(0, class_num, [batch_size, max_label_length]).astype("int32") - exe = fluid.Executor(place) - output= exe.run(fluid.default_main_program(), - feed={"logits": x, - "label": y, - "logits_length": np.array([max_seq_length]*batch_size).astype("int64"), - "labels_length": np.array([max_label_length]*batch_size).astype("int64")}, - fetch_list=[cost.name]) - print(output) - """ - if in_dygraph_mode(): - if input_length is None or label_length is None: - raise ValueError( - "input_length and label_length must not be None in dygraph mode!" 
- ) - grad, loss_out = _C_ops.warpctc( - input, - label, - input_length, - label_length, - 'blank', - blank, - 'norm_by_times', - norm_by_times, ) - return loss_out - helper = LayerHelper('warpctc', **locals()) - check_variable_and_dtype(input, 'input', ['float32', 'float64'], "warpctc") - check_variable_and_dtype(label, 'label', ['int32'], "warpctc") - this_inputs = {'Logits': [input], 'Label': [label]} - if input_length is not None and label_length is not None: - check_variable_and_dtype(input_length, 'LogitsLength', ['int64'], - "warpctc") - check_variable_and_dtype(label_length, 'LabelLength', ['int64'], - "warpctc") - this_inputs['LogitsLength'] = [input_length] - this_inputs['LabelLength'] = [label_length] - - loss_out = helper.create_variable_for_type_inference(dtype=input.dtype) - grad_out = helper.create_variable_for_type_inference(dtype=input.dtype) - - helper.append_op( - type='warpctc', - inputs=this_inputs, - outputs={'WarpCTCGrad': [grad_out], - 'Loss': [loss_out]}, - attrs={ - 'blank': blank, - 'norm_by_times': norm_by_times, - }) - return loss_out - - -# FIXME(wuyi): let docstring_checker.py understand @autodoc. -# For now, the comments in c++ use types like Tensor, but in python side -# the type is often "Variable", and arguments may vary. -@static_only -@templatedoc(op_type="nce") -def nce(input, - label, - num_total_classes, - sample_weight=None, - param_attr=None, - bias_attr=None, - num_neg_samples=None, - name=None, - sampler="uniform", - custom_dist=None, - seed=0, - is_sparse=False): - """ - :api_attr: Static Graph - - ${comment} - - Args: - input (Tensor): Input tensor, 2-D tensor with shape [batch_size, dim], - and data type is float32 or float64. - label (Tensor): Input label, 2-D tensor with shape [batch_size, num_true_class], - and data type is int64. - num_total_classes (int):${num_total_classes_comment}. - sample_weight (Tensor|None): A Tensor of shape [batch_size, 1] - storing a weight for each sample. The default weight for each - sample is 1.0. - param_attr (ParamAttr|None): To specify the weight parameter attribute. - Default: None, which means the default weight parameter property is - used. See usage for details in :ref:`api_fluid_ParamAttr` . - bias_attr (ParamAttr|None): To specify the bias parameter attribute. - Default: None, which means the default bias parameter property is - used. See usage for details in :ref:`api_fluid_ParamAttr` . - num_neg_samples (int): ${num_neg_samples_comment}. - name(str|None): For detailed information, please refer to - :ref:`api_guide_Name` . Usually name is no need to set and None by default. - sampler (str, optional): The sampler used to sample class from negative classes. - It can be 'uniform', 'log_uniform' or 'custom_dist'. - default: 'uniform'. - custom_dist (nd.array|None): A numpy ndarray with size=num_total_classes. - It is used when sampler is set to 'custom_dist'. - custom_dist[i] is the probability of i-th class to be sampled. - default: None. - seed (int, optional): The seed used in sampler. Default 0, means no random seed. - is_sparse(bool, optional): The flag indicating whether to use sparse update, - the weight@GRAD and bias@GRAD will be changed to SelectedRows. Default False. - - Returns: - Tensor: The output nce loss. - - Examples: - .. 
code-block:: python - - - import paddle - import numpy as np - - paddle.enable_static() - - window_size = 5 - words = [] - for i in range(window_size): - words.append(paddle.static.data( - name='word_{0}'.format(i), shape=[-1, 1], dtype='int64')) - - dict_size = 10000 - label_word = int(window_size / 2) + 1 - - embs = [] - for i in range(window_size): - if i == label_word: - continue - - emb = paddle.static.nn.embedding(input=words[i], size=[dict_size, 32], - param_attr='embed', is_sparse=True) - embs.append(emb) - - embs = paddle.concat(x=embs, axis=1) - loss = paddle.static.nn.nce(input=embs, label=words[label_word], - num_total_classes=dict_size, param_attr='nce.w_0', - bias_attr='nce.b_0') - - #or use custom distribution - dist = np.array([0.05,0.5,0.1,0.3,0.05]) - loss = paddle.static.nn.nce(input=embs, label=words[label_word], - num_total_classes=5, param_attr='nce.w_1', - bias_attr='nce.b_1', - num_neg_samples=3, - sampler="custom_dist", - custom_dist=dist) - """ - helper = LayerHelper('nce', **locals()) - check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'nce') - check_variable_and_dtype(label, 'label', ['int64'], 'nce') - - dim = input.shape[1] - num_true_class = label.shape[1] - w = helper.create_parameter( - attr=helper.param_attr, - shape=[num_total_classes, dim], - is_bias=False, - dtype=input.dtype) - inputs = {} - if helper.bias_attr: - b = helper.create_parameter( - attr=helper.bias_attr, - shape=[num_total_classes, 1], - is_bias=True, - dtype=input.dtype) - inputs['Bias'] = b - cost = helper.create_variable_for_type_inference(dtype=input.dtype) - sample_logits = helper.create_variable_for_type_inference(dtype=input.dtype) - sample_labels = helper.create_variable_for_type_inference(dtype=label.dtype) - - inputs['Input'] = input - inputs['Label'] = label - inputs['Weight'] = w - inputs['SampleWeight'] = sample_weight if sample_weight is not None else [] - - if sampler == "uniform": - sampler = 0 - elif sampler == "log_uniform": - sampler = 1 - elif sampler == "custom_dist": - assert custom_dist is not None - - custom_dist_len = num_total_classes - alias_probs_ = [0] * custom_dist_len - alias_ = [0] * custom_dist_len - bigs = [] - littles = [] - for i in range(custom_dist_len): - normal_prob = custom_dist[i] * custom_dist_len - if normal_prob - 1.0 > 0: - bigs.append((i, normal_prob)) - elif 1.0 - normal_prob > 0: - littles.append((i, normal_prob)) - else: - alias_probs_[i] = normal_prob - alias_[i] = -1 - - while len(bigs) and len(littles): - big = bigs.pop(0) - little = littles.pop(0) - - big_idx = big[0] - big_prob = big[1] - - alias_probs_[little[0]] = little[1] - alias_[little[0]] = big_idx - big_left = big[1] + little[1] - 1 - if big_left - 1.0 > 0: - bigs.append((big_idx, big_left)) - elif 1.0 - big_left > 0: - littles.append((big_idx, big_left)) - else: - alias_probs_[big_idx] = big_left - alias_[big_idx] = -1 - - if len(bigs): - big = bigs.pop(0) - alias_probs_[big[0]] = 1.0 - alias_[big[0]] = -1 - if len(littles): - little = littles.pop(0) - alias_probs_[little[0]] = 1.0 - alias_[little[0]] = -1 - - def _init_by_numpy_array(numpy_array): - ret = helper.create_parameter( - attr=ParamAttr(), - shape=numpy_array.shape, - dtype=numpy_array.dtype, - default_initializer=NumpyArrayInitializer(numpy_array)) - ret.stop_gradient = True - return ret - - inputs['CustomDistProbs'] = _init_by_numpy_array( - np.array(custom_dist).astype('float32')) - inputs['CustomDistAlias'] = _init_by_numpy_array( - np.array(alias_).astype('int32')) - 
inputs['CustomDistAliasProbs'] = _init_by_numpy_array( - np.array(alias_probs_).astype('float32')) - sampler = 2 - else: - raise Exception("Unsupported sampler type.") - - if num_neg_samples is None: - num_neg_samples = 10 - else: - num_neg_samples = int(num_neg_samples) - - remote_prefetch = is_sparse - print( - "With sparse mode, if your models has only small parameter prefetch may cause speed down" - ) - - attrs = { - 'num_total_classes': int(num_total_classes), - 'num_neg_samples': num_neg_samples, - 'seed': seed, - 'sampler': sampler, - 'is_sparse': is_sparse, - 'remote_prefetch': remote_prefetch - } - - helper.append_op( - type='nce', - inputs=inputs, - outputs={ - 'Cost': cost, - 'SampleLogits': sample_logits, - 'SampleLabels': sample_labels - }, - attrs=attrs) - return cost / (num_neg_samples + 1) - - -def hsigmoid(input, - label, - num_classes, - param_attr=None, - bias_attr=None, - name=None, - path_table=None, - path_code=None, - is_custom=False, - is_sparse=False): - """ - :api_attr: Static Graph - - The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity - and speed up the model training, especially the training of language model. - Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier. - For each class(word), there's a unique path from root to itself, hsigmoid calculate the cost for each non-leaf node on - the path, and sum them to get a total cost. - Comparing to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N` - represents the number of classes or the size of word dict. - - The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural - Network Language Model `. For the custom - tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example): - - 1. Using a custom word dict to build a binary tree, each leaf node should be an word in the word dict. - 2. Creating a dict map word_id -> path that from the word to the root node, we call it path_table. - 3. Creating a dict map word_id -> code of path that from the word to the root node, we call it path_code. - Code means the label of each binary classifier, 1 indicate true, 0 indicate false. - 4. Now, each word should has its path and code along the path, you can pass a batch of path and code related - to the same batch of inputs. - - Parameters: - input (Variable): A tensor with the shape [N, D], where N is the size of mini-batch, - and D is the feature size. Its data type supports float32 and float64. - label (Variable): A tensor contains the labels of training data. Its shape is [N, 1] - and data type is int64. - num_classes (int): The number of classes or the size of word dict, must be greater than 2. - If the default tree is used (:attr:`is_custom` is set to False), :attr:`num_classes` - should not be None. If the custom tree is used (:attr:`is_custom` is set to True), - :attr:`num_classes` should be the number of non-leaf nodes, which indicates the num of - classes using by the binary classifier. - param_attr (ParamAttr, optional): The parameter attribute for the learnable parameters/weights - of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid will create a - ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is - initialized with Xavier. Default: None. 
- bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of hsigmoid. If it - is set to False, no bias will be added. If it is set to None or one attribute of ParamAttr, - hsigmoid will create a ParamAttr as bias_attr. If the Initializer of the bias_attr is not - set, the bias is initialized zero. Default: None. - name (str, optional): Normally there is no need for user to set this property. For more information, - please refer to :ref:`api_guide_Name`. Default: None. - path_table (Variable, optional): A tensor that stores each batch of samples' path from leaf to root - node, its shape is [N, L] and data type is int64, where L is the length of path. For each sample i, - path_table[i] is a np.array like structure and each element in this array is the indexes in parent - nodes' weight matrix. Default: None. - path_code (Variable, optional): A tensor that stores each batch of samples' code of path from leaf - to root node, its shape is [N, L] and data type is int64, which is the same as :attr:`path_table`. - Each code of path is consisted with the code of nodes from leaf to root node. Default: None. - is_custom (bool, optional): Whether use custom binary tree. If it's True, :attr:`path_table`, - :attr:`path_code` and :attr:`num_classes` should be set, otherwise :attr:`num_classes` should - be set. Default: False. - is_sparse (bool, optional): Whether use sparse updating instead of dense updating, if it's True, the - gradient of W and input will be sparse. Default: False. - - Returns: - Variable: A tensor with the cost of hierarchical sigmoid, its shape is [N, 1] and data type is the same as :attr:`input`. - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - x = fluid.layers.fill_constant(shape=[4, 3], value=0.9, dtype='float32') - # x = [[0.9, 0.9, 0.9], [0.9, 0.9, 0.9], [0.9, 0.9, 0.9], [0.9, 0.9, 0.9]] - y = fluid.layers.fill_constant( - shape=[4, 1], value=1, dtype='int64') - # y = [[1], [1], [1], [1]] - out = fluid.layers.hsigmoid(input=x, label=y, num_classes=2, param_attr=fluid.initializer.Constant( - value=0.05), bias_attr=fluid.initializer.Constant(value=.0)) - # out = [[0.62792355], [0.62792355], [0.62792355], [0.62792355]] - """ - check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'hsigmoid') - check_variable_and_dtype(label, 'label', ['int64'], 'hsigmoid') - - helper = LayerHelper('hierarchical_sigmoid', **locals()) - dtype = helper.input_dtype() - out = helper.create_variable_for_type_inference(dtype) - pre_out = helper.create_variable_for_type_inference(dtype) - dim = input.shape[1] - if ((num_classes is None) or (num_classes < 2)) and (not is_custom): - raise ValueError( - "num_classes must not be less than 2 with default tree") - - if (not is_custom) and (is_sparse): - print("Sparse mode should not be used without custom tree") - is_sparse = False - - if (not is_custom) and ((path_table is not None) or - (path_code is not None)): - raise ValueError( - "only num_classes should be passed without custom tree") - - if (is_custom) and (path_code is None): - raise ValueError("path_code should not be None with custom tree") - elif (is_custom) and (path_table is None): - raise ValueError("path_table should not be None with custom tree") - elif (is_custom) and (num_classes is None): - raise ValueError("num_classes should not be None with custom tree") - else: - pass - - weights = None - remote_prefetch = is_sparse - print( - "With sparse mode, if your models has only small parameter prefetch may cause speed down" - ) - if not 
is_custom: - weights = helper.create_parameter( - attr=helper.param_attr, - shape=[num_classes - 1, dim], - is_bias=False, - dtype=input.dtype) - else: - weights = helper.create_parameter( - attr=helper.param_attr, - shape=[num_classes, dim], - is_bias=False, - dtype=input.dtype) - inputs = { - "X": input, - "W": weights, - "PathTable": path_table, - "PathCode": path_code, - "Label": label - } - if helper.bias_attr: - if not is_custom: - bias = helper.create_parameter( - attr=helper.bias_attr, - shape=[num_classes - 1, 1], - is_bias=True, - dtype=input.dtype) - inputs['Bias'] = bias - else: - bias = helper.create_parameter( - attr=helper.bias_attr, - shape=[num_classes, 1], - is_bias=True, - dtype=input.dtype) - inputs['Bias'] = bias - helper.append_op( - type="hierarchical_sigmoid", - inputs=inputs, - outputs={"Out": out, - "PreOut": pre_out, - "W_Out": weights}, - attrs={ - "num_classes": num_classes, - "is_sparse": is_sparse, - "remote_prefetch": remote_prefetch - }) - return out - - -def sampled_softmax_with_cross_entropy(logits, - label, - num_samples, - num_true=1, - remove_accidental_hits=True, - use_customized_samples=False, - customized_samples=None, - customized_probabilities=None, - seed=0): - """ - **Sampled Softmax With Cross Entropy Operator.** - - Cross entropy loss with sampled softmax is used as the output layer for - larger output classes extensively. This operator samples a number of samples - for all examples, and computes the softmax normalized values for each - row of the sampled tensor, after which cross-entropy loss is computed. - - Because this operator performs a softmax on logits internally, it expects - unscaled logits. This operator should not be used with the output of - softmax operator since that would produce incorrect results. - - For examples with T true labels (T >= 1), we assume that each true label has - a probability of 1/T. For each sample, S samples are generated using a - log uniform distribution. True labels are concatenated with these samples to - form T + S samples for each example. So, assume the shape of logits is - [N x K], the shape for samples is [N x (T+S)]. For each sampled label, a - probability is calculated, which corresponds to the Q(y|x) in - [Jean et al., 2014](http://arxiv.org/abs/1412.2007). - - Logits are sampled according to the sampled labels. Then if - remove_accidental_hits is True, if a sample[i, j] accidentally hits true - labels, then the corresponding sampled_logits[i, j] is minus by 1e20 to - make its softmax result close to zero. Then sampled logits are subtracted by - logQ(y|x), these sampled logits and re-indexed labels are used to compute - a softmax with cross entropy. - - Args: - logits (Variable): The unscaled log probabilities, which is a 2-D tensor - with shape [N x K]. N is the batch_size, and K is the class number. - label (Variable): The ground truth which is a 2-D tensor. Label is a - Tensor with shape [N x T], where T is the number of true - labels per example. - num_samples (int): The number for each example, num_samples should be - less than the number of class. - num_true(int): The number of target classes per training example. - remove_accidental_hits (bool): A flag indicating whether to remove - accidental hits when sampling. If True and if a sample[i, j] - accidentally hits true labels, then the corresponding - sampled_logits[i, j] is minus by 1e20 to make its softmax result - close to zero. Default is True. 
- use_customized_samples (bool): Whether to use custom samples and probabities to sample - logits. - customized_samples (Variable): User defined samples, which is a 2-D tensor - with shape [N, T + S]. S is the num_samples, and T is the number of true - labels per example. - customized_probabilities (Variable): User defined probabilities of samples, - a 2-D tensor which has the same shape with customized_samples. - seed (int): The random seed for generating random number, which is used - in the process of sampling. Default is 0. - - Returns: - Variable: Return the cross entropy loss which is a 2-D tensor with shape - [N x 1]. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - input = fluid.layers.data(name='data', shape=[256], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - fc = fluid.layers.fc(input=input, size=100) - out = fluid.layers.sampled_softmax_with_cross_entropy( - logits=fc, label=label, num_samples=25) - """ - helper = LayerHelper('sample_logits', **locals()) - samples = customized_samples if use_customized_samples else helper.create_variable_for_type_inference( - dtype='int64') - probabilities = customized_probabilities if use_customized_samples else helper.create_variable_for_type_inference( - dtype=logits.dtype) - sampled_logits \ - = helper.create_variable_for_type_inference(dtype=logits.dtype) - sampled_label = helper.create_variable_for_type_inference(dtype='int64') - sampled_softlabel = helper.create_variable_for_type_inference( - dtype=logits.dtype) - logits_dim = helper.create_variable_for_type_inference(dtype=logits.dtype) - labels_dim = helper.create_variable_for_type_inference(dtype=label.type) - - helper.append_op( - type='sample_logits', - inputs={ - 'Logits': logits, - 'Labels': label, - 'CustomizedSamples': customized_samples, - 'CustomizedProbabilities': customized_probabilities - }, - outputs={ - 'Samples': samples, - 'Probabilities': probabilities, - 'SampledLabels': sampled_label, - 'SampledLogits': sampled_logits, - 'LogitsDim': logits_dim, - 'LabelsDim': labels_dim - }, - attrs={ - 'use_customized_samples': use_customized_samples, - 'uniq': True, - 'remove_accidental_hits': remove_accidental_hits, - 'num_samples': num_samples, - 'seed': seed - }) - loss = helper.create_variable_for_type_inference(dtype=logits.dtype) - softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) - helper.append_op( - type='one_hot', - inputs={'X': sampled_label}, - attrs={'depth': num_samples + 1}, - outputs={'Out': sampled_softlabel}) - - helper.append_op( - type='softmax_with_cross_entropy', - inputs={'Logits': sampled_logits, - 'Label': sampled_softlabel}, - outputs={'Softmax': softmax, - 'Loss': loss}, - attrs={ - 'soft_label': True, - 'ignore_index': False, - 'numeric_stable_mode': False - }) - return loss / num_true - - -def softmax_with_cross_entropy(logits, - label, - soft_label=False, - ignore_index=kIgnoreIndex, - numeric_stable_mode=True, - return_softmax=False, - axis=-1): - r""" - - This operator implements the cross entropy loss function with softmax. This function - combines the calculation of the softmax operation and the cross entropy loss function - to provide a more numerically stable gradient. - - Because this operator performs a softmax on logits internally, it expects - unscaled logits. This operator should not be used with the output of - softmax operator since that would produce incorrect results. 
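For readers skimming this hunk, the hard-label equation quoted below boils down to a log-sum-exp computation. The following is a minimal NumPy sketch (not Paddle code; names and values are invented for illustration) of that formula, using the same max-subtraction trick that the numeric_stable_mode note in this docstring describes.

    # Reference for the hard-label formula quoted below:
    #   loss_j = -logits[label_j] + log(sum_i exp(logits_i))
    # computed with max-subtraction for numerical stability.
    import numpy as np

    def softmax_with_cross_entropy_ref(logits, label):
        # logits: [N, K] unscaled scores; label: [N] integer class ids
        max_logit = logits.max(axis=1, keepdims=True)
        shifted = logits - max_logit
        log_sum_exp = np.log(np.exp(shifted).sum(axis=1, keepdims=True))
        log_softmax = shifted - log_sum_exp                          # [N, K]
        return -log_softmax[np.arange(len(label)), label][:, None]   # [N, 1]

    logits = np.array([[2.0, 0.5, -1.0], [0.1, 0.2, 0.3]], dtype=np.float32)
    label = np.array([0, 2])
    print(softmax_with_cross_entropy_ref(logits, label))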
- - When the attribute :attr:`soft_label` is set :attr:`False`, this operators - expects mutually exclusive hard labels, each sample in a batch is in exactly - one class with a probability of 1.0. Each sample in the batch will have a - single label. - - The equation is as follows: - - 1) Hard label (one-hot label, so every sample has exactly one class) - - .. math:: - - loss_j = -\\text{logits}_{label_j} + - \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{logits}_i)\\right), j = 1,..., K - - 2) Soft label (each sample can have a distribution over all classes) - - .. math:: - - loss_j = -\\sum_{i=0}^{K}\\text{label}_i - \\left(\\text{logits}_i - \\log\\left(\\sum_{i=0}^{K} - \\exp(\\text{logits}_i)\\right)\\right), j = 1,...,K - - 3) If :attr:`numeric_stable_mode` is :attr:`True`, softmax is calculated first by: - - .. math:: - - max_j &= \\max_{i=0}^{K}{\\text{logits}_i} - - log\\_max\\_sum_j &= \\log\\sum_{i=0}^{K}\\exp(logits_i - max_j) - - softmax_j &= \\exp(logits_j - max_j - {log\\_max\\_sum}_j) - - and then cross entropy loss is calculated by softmax and label. - - Args: - logits (Tensor): A multi-dimension ``Tensor`` , and the data type is float32 or float64. The input tensor of unscaled log probabilities. - label (Tensor): The ground truth ``Tensor`` , data type is the same - as the ``logits`` . If :attr:`soft_label` is set to :attr:`True`, - Label is a ``Tensor`` in the same shape with :attr:`logits`. - If :attr:`soft_label` is set to :attr:`True`, Label is a ``Tensor`` - in the same shape with :attr:`logits` expect shape in dimension :attr:`axis` as 1. - soft_label (bool, optional): A flag to indicate whether to interpretant the given - labels as soft labels. Default False. - ignore_index (int, optional): Specifies a target value that is ignored and does - not contribute to the input gradient. Only valid - if :attr:`soft_label` is set to :attr:`False`. - Default: kIgnoreIndex(-100). - numeric_stable_mode (bool, optional): A flag to indicate whether to use a more - numerically stable algorithm. Only valid - when :attr:`soft_label` is :attr:`False` - and GPU is used. When :attr:`soft_label` - is :attr:`True` or CPU is used, the - algorithm is always numerically stable. - Note that the speed may be slower when use - stable algorithm. Default: True. - return_softmax (bool, optional): A flag indicating whether to return the softmax - along with the cross entropy loss. Default: False. - axis (int, optional): The index of dimension to perform softmax calculations. It - should be in range :math:`[-1, rank - 1]`, while :math:`rank` - is the rank of input :attr:`logits`. Default: -1. - - Returns: - ``Tensor`` or Tuple of two ``Tensor`` : Return the cross entropy loss if \ - `return_softmax` is False, otherwise the tuple \ - (loss, softmax), softmax is in the same shape \ - with input logits and cross entropy loss is in \ - the same shape with input logits except shape \ - in dimension :attr:`axis` as 1. - - Examples: - .. 
code-block:: python - - import paddle - import numpy as np - - data = np.random.rand(128).astype("float32") - label = np.random.rand(1).astype("int64") - data = paddle.to_tensor(data) - label = paddle.to_tensor(label) - linear = paddle.nn.Linear(128, 100) - x = linear(data) - out = paddle.nn.functional.softmax_with_cross_entropy(logits=x, label=label) - print(out) - """ - if in_dygraph_mode(): - if core.is_compiled_with_npu(): - softmax, backprop, loss = _C_ops.softmax_with_cross_entropy( - logits, label, 'soft_label', soft_label, 'ignore_index', - ignore_index, 'numeric_stable_mode', numeric_stable_mode, - 'axis', axis) - else: - softmax, loss = _C_ops.softmax_with_cross_entropy( - logits, label, 'soft_label', soft_label, 'ignore_index', - ignore_index, 'numeric_stable_mode', numeric_stable_mode, - 'axis', axis) - if not return_softmax: - return loss - else: - return loss, softmax - - attrs = { - 'soft_label': soft_label, - 'ignore_index': ignore_index, - 'numeric_stable_mode': numeric_stable_mode, - 'axis': axis - } - helper = LayerHelper('softmax_with_cross_entropy', **locals()) - softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) - loss = helper.create_variable_for_type_inference(dtype=logits.dtype) - - outputs = {'Softmax': softmax, 'Loss': loss} - if core.is_compiled_with_npu(): - backprop = helper.create_variable_for_type_inference(dtype=logits.dtype) - outputs['Backprop'] = backprop - helper.append_op( - type='softmax_with_cross_entropy', - inputs={'Logits': logits, - 'Label': label}, - outputs=outputs, - attrs=attrs) - - if return_softmax: - return loss, softmax - - return loss - - -def rank_loss(label, left, right, name=None): - r""" - - This operator implements the sort loss layer in the RankNet model. RankNet is a pairwise ranking model - with a training sample consisting of a pair of documents (A and B), The label (P) - indicates whether A is ranked higher than B or not. Please refer to more details: - `RankNet `_ - - Rank loss layer takes three inputs: left ( :math:`o_i` ), right ( :math:`o_j` ) and - label ( :math:`P_{i,j}` ). The inputs respectively represent RankNet's output scores - for documents A and B and the value of label P. Rank loss layer takes batch inputs - with size batch_size (batch_size >= 1), P = {0, 1} or {0, 0.5, 1}, - where 0.5 means that there is no information about the rank of the input pair. - The following equation computes rank loss C_{i,j} from the inputs: - - .. math:: - C_{i,j} &= -\\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\\\ - .. math:: - o_{i,j} &= o_i - o_j \\\\ - .. math:: - \\tilde{P_{i,j}} &= \\left \{0, 0.5, 1 \\right \} \ or \ \\left \{0, 1 \\right \} - - Parameters: - label (Variable): 2-D ``Tensor`` with the shape of :math:`[batch,1]`, the data type is float32, batch indicates the size of the data. Indicats whether A ranked higher than B or not. - left (Variable): 2-D ``Tensor`` with the shape of :math:`[batch,1]`, the data type is float32. RankNet's output score for doc A. - right (Variable): 2-D ``Tensor`` with the shape of :math:`[batch,1]`, the data type is float32. RankNet's output score for doc B. - name(str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - - Returns: - Variable: ``Tensor`` indicating the output value of the sort loss layer, the data type is float32, and the return value's shape is :math:`[batch,1]` . - - Raises: - ValueError: Any of label, left, and right is not a ``Variable`` . 
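As a quick aid for the rank_loss docstring above, here is a small NumPy sketch of the quoted equation C_{i,j} = -P~_{i,j} * o_{i,j} + log(1 + exp(o_{i,j})) with o_{i,j} = o_i - o_j. It is only a reference computation with invented sample scores and labels, not the operator's kernel.

    # Reference computation of the RankNet pairwise loss quoted above.
    # left/right hold the scores o_i and o_j; label is P~ in {0, 0.5, 1}.
    import numpy as np

    def rank_loss_ref(label, left, right):
        o = left - right                          # o_{i,j} = o_i - o_j
        return -label * o + np.log1p(np.exp(o))   # C_{i,j}

    label = np.array([[1.0], [0.0], [0.5]], dtype=np.float32)
    left = np.array([[0.8], [0.3], [0.6]], dtype=np.float32)
    right = np.array([[0.2], [0.9], [0.6]], dtype=np.float32)
    print(rank_loss_ref(label, left, right))      # shape [batch, 1]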
- - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import paddle - paddle.enable_static() - label = fluid.data(name="label", shape=[-1, 1], dtype="float32") - left = fluid.data(name="left", shape=[-1, 1], dtype="float32") - right = fluid.data(name="right", shape=[-1, 1], dtype="float32") - out = fluid.layers.rank_loss(label, left, right) - - """ - helper = LayerHelper('rank_loss', **locals()) - check_variable_and_dtype(label, 'label', ['float32'], "rank_loss") - check_variable_and_dtype(left, 'left', ['float32'], "rank_loss") - check_variable_and_dtype(right, 'right', ['float32'], "rank_loss") - - out = helper.create_variable_for_type_inference("float32") - - helper.append_op( - type='rank_loss', - inputs={"Label": label, - "Left": left, - "Right": right}, - outputs={'Out': out}) - return out - - -def margin_rank_loss(label, left, right, margin=0.1, name=None): - r""" - Margin Ranking Loss Layer for ranking problem, - which compares left score and right score passed in. - The ranking loss can be defined as following equation: - - .. math:: - - rank\_loss = max(0, -label * (left - right) + margin) - - Args: - label (Variable): Indicates whether the left is ranked higher than the right or not. - Data type is float32. - left (Variable): Ranking score for left. Data type float32. - right (Variable): Ranking score for right. Data type float32. - margin (float): Indicates the given margin. - name(str|None): For detailed information, please refer to - :ref:`api_guide_Name` . Usually name is no need to set and None by default. - - Returns: - Variable: The ranking loss. - - Raises: - ValueError: Any of label, left, and right is not a Variable. - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - label = fluid.data(name="label", shape=[-1, 1], dtype="float32") - left = fluid.data(name="left", shape=[-1, 1], dtype="float32") - right = fluid.data(name="right", shape=[-1, 1], dtype="float32") - out = fluid.layers.margin_rank_loss(label, left, right) - """ - helper = LayerHelper('margin_rank_loss', **locals()) - check_variable_and_dtype(label, 'label', ['float32'], 'margin_rank_loss') - check_variable_and_dtype(label, 'left', ['float32'], 'margin_rank_loss') - check_variable_and_dtype(label, 'right', ['float32'], 'margin_rank_loss') - out = helper.create_variable_for_type_inference(left.dtype) - act = helper.create_variable_for_type_inference(left.dtype) - helper.append_op( - type='margin_rank_loss', - inputs={"Label": label, - "X1": left, - "X2": right}, - outputs={'Out': out, - 'Activated': act}, - attrs={'margin': margin}) - return out - - -@templatedoc() -def sigmoid_cross_entropy_with_logits(x, - label, - ignore_index=kIgnoreIndex, - name=None, - normalize=False): - """ - - ${comment} - - Args: - x(Tensor): a 2-D tensor with shape N x D, where N is the batch size and - D is the number of classes. This input is a tensor of logits computed - by the previous operator. Logits are unscaled log probabilities given - as log(p/(1-p)) The data type should be float32 or float64. - label (Tensor): a 2-D tensor of the same type and shape as X. - This input is a tensor of probabalistic labels for each logit. - ignore_index(int): Specifies a target value that is ignored and - does not contribute to the input gradient. - name(str|None): The default value is None. Normally there is - no need for user to set this property. 
For more information, - please refer to :ref:`api_guide_Name` - normalize(bool): If true, divide the output by the number of - targets != ignore_index. - - Returns: - out(Tensor): ${out_comment} - - Examples: - .. code-block:: python - - - import paddle - - input = paddle.rand(shape=[10], dtype='float32') - label = paddle.rand(shape=[10], dtype='float32') - loss = paddle.fluid.layers.sigmoid_cross_entropy_with_logits(input, label, - ignore_index=-1, normalize=True) - print(loss) - """ - check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], - 'sigmoid_cross_entropy_with_logits') - - helper = LayerHelper("sigmoid_cross_entropy_with_logits", **locals()) - - out = helper.create_variable_for_type_inference(dtype=x.dtype) - - helper.append_op( - type="sigmoid_cross_entropy_with_logits", - inputs={"X": x, - "Label": label}, - attrs={"ignore_index": ignore_index, - 'normalize': normalize}, - outputs={"Out": out}) - return out - - -def teacher_student_sigmoid_loss(input, - label, - soft_max_up_bound=15.0, - soft_max_lower_bound=-15.0): - """ - - **Teacher Student Log Loss Layer** - - This layer accepts input predictions and target label and returns the - teacher_student loss. Z is click or not, z' is value of teacher loss, label = {-2, -1, [0, 2]} - when z' is not exist, clk = 0 : label = -2; when z' is not exist, clk = 1 : label = -1; - when z' is exist , clk = 0 : label = 0 + z'; when z' is exist , clk = 1 : label = 1 + z' - - .. math:: - loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x))) - - Args: - input (Variable|list): a 2-D tensor with shape [N x 1], where N is the - batch size. This input is a probability computed - by the previous operator. - label (Variable|list): the ground truth which is a 2-D tensor with - shape [N x 1], where N is the batch size. - soft_max_up_bound (float): if input > soft_max_up_bound, will be bound - soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound - - Returns: - Variable: A 2-D tensor with shape [N x 1], the teacher_student_sigmoid_loss. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - paddle.enable_static() - batch_size = 64 - label = fluid.data( - name="label", shape=[batch_size, 1], dtype="int64") - similarity = fluid.data( - name="similarity", shape=[batch_size, 1], dtype="float32") - cost = fluid.layers.teacher_student_sigmoid_loss(input=similarity, label=label) - - """ - check_variable_and_dtype(input, "input", - ['float32', 'float64', 'int32', 'int64'], - 'teacher_student_sigmoid_loss') - check_variable_and_dtype(label, "label", - ['float32', 'float64', 'int32', 'int64'], - 'teacher_student_sigmoid_loss') - - helper = LayerHelper('teacher_student_sigmoid_loss', **locals()) - out = helper.create_variable(dtype=input.dtype) - helper.append_op( - type='teacher_student_sigmoid_loss', - inputs={'X': [input], - 'Label': [label]}, - outputs={'Y': [out]}, - attrs={"soft_max_lower_bound": float(soft_max_lower_bound), \ - "soft_max_up_bound": float(soft_max_up_bound)}) - return out - - -def huber_loss(input, label, delta): - r""" - This operator computes the Huber loss between input and label. - Huber loss is commonly used in regression tasks. Compared to square_error_cost, Huber loss is more robust and less sensitivity to outliers. - - When the absolute difference between input and label is greater than delta, the linear error is calculated: - - .. 
math:: - huber\_loss = delta * (label - input) - 0.5 * delta * delta - - When the absolute difference between input and label is greater than delta, the square error is calculated: - - .. math:: - huber\_loss = 0.5 * (label - input) * (label - input) - - - Args: - input (Variable): Predicted data, 2D-Tensor with the shape of [batch_size, 1]. The data type should be float32. - label (Variable): Ground truth label, 2D-Tensor with the shape of [batch_size, 1]. The data type should be float32. - delta (float): The threshold for Huber loss, which is used to control the balance between the linear error and square error. The data type should be float32. - - Returns: - Variable: The huber loss, a tensor with the same shape and data type as input. - - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - DATATYPE='float32' - input_data = np.array([[1.],[2.],[3.],[4.]]).astype(DATATYPE) - label_data = np.array([[3.],[3.],[4.],[4.]]).astype(DATATYPE) - - x = fluid.data(name='input', shape=[None, 1], dtype=DATATYPE) - y = fluid.data(name='label', shape=[None, 1], dtype=DATATYPE) - loss = fluid.layers.huber_loss(input=x, label=y, delta=1.0) - - place = fluid.CPUPlace() - #place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - HuberLoss, = exe.run(feed={'input':input_data ,'label':label_data}, fetch_list=[loss.name]) - print(HuberLoss) #[[1.5], [0.5], [0.5], [0. ]], dtype=float32 - """ - helper = LayerHelper('huber_loss', **locals()) - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'huber_loss') - check_variable_and_dtype(label, 'label', ['float32', 'float64'], - 'huber_loss') - residual = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) - out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) - helper.append_op( - type='huber_loss', - inputs={'X': input, - 'Y': label}, - outputs={'Out': out, - 'Residual': residual}, - attrs={'delta': delta}) - return out - - -@deprecated(since="2.0.0", update_to="paddle.nn.functional.kl_div") -@templatedoc() -def kldiv_loss(x, target, reduction='mean', name=None): - """ - - ${comment} - - Args: - x (Tensor): ${x_comment} - target (Tensor): ${target_comment} - reduction (Tensor): ${reduction_comment} - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - Returns: - Tensor: The KL divergence loss. The data type is same as input tensor - - Examples: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - - x = paddle.rand(shape=[3,4,2,2], dtype='float32') - target = paddle.rand(shape=[3,4,2,2], dtype='float32') - - # 'batchmean' reduction, loss shape will be [1] - loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='batchmean') - print(loss.shape) # shape=[1] - - # 'mean' reduction, loss shape will be [1] - loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='mean') - print(loss.shape) # shape=[1] - - # 'sum' reduction, loss shape will be [1] - loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='sum') - print(loss.shape) # shape=[1] - - # 'none' reduction, loss shape is same with X shape - loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='none') - print(loss.shape) # shape=[3, 4, 2, 2] - - """ - helper = LayerHelper('kldiv_loss', **locals()) - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'kldiv_loss') - check_variable_and_dtype(target, 'target', ['float32', 'float64'], - 'kldiv_loss') - check_type(reduction, 'reduction', str, 'kldiv_loss') - loss = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type='kldiv_loss', - inputs={'X': x, - 'Target': target}, - outputs={'Loss': loss}, - attrs={'reduction': reduction}) - return loss - - -from .ops import square -from .control_flow import equal - - -def npair_loss(anchor, positive, labels, l2_reg=0.002): - """ - - Npair loss requires paired data. Npair loss has two parts: the first part is L2 - regularizer on the embedding vector; the second part is cross entropy loss which - takes the similarity matrix of anchor and positive as logits. - - For more information, please refer to: - `Improved Deep Metric Learning with Multi class N pair Loss Objective `_ - - Args: - anchor(Tensor): embedding vector for the anchor image. shape=[batch_size, embedding_dims], - the data type is float32 or float64. - positive(Tensor): embedding vector for the positive image. shape=[batch_size, embedding_dims], - the data type is float32 or float64. - labels(Tensor): 1-D tensor. shape=[batch_size], the data type is float32 or float64 or int64. - l2_reg(float32): L2 regularization term on embedding vector, default: 0.002. - - - Returns: - A Tensor representing the npair loss, the data type is the same as anchor, the shape is [1]. - - Examples: - - .. 
code-block:: python
-
-            import paddle
-
-            DATATYPE = "float32"
-
-            anchor = paddle.rand(shape=(18, 6), dtype=DATATYPE)
-            positive = paddle.rand(shape=(18, 6), dtype=DATATYPE)
-            labels = paddle.rand(shape=(18,), dtype=DATATYPE)
-
-            npair_loss = paddle.nn.functional.npair_loss(anchor, positive, labels, l2_reg = 0.002)
-            print(npair_loss)
-
-    """
-    check_variable_and_dtype(anchor, 'anchor', ['float32', 'float64'],
-                             'npair_loss')
-    check_variable_and_dtype(positive, 'positive', ['float32', 'float64'],
-                             'positive')
-    check_variable_and_dtype(labels, 'labels', ['float32', 'float64', 'int64'],
-                             'labels')
-    Beta = 0.25
-    batch_size = labels.shape[0]
-
-    labels = nn.reshape(labels, shape=[batch_size, 1])
-    labels = paddle.tile(labels, repeat_times=[1, batch_size])
-
-    labels = equal(labels, nn.transpose(labels, perm=[1, 0])).astype('float32')
-    labels = labels / nn.reduce_sum(labels, dim=1, keep_dim=True)
-
-    l2loss = nn.reduce_mean(nn.reduce_sum(square(anchor), 1)) \
-        + nn.reduce_mean(nn.reduce_sum(square(positive), 1))
-    l2loss = l2loss * Beta * l2_reg
-
-    similarity_matrix = paddle.matmul(
-        anchor, positive, transpose_x=False, transpose_y=True)
-    softmax_ce = softmax_with_cross_entropy(
-        logits=similarity_matrix, label=labels, soft_label=True)
-    cross_entropy = nn.reduce_sum(labels * softmax_ce, 0)
-    celoss = nn.reduce_mean(cross_entropy)
-
-    return l2loss + celoss
-
-
-def mse_loss(input, label):
-    """
-
-    This op accepts input predications and target label and returns the mean square error.
-
-    The loss can be described as:
-
-    .. math::
-
-        Out = MEAN((input - label)^2)
-
-    Parameters:
-        input (Tensor): Input tensor, the data type should be float32.
-        label (Tensor): Label tensor, the data type should be float32.
-
-    Returns:
-        Tensor: The tensor storing the mean square error difference of input and label.
-
-    Return type: Tensor.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-            input = paddle.to_tensor([1.1, 1.9])
-            label = paddle.to_tensor([1.0, 2.0])
-            output = paddle.fluid.layers.mse_loss(input, label)
-            print(output.numpy())
-            # [0.01]
-    """
-    check_variable_and_dtype(input, "input", ['float32', 'float64'], 'mse_loss')
-    check_variable_and_dtype(label, "label", ['float32', 'float64'], 'mse_loss')
-    return nn.reduce_mean(square_error_cost(input, label))
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index 2288db9954420..f8e1ab447fcb2 100755
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -2085,7 +2085,10 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None):
            is (N, C, D1, D2,..., Dk), k >= 1.
        label (Tensor): Label tensor containing 1 or -1, the data type is float32 or float64.
            The shape of labelis the same as the shape of input.
-        delta (float, optional): Has a default value of `1`.
+        delta (float, optional): Specifies the hyperparameter (the margin) used by the loss.
+            When label is -1, inputs smaller than delta contribute ``max(0, delta - input)``
+            to the loss, while inputs greater than or equal to delta contribute zero.
+            Default = 1.0
        reduction (str, optional): Indicate how to average the loss by batch_size,
            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
            If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py
index 25330a16e9562..ae6383a392c86 100644
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -1232,7 +1232,10 @@ class HingeEmbeddingLoss(Layer):
    where :math:`L = \{l_1,\dots,l_N\}^\top`.

    Parameters:
-        delta (float, optional): Has a default value of `1`.
+        delta (float, optional): Specifies the hyperparameter (the margin) used by the loss.
+            When label is -1, inputs smaller than delta contribute ``max(0, delta - input)``
+            to the loss, while inputs greater than or equal to delta contribute zero.
+            Default = 1.0
        reduction (str, optional): Indicate how to average the loss by batch_size,
            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
            If :attr:`reduction` is ``'none'``, the unreduced loss is returned;

From a20b2de8fe26f5e1622a5a4351cd0c6ed87fad14 Mon Sep 17 00:00:00 2001
From: skrBang
Date: Thu, 25 Nov 2021 18:59:57 +0800
Subject: [PATCH 08/23] get raw python/paddle/fluid/layers/loss.py back

---
 python/paddle/fluid/layers/loss.py | 1765 ++++++++++++++++++++++++++++
 1 file changed, 1765 insertions(+)
 create mode 100644 python/paddle/fluid/layers/loss.py

diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py
new file mode 100644
index 0000000000000..3db4a894d1a07
--- /dev/null
+++ b/python/paddle/fluid/layers/loss.py
@@ -0,0 +1,1765 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+from functools import partial, reduce
+import paddle
+from paddle.utils import deprecated
+from . import nn
+from .layer_function_generator import templatedoc
+from ..layer_helper import LayerHelper
+from ..framework import Variable, in_dygraph_mode, static_only
+from .. import core
+from ..data_feeder import check_variable_and_dtype, check_type
+from ..param_attr import ParamAttr
+from ..initializer import NumpyArrayInitializer, Constant
+from .. import core
+import warnings
+from paddle import _C_ops
+
+__all__ = [
+    'center_loss',
+    'bpr_loss',
+    'cross_entropy',
+    'square_error_cost',
+    'edit_distance',
+    'warpctc',
+    'nce',
+    'hsigmoid',
+    'sampled_softmax_with_cross_entropy',
+    'softmax_with_cross_entropy',
+    'rank_loss',
+    'margin_rank_loss',
+    'sigmoid_cross_entropy_with_logits',
+    'teacher_student_sigmoid_loss',
+    'huber_loss',
+    'kldiv_loss',
+    'npair_loss',
+    'mse_loss',
+]
+
+kIgnoreIndex = -100
+
+
+def center_loss(input,
+                label,
+                num_classes,
+                alpha,
+                param_attr,
+                update_center=True):
+    r"""
+    :api_attr: Static Graph
+
+    **Center loss Cost layer**
+
+    This OP accepts input (deep features,the output of the last hidden layer)
+    and target label and return the center loss cost. The average of the
+    distances of each sample in the mini-batch from the center of the
+    corresponding category is calculated as the center loss.
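As a rough companion to the center_loss description above and the equation that follows, the sketch below computes half the squared distance between each feature vector and the center of its class, per sample. It is NumPy only, keeps the centers frozen (no update_center step), and uses fabricated values; it is not the operator's implementation.

    # Per-sample center loss: 0.5 * ||x_i - c_{y_i}||^2 with fixed centers.
    import numpy as np

    def center_loss_ref(features, labels, centers):
        # features: [N, D]; labels: [N] integer class ids; centers: [num_classes, D]
        diff = features - centers[labels]                     # gather each sample's center
        return 0.5 * (diff ** 2).sum(axis=1, keepdims=True)   # [N, 1]

    features = np.array([[1.0, 2.0], [0.0, 1.0]], dtype=np.float32)
    labels = np.array([0, 1])
    centers = np.array([[0.5, 1.5], [0.0, 0.0]], dtype=np.float32)
    print(center_loss_ref(features, labels, centers))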
+ + For deep features, :math:`X`, and target labels, :math:`Y`, the equation is: + + .. math:: + + Out = \\frac{1}{2}(X - Y)^2 + + Args: + input (Variable): a 2-D tensor with shape[N x M]. Its dtype should be float32 or float64. + label (Variable): the groud truth which is a 2-D tensor + with shape[N x 1],where N is the batch size. Its dtype should be int32. + num_classes (int): the number of classification categories. + alpha (float|Variable): learning rate of centers. + param_attr (ParamAttr): Attribute initializer of centers. + update_center (bool): whether to update value of center. + + Returns: + Variable: 2-D tensor with shape [N * 1] + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import paddle + paddle.enable_static() + + input = fluid.data(name='x',shape=[20,30],dtype='float32') + label = fluid.data(name='y',shape=[20,1],dtype='int64') + num_classes = 1000 + alpha = 0.01 + param_attr = fluid.initializer.Xavier(uniform=False) + center_loss=fluid.layers.center_loss(input=input, + label=label, + num_classes=1000, + alpha=alpha, + param_attr=fluid.initializer.Xavier(uniform=False), + update_center=True) + """ + helper = LayerHelper('center_loss', **locals()) + dtype = helper.input_dtype() + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'center_loss') + check_variable_and_dtype(label, 'label', ['int32', 'int64'], 'center_loss') + + centers_shape = [num_classes, input.shape[1]] + centers_param = helper.create_parameter( + attr=param_attr, shape=centers_shape, dtype=dtype) + centers_param.stop_gradient = True + + if isinstance(alpha, Variable): + alpha_param = alpha + check_variable_and_dtype(alpha, 'alpha', ['float32', 'float64'], + 'center_loss') + else: + assert isinstance(alpha, float) + alpha_param = helper.create_variable( + name="centerloss_alpha", + shape=[1], + dtype="float32", + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=True, + stop_gradient=True, + initializer=Constant(alpha)) + + centersdiff = helper.create_variable_for_type_inference(dtype=input.dtype) + loss = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type='center_loss', + inputs={ + 'X': [input], + 'Label': [label], + 'Centers': [centers_param], + 'CenterUpdateRate': [alpha_param] + }, + outputs={ + 'SampleCenterDiff': [centersdiff], + 'Loss': [loss], + 'CentersOut': [centers_param] + }, + attrs={'cluster_num': num_classes, + 'need_update': update_center}) + return loss + + +def bpr_loss(input, label, name=None): + r""" + + **Bayesian Personalized Ranking Loss Operator** + + This operator belongs to pairwise ranking loss. Label is the desired item. + The loss at a given point in one session is defined as: + + .. math:: + Y[i] = 1/(N[i] - 1) * \sum_j{\log(\sigma(X[i, Label[i]]-X[i, j]))} + + Learn more details by reading paper . + + Args: + input (Variable|list): a 2-D tensor with shape [N x D], where N is the + batch size and D is the number of positive classes and negative classes + This input is not probability but logits. + label (Variable|list): the ground truth which is a 2-D tensor. `label` + is a tensor with shape [N x 1]. + name (str|None): A name for this layer(optional). If set None, the + layer will be named automatically. Default: None. + Returns: + A 2-D tensor with shape [N x 1], the bpr loss. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + import paddle + + paddle.enable_static() + + neg_size = 10 + label = fluid.data( + name="label", shape=[3, 1], dtype="int64") + predict = fluid.data( + name="predict", shape=[3, neg_size + 1], dtype="float32") + cost = fluid.layers.bpr_loss(input=predict, label=label) + """ + helper = LayerHelper('bpr_loss', **locals()) + out = helper.create_variable_for_type_inference(dtype=input.dtype) + check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], + 'bpr_loss') + helper.append_op( + type='bpr_loss', + inputs={'X': [input], + 'Label': [label]}, + outputs={'Y': [out]}) + return out + + +def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): + r""" + :alias_main: paddle.nn.functional.cross_entropy + :alias: paddle.nn.functional.cross_entropy,paddle.nn.functional.loss.cross_entropy + :old_api: paddle.fluid.layers.cross_entropy + + This operator computes the cross entropy between input and label. It + supports both hard-label and and soft-label cross entropy computation. + + 1. Hard-label cross entropy: if soft_label=False, :math:`label[i_1, i_2, ..., i_k]` + is the hard label of each sample. + + .. math:: + + output[i_1, i_2, ..., i_k]=-log(input[i_1, i_2, ..., i_k, j]), label[i_1, i_2, ..., i_k] = j, j != ignore\_index + + 2. Soft-label cross entropy: if soft_label=True, :math:`label[i_1, i_2, ..., i_k, j]` + is the soft label of each sample corresponding to the j-th class. + + .. math:: + + output[i_1, i_2, ..., i_k]= -\sum_{j}label[i_1,i_2,...,i_k,j]*log(input[i_1, i_2, ..., i_k,j]) + + Args: + input (Variable): a multidimensional Tensor with shape + :math:`[N_1, N_2, ..., N_k, D]`, where the last dimension D is + the class number. The data type should be float32 or float64. + label (Variable): label value corresponding to input. If + soft_label=False, the dimension of label should be :math:`[N_1, N_2, ..., N_k]` + or :math:`[N_1, N_2, ..., N_k, 1]` , and its data type should be int64, + and the value must be inside [0, D). If soft_label=True, the shape, + data type of label should be the same with input, and the sum of + soft label value of each sample should be 1. + soft_label (bool): indicate whether label is soft. Default False, meaning that + the label is hard. If soft_label=True, the label is soft. + ignore_index (int): specify an ignorable label value. The ignored label would be + omitted when computing. If it is a negative integer, no label would + be ignored. Only valid when soft_label=False. Default -100. + + Returns: + A Variable holding Tensor representing the cross entropy, whose data type is the same with input. + If soft_label=False, the shape of output is the same with label. + If soft_label=True, the shape of output is :math:`[N_1, N_2, ..., N_k, 1]` . + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + class_num = 7 + x = fluid.data(name='x', shape=[None, 3, 10], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + predict = fluid.layers.fc(input=x, size=class_num, act='softmax') + cost = fluid.layers.cross_entropy(input=predict, label=label) + """ + if not soft_label: + return cross_entropy2(input, label, ignore_index) + + if in_dygraph_mode(): + return _C_ops.cross_entropy(input, label, "soft_label", soft_label, + "ignore_index", ignore_index) + + inputs = {'X': [input], 'Label': [label]} + attrs = {"soft_label": soft_label, "ignore_index": ignore_index} + + check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], + 'cross_entropy') + helper = LayerHelper('cross_entropy', **locals()) + out = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type='cross_entropy', inputs=inputs, outputs={'Y': [out]}, attrs=attrs) + return out + + +def cross_entropy2(input, label, ignore_index=kIgnoreIndex): + if in_dygraph_mode(): + loss, _, _ = _C_ops.cross_entropy2(input, label, 'ignore_index', + ignore_index) + return loss + + inputs = {'X': [input], 'Label': [label]} + attrs = {'ignore_index': ignore_index} + check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], + 'cross_entropy2') + helper = LayerHelper('cross_entropy2', **locals()) + out = helper.create_variable_for_type_inference(dtype=input.dtype) + xshape = helper.create_variable_for_type_inference(dtype=input.dtype) + match_x = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type='cross_entropy2', + inputs=inputs, + outputs={'Y': [out], + 'MatchX': [match_x], + 'XShape': [xshape]}, + attrs=attrs) + return out + + +def square_error_cost(input, label): + r""" + + This op accepts input predictions and target label and returns the + squared error cost. + + For predictions label, and target label, the equation is: + + .. math:: + + Out = (input - label)^2 + + Parameters: + input (Tensor): Input tensor, the data type should be float32. + label (Tensor): Label tensor, the data type should be float32. + + Returns: + The tensor storing the element-wise squared error \ + difference between input and label. + + Return type: Tensor. + + Examples: + + .. 
code-block:: python + + import paddle + input = paddle.to_tensor([1.1, 1.9]) + label = paddle.to_tensor([1.0, 2.0]) + output = paddle.nn.functional.square_error_cost(input, label) + print(output) + # [0.01, 0.01] + + """ + if in_dygraph_mode(): + minus_out = _C_ops.elementwise_sub(input, label) + square_out = _C_ops.square(minus_out) + return square_out + + check_variable_and_dtype(input, "input", ['float32', 'float64'], + 'square_error_cost') + check_variable_and_dtype(label, "label", ['float32', 'float64'], + 'square_error_cost') + helper = LayerHelper('square_error_cost', **locals()) + minus_out = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type='elementwise_sub', + inputs={'X': [input], + 'Y': [label]}, + outputs={'Out': [minus_out]}) + + square_out = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type='square', inputs={'X': [minus_out]}, + outputs={'Out': [square_out]}) + return square_out + + +def edit_distance(input, + label, + normalized=True, + ignored_tokens=None, + input_length=None, + label_length=None): + """ + This op computes the edit distances, also called Levenshtein distance, between a batch of + hypothesis strings and their references. It measures how dissimilar two strings are by counting + the minimum number of operations to transform one string into another. + The operations include insertion, deletion, and substitution. + + For example, given hypothesis string A = "kitten" and reference + B = "sitting", A will be transformed into B + at least after two substitutions and one insertion: + + "kitten" -> "sitten" -> "sittin" -> "sitting" + + So the edit distance between A and B is 3. + + The input is a Tensor, the input_length and label_length should be supported. + + The `batch_size` of labels should be same as `input`. + + The output include the edit distance value between every pair of input and related label, and the number of sequence. + If Attr(normalized) is true, + the edit distance value will be divided by the length of label. + + Parameters: + input(Tensor): The input tensor, its rank should be equal to 2 and its data type should be int64. + label(Tensor): The label tensor, its rank should be equal to 2 and its data type should be int64. + normalized(bool, default True): Indicated whether to normalize the edit distance. + ignored_tokens(list, default None): Tokens that will be removed before + calculating edit distance. + input_length(Tensor): The length for each sequence in `input` if it's of Tensor type, it should have shape `(batch_size, )` and its data type should be int64. + label_length(Tensor): The length for each sequence in `label` if it's of Tensor type, it should have shape `(batch_size, )` and its data type should be int64. + NOTE: To be avoid unexpected result, the value of every elements in input_length and label_length should be equal to the value of the second dimension of input and label. For example, The input: [[1,2,3,4],[5,6,7,8],[9,10,11,12]], the shape of input is [3,4] and the input_length should be [4,4,4] + NOTE: This Api is different from fluid.metrics.EditDistance + + Returns: + Tuple: + + distance(Tensor): edit distance result, its data type is float32, and its shape is (batch_size, 1). + sequence_num(Tensor): sequence number, its data type is float32, and its shape is (1,). + + Examples: + .. 
code-block:: python + + import paddle + import paddle.nn.functional as F + + input = paddle.to_tensor([[1,2,3],[4,5,6],[4,4,4],[1,1,1]], dtype='int64') + label = paddle.to_tensor([[1,3,4,1],[4,5,8,1],[7,7,7,1],[1,1,1,1]], dtype='int64') + input_len = paddle.to_tensor([3,3,3,3], dtype='int64') + label_len = paddle.to_tensor([4,4,4,4], dtype='int64') + + distance, sequence_num = F.loss.edit_distance(input=input, label=label, input_length=input_len, label_length=label_len, normalized=False) + + # print(distance) + # [[3.] + # [2.] + # [4.] + # [1.]] + # if set normalized to True + # [[0.75] + # [0.5 ] + # [1. ] + # [0.25] + # + # print(sequence_num) + # [4] + + """ + check_variable_and_dtype(input, 'input', ['int64'], 'edit_distance') + check_variable_and_dtype(label, 'label', ['int64'], 'edit_distance') + helper = LayerHelper("edit_distance", **locals()) + + # remove some tokens from input and labels + if ignored_tokens is not None and len(ignored_tokens) > 0: + erased_input = helper.create_variable_for_type_inference(dtype="int64") + erased_label = helper.create_variable_for_type_inference(dtype="int64") + + helper.append_op( + type="sequence_erase", + inputs={"X": [input]}, + outputs={"Out": [erased_input]}, + attrs={"tokens": ignored_tokens}) + input = erased_input + + helper.append_op( + type="sequence_erase", + inputs={"X": [label]}, + outputs={"Out": [erased_label]}, + attrs={"tokens": ignored_tokens}) + label = erased_label + + this_inputs = {"Hyps": [input], "Refs": [label]} + if input_length is not None and label_length is not None: + this_inputs['HypsLength'] = [input_length] + this_inputs['RefsLength'] = [label_length] + + # edit distance op + edit_distance_out = helper.create_variable_for_type_inference(dtype="int64") + sequence_num = helper.create_variable_for_type_inference(dtype="int64") + helper.append_op( + type="edit_distance", + inputs=this_inputs, + outputs={"Out": [edit_distance_out], + "SequenceNum": [sequence_num]}, + attrs={"normalized": normalized}) + + return edit_distance_out, sequence_num + + +def warpctc(input, + label, + blank=0, + norm_by_times=False, + input_length=None, + label_length=None): + """ + An operator integrating the open source Warp-CTC library + (https://github.com/baidu-research/warp-ctc) + to compute Connectionist Temporal Classification (CTC) loss. + It can be aliased as softmax with CTC, since a native softmax activation is + interated to the Warp-CTC library to normalize values for each row of the + input tensor. + + Args: + input (Variable): The unscaled probabilities of variable-length sequences, + which is a 2-D Tensor with LoD information, or a 3-D Tensor without Lod + information. When it is a 2-D LodTensor, its shape is + `[Lp, num_classes + 1]`, where `Lp` is the sum of all input + sequences' length and `num_classes` is the true number of classes. + (not including the blank label). When it is a 3-D Tensor, its shape + is `[max_logit_length, batch_size, num_classes + 1]`, + where `max_logit_length` is the longest length of + input logit sequence. The data type should be float32 or float64. + label (Variable): The ground truth of variable-length sequence, + which must be a 2-D Tensor with LoD information or a 3-D Tensor without + LoD information, needs to be consistent with the coressponding input. + When it is a 2-D LoDTensor, its shape is `[Lg, 1]`, where `Lg` is the sum + of all labels' length. When it is a 3-D Tensor, its shape is + `[batch_size, max_label_length]`, where `max_label_length` is the longest + length of label sequence. 
Data type must be int32. + blank (int, default 0): The blank label index of Connectionist + Temporal Classification (CTC) loss, which is in the + half-opened interval `[0, num_classes + 1)`. The data type must be int32. + norm_by_times(bool, default false): Whether to normalize the gradients + by the number of time-step, which is also the sequence's length. + There is no need to normalize the gradients if warpctc layer was + followed by a mean_op. + input_length(Variable): The length for each input sequence if it is + of Tensor type, it should have shape `[batch_size]` and dtype int64. + label_length(Variable): The length for each label sequence if it is + of Tensor type, it should have shape `[batch_size]` and dtype int64. + + Returns: + Variable: The Connectionist Temporal Classification (CTC) loss, + which is a 2-D Tensor with the shape `[batch_size, 1]`. + The date type is the same as input. + + Examples: + + .. code-block:: python + + # using LoDTensor + import paddle + import paddle.fluid as fluid + import numpy as np + + # lengths of logit sequences + seq_lens = [2,6] + # lengths of label sequences + label_lens = [2,3] + # class num + class_num = 5 + + paddle.enable_static() + logits = fluid.data(name='logits',shape=[None, class_num+1], + dtype='float32',lod_level=1) + label = fluid.data(name='label', shape=[None, 1], + dtype='int32', lod_level=1) + cost = fluid.layers.warpctc(input=logits, label=label) + place = fluid.CPUPlace() + x = fluid.create_lod_tensor( + np.random.rand(np.sum(seq_lens), class_num+1).astype("float32"), + [seq_lens], place) + y = fluid.create_lod_tensor( + np.random.randint(0, class_num, [np.sum(label_lens), 1]).astype("int32"), + [label_lens], place) + exe = fluid.Executor(place) + output= exe.run(fluid.default_main_program(), + feed={"logits": x,"label": y}, + fetch_list=[cost.name]) + print(output) + + .. code-block:: python + + # using Tensor + import paddle + import paddle.fluid as fluid + import numpy as np + + # length of the longest logit sequence + max_seq_length = 5 + #length of the longest label sequence + max_label_length = 3 + # number of logit sequences + batch_size = 16 + # class num + class_num = 5 + paddle.enable_static() + logits = fluid.data(name='logits', + shape=[max_seq_length, batch_size, class_num+1], + dtype='float32') + logits_length = fluid.data(name='logits_length', shape=[None], + dtype='int64') + label = fluid.data(name='label', shape=[batch_size, max_label_length], + dtype='int32') + label_length = fluid.data(name='labels_length', shape=[None], + dtype='int64') + cost = fluid.layers.warpctc(input=logits, label=label, + input_length=logits_length, + label_length=label_length) + place = fluid.CPUPlace() + x = np.random.rand(max_seq_length, batch_size, class_num+1).astype("float32") + y = np.random.randint(0, class_num, [batch_size, max_label_length]).astype("int32") + exe = fluid.Executor(place) + output= exe.run(fluid.default_main_program(), + feed={"logits": x, + "label": y, + "logits_length": np.array([max_seq_length]*batch_size).astype("int64"), + "labels_length": np.array([max_label_length]*batch_size).astype("int64")}, + fetch_list=[cost.name]) + print(output) + """ + if in_dygraph_mode(): + if input_length is None or label_length is None: + raise ValueError( + "input_length and label_length must not be None in dygraph mode!" 
+ ) + grad, loss_out = _C_ops.warpctc( + input, + label, + input_length, + label_length, + 'blank', + blank, + 'norm_by_times', + norm_by_times, ) + return loss_out + helper = LayerHelper('warpctc', **locals()) + check_variable_and_dtype(input, 'input', ['float32', 'float64'], "warpctc") + check_variable_and_dtype(label, 'label', ['int32'], "warpctc") + this_inputs = {'Logits': [input], 'Label': [label]} + if input_length is not None and label_length is not None: + check_variable_and_dtype(input_length, 'LogitsLength', ['int64'], + "warpctc") + check_variable_and_dtype(label_length, 'LabelLength', ['int64'], + "warpctc") + this_inputs['LogitsLength'] = [input_length] + this_inputs['LabelLength'] = [label_length] + + loss_out = helper.create_variable_for_type_inference(dtype=input.dtype) + grad_out = helper.create_variable_for_type_inference(dtype=input.dtype) + + helper.append_op( + type='warpctc', + inputs=this_inputs, + outputs={'WarpCTCGrad': [grad_out], + 'Loss': [loss_out]}, + attrs={ + 'blank': blank, + 'norm_by_times': norm_by_times, + }) + return loss_out + + +# FIXME(wuyi): let docstring_checker.py understand @autodoc. +# For now, the comments in c++ use types like Tensor, but in python side +# the type is often "Variable", and arguments may vary. +@static_only +@templatedoc(op_type="nce") +def nce(input, + label, + num_total_classes, + sample_weight=None, + param_attr=None, + bias_attr=None, + num_neg_samples=None, + name=None, + sampler="uniform", + custom_dist=None, + seed=0, + is_sparse=False): + """ + :api_attr: Static Graph + + ${comment} + + Args: + input (Tensor): Input tensor, 2-D tensor with shape [batch_size, dim], + and data type is float32 or float64. + label (Tensor): Input label, 2-D tensor with shape [batch_size, num_true_class], + and data type is int64. + num_total_classes (int):${num_total_classes_comment}. + sample_weight (Tensor|None): A Tensor of shape [batch_size, 1] + storing a weight for each sample. The default weight for each + sample is 1.0. + param_attr (ParamAttr|None): To specify the weight parameter attribute. + Default: None, which means the default weight parameter property is + used. See usage for details in :ref:`api_fluid_ParamAttr` . + bias_attr (ParamAttr|None): To specify the bias parameter attribute. + Default: None, which means the default bias parameter property is + used. See usage for details in :ref:`api_fluid_ParamAttr` . + num_neg_samples (int): ${num_neg_samples_comment}. + name(str|None): For detailed information, please refer to + :ref:`api_guide_Name` . Usually name is no need to set and None by default. + sampler (str, optional): The sampler used to sample class from negative classes. + It can be 'uniform', 'log_uniform' or 'custom_dist'. + default: 'uniform'. + custom_dist (nd.array|None): A numpy ndarray with size=num_total_classes. + It is used when sampler is set to 'custom_dist'. + custom_dist[i] is the probability of i-th class to be sampled. + default: None. + seed (int, optional): The seed used in sampler. Default 0, means no random seed. + is_sparse(bool, optional): The flag indicating whether to use sparse update, + the weight@GRAD and bias@GRAD will be changed to SelectedRows. Default False. + + Returns: + Tensor: The output nce loss. + + Examples: + .. 
code-block:: python + + + import paddle + import numpy as np + + paddle.enable_static() + + window_size = 5 + words = [] + for i in range(window_size): + words.append(paddle.static.data( + name='word_{0}'.format(i), shape=[-1, 1], dtype='int64')) + + dict_size = 10000 + label_word = int(window_size / 2) + 1 + + embs = [] + for i in range(window_size): + if i == label_word: + continue + + emb = paddle.static.nn.embedding(input=words[i], size=[dict_size, 32], + param_attr='embed', is_sparse=True) + embs.append(emb) + + embs = paddle.concat(x=embs, axis=1) + loss = paddle.static.nn.nce(input=embs, label=words[label_word], + num_total_classes=dict_size, param_attr='nce.w_0', + bias_attr='nce.b_0') + + #or use custom distribution + dist = np.array([0.05,0.5,0.1,0.3,0.05]) + loss = paddle.static.nn.nce(input=embs, label=words[label_word], + num_total_classes=5, param_attr='nce.w_1', + bias_attr='nce.b_1', + num_neg_samples=3, + sampler="custom_dist", + custom_dist=dist) + """ + helper = LayerHelper('nce', **locals()) + check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'nce') + check_variable_and_dtype(label, 'label', ['int64'], 'nce') + + dim = input.shape[1] + num_true_class = label.shape[1] + w = helper.create_parameter( + attr=helper.param_attr, + shape=[num_total_classes, dim], + is_bias=False, + dtype=input.dtype) + inputs = {} + if helper.bias_attr: + b = helper.create_parameter( + attr=helper.bias_attr, + shape=[num_total_classes, 1], + is_bias=True, + dtype=input.dtype) + inputs['Bias'] = b + cost = helper.create_variable_for_type_inference(dtype=input.dtype) + sample_logits = helper.create_variable_for_type_inference(dtype=input.dtype) + sample_labels = helper.create_variable_for_type_inference(dtype=label.dtype) + + inputs['Input'] = input + inputs['Label'] = label + inputs['Weight'] = w + inputs['SampleWeight'] = sample_weight if sample_weight is not None else [] + + if sampler == "uniform": + sampler = 0 + elif sampler == "log_uniform": + sampler = 1 + elif sampler == "custom_dist": + assert custom_dist is not None + + custom_dist_len = num_total_classes + alias_probs_ = [0] * custom_dist_len + alias_ = [0] * custom_dist_len + bigs = [] + littles = [] + for i in range(custom_dist_len): + normal_prob = custom_dist[i] * custom_dist_len + if normal_prob - 1.0 > 0: + bigs.append((i, normal_prob)) + elif 1.0 - normal_prob > 0: + littles.append((i, normal_prob)) + else: + alias_probs_[i] = normal_prob + alias_[i] = -1 + + while len(bigs) and len(littles): + big = bigs.pop(0) + little = littles.pop(0) + + big_idx = big[0] + big_prob = big[1] + + alias_probs_[little[0]] = little[1] + alias_[little[0]] = big_idx + big_left = big[1] + little[1] - 1 + if big_left - 1.0 > 0: + bigs.append((big_idx, big_left)) + elif 1.0 - big_left > 0: + littles.append((big_idx, big_left)) + else: + alias_probs_[big_idx] = big_left + alias_[big_idx] = -1 + + if len(bigs): + big = bigs.pop(0) + alias_probs_[big[0]] = 1.0 + alias_[big[0]] = -1 + if len(littles): + little = littles.pop(0) + alias_probs_[little[0]] = 1.0 + alias_[little[0]] = -1 + + def _init_by_numpy_array(numpy_array): + ret = helper.create_parameter( + attr=ParamAttr(), + shape=numpy_array.shape, + dtype=numpy_array.dtype, + default_initializer=NumpyArrayInitializer(numpy_array)) + ret.stop_gradient = True + return ret + + inputs['CustomDistProbs'] = _init_by_numpy_array( + np.array(custom_dist).astype('float32')) + inputs['CustomDistAlias'] = _init_by_numpy_array( + np.array(alias_).astype('int32')) + 
inputs['CustomDistAliasProbs'] = _init_by_numpy_array( + np.array(alias_probs_).astype('float32')) + sampler = 2 + else: + raise Exception("Unsupported sampler type.") + + if num_neg_samples is None: + num_neg_samples = 10 + else: + num_neg_samples = int(num_neg_samples) + + remote_prefetch = is_sparse + print( + "With sparse mode, if your models has only small parameter prefetch may cause speed down" + ) + + attrs = { + 'num_total_classes': int(num_total_classes), + 'num_neg_samples': num_neg_samples, + 'seed': seed, + 'sampler': sampler, + 'is_sparse': is_sparse, + 'remote_prefetch': remote_prefetch + } + + helper.append_op( + type='nce', + inputs=inputs, + outputs={ + 'Cost': cost, + 'SampleLogits': sample_logits, + 'SampleLabels': sample_labels + }, + attrs=attrs) + return cost / (num_neg_samples + 1) + + +def hsigmoid(input, + label, + num_classes, + param_attr=None, + bias_attr=None, + name=None, + path_table=None, + path_code=None, + is_custom=False, + is_sparse=False): + """ + :api_attr: Static Graph + + The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity + and speed up the model training, especially the training of language model. + Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier. + For each class(word), there's a unique path from root to itself, hsigmoid calculate the cost for each non-leaf node on + the path, and sum them to get a total cost. + Comparing to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N` + represents the number of classes or the size of word dict. + + The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural + Network Language Model `. For the custom + tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example): + + 1. Using a custom word dict to build a binary tree, each leaf node should be an word in the word dict. + 2. Creating a dict map word_id -> path that from the word to the root node, we call it path_table. + 3. Creating a dict map word_id -> code of path that from the word to the root node, we call it path_code. + Code means the label of each binary classifier, 1 indicate true, 0 indicate false. + 4. Now, each word should has its path and code along the path, you can pass a batch of path and code related + to the same batch of inputs. + + Parameters: + input (Variable): A tensor with the shape [N, D], where N is the size of mini-batch, + and D is the feature size. Its data type supports float32 and float64. + label (Variable): A tensor contains the labels of training data. Its shape is [N, 1] + and data type is int64. + num_classes (int): The number of classes or the size of word dict, must be greater than 2. + If the default tree is used (:attr:`is_custom` is set to False), :attr:`num_classes` + should not be None. If the custom tree is used (:attr:`is_custom` is set to True), + :attr:`num_classes` should be the number of non-leaf nodes, which indicates the num of + classes using by the binary classifier. + param_attr (ParamAttr, optional): The parameter attribute for the learnable parameters/weights + of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid will create a + ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is + initialized with Xavier. Default: None. 
+ bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of hsigmoid. If it + is set to False, no bias will be added. If it is set to None or one attribute of ParamAttr, + hsigmoid will create a ParamAttr as bias_attr. If the Initializer of the bias_attr is not + set, the bias is initialized zero. Default: None. + name (str, optional): Normally there is no need for user to set this property. For more information, + please refer to :ref:`api_guide_Name`. Default: None. + path_table (Variable, optional): A tensor that stores each batch of samples' path from leaf to root + node, its shape is [N, L] and data type is int64, where L is the length of path. For each sample i, + path_table[i] is a np.array like structure and each element in this array is the indexes in parent + nodes' weight matrix. Default: None. + path_code (Variable, optional): A tensor that stores each batch of samples' code of path from leaf + to root node, its shape is [N, L] and data type is int64, which is the same as :attr:`path_table`. + Each code of path is consisted with the code of nodes from leaf to root node. Default: None. + is_custom (bool, optional): Whether use custom binary tree. If it's True, :attr:`path_table`, + :attr:`path_code` and :attr:`num_classes` should be set, otherwise :attr:`num_classes` should + be set. Default: False. + is_sparse (bool, optional): Whether use sparse updating instead of dense updating, if it's True, the + gradient of W and input will be sparse. Default: False. + + Returns: + Variable: A tensor with the cost of hierarchical sigmoid, its shape is [N, 1] and data type is the same as :attr:`input`. + + Examples: + + .. code-block:: python + + import paddle.fluid as fluid + x = fluid.layers.fill_constant(shape=[4, 3], value=0.9, dtype='float32') + # x = [[0.9, 0.9, 0.9], [0.9, 0.9, 0.9], [0.9, 0.9, 0.9], [0.9, 0.9, 0.9]] + y = fluid.layers.fill_constant( + shape=[4, 1], value=1, dtype='int64') + # y = [[1], [1], [1], [1]] + out = fluid.layers.hsigmoid(input=x, label=y, num_classes=2, param_attr=fluid.initializer.Constant( + value=0.05), bias_attr=fluid.initializer.Constant(value=.0)) + # out = [[0.62792355], [0.62792355], [0.62792355], [0.62792355]] + """ + check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'hsigmoid') + check_variable_and_dtype(label, 'label', ['int64'], 'hsigmoid') + + helper = LayerHelper('hierarchical_sigmoid', **locals()) + dtype = helper.input_dtype() + out = helper.create_variable_for_type_inference(dtype) + pre_out = helper.create_variable_for_type_inference(dtype) + dim = input.shape[1] + if ((num_classes is None) or (num_classes < 2)) and (not is_custom): + raise ValueError( + "num_classes must not be less than 2 with default tree") + + if (not is_custom) and (is_sparse): + print("Sparse mode should not be used without custom tree") + is_sparse = False + + if (not is_custom) and ((path_table is not None) or + (path_code is not None)): + raise ValueError( + "only num_classes should be passed without custom tree") + + if (is_custom) and (path_code is None): + raise ValueError("path_code should not be None with custom tree") + elif (is_custom) and (path_table is None): + raise ValueError("path_table should not be None with custom tree") + elif (is_custom) and (num_classes is None): + raise ValueError("num_classes should not be None with custom tree") + else: + pass + + weights = None + remote_prefetch = is_sparse + print( + "With sparse mode, if your models has only small parameter prefetch may cause speed down" + ) + if not 
is_custom: + weights = helper.create_parameter( + attr=helper.param_attr, + shape=[num_classes - 1, dim], + is_bias=False, + dtype=input.dtype) + else: + weights = helper.create_parameter( + attr=helper.param_attr, + shape=[num_classes, dim], + is_bias=False, + dtype=input.dtype) + inputs = { + "X": input, + "W": weights, + "PathTable": path_table, + "PathCode": path_code, + "Label": label + } + if helper.bias_attr: + if not is_custom: + bias = helper.create_parameter( + attr=helper.bias_attr, + shape=[num_classes - 1, 1], + is_bias=True, + dtype=input.dtype) + inputs['Bias'] = bias + else: + bias = helper.create_parameter( + attr=helper.bias_attr, + shape=[num_classes, 1], + is_bias=True, + dtype=input.dtype) + inputs['Bias'] = bias + helper.append_op( + type="hierarchical_sigmoid", + inputs=inputs, + outputs={"Out": out, + "PreOut": pre_out, + "W_Out": weights}, + attrs={ + "num_classes": num_classes, + "is_sparse": is_sparse, + "remote_prefetch": remote_prefetch + }) + return out + + +def sampled_softmax_with_cross_entropy(logits, + label, + num_samples, + num_true=1, + remove_accidental_hits=True, + use_customized_samples=False, + customized_samples=None, + customized_probabilities=None, + seed=0): + """ + **Sampled Softmax With Cross Entropy Operator.** + + Cross entropy loss with sampled softmax is used as the output layer for + larger output classes extensively. This operator samples a number of samples + for all examples, and computes the softmax normalized values for each + row of the sampled tensor, after which cross-entropy loss is computed. + + Because this operator performs a softmax on logits internally, it expects + unscaled logits. This operator should not be used with the output of + softmax operator since that would produce incorrect results. + + For examples with T true labels (T >= 1), we assume that each true label has + a probability of 1/T. For each sample, S samples are generated using a + log uniform distribution. True labels are concatenated with these samples to + form T + S samples for each example. So, assume the shape of logits is + [N x K], the shape for samples is [N x (T+S)]. For each sampled label, a + probability is calculated, which corresponds to the Q(y|x) in + [Jean et al., 2014](http://arxiv.org/abs/1412.2007). + + Logits are sampled according to the sampled labels. Then if + remove_accidental_hits is True, if a sample[i, j] accidentally hits true + labels, then the corresponding sampled_logits[i, j] is minus by 1e20 to + make its softmax result close to zero. Then sampled logits are subtracted by + logQ(y|x), these sampled logits and re-indexed labels are used to compute + a softmax with cross entropy. + + Args: + logits (Variable): The unscaled log probabilities, which is a 2-D tensor + with shape [N x K]. N is the batch_size, and K is the class number. + label (Variable): The ground truth which is a 2-D tensor. Label is a + Tensor with shape [N x T], where T is the number of true + labels per example. + num_samples (int): The number for each example, num_samples should be + less than the number of class. + num_true(int): The number of target classes per training example. + remove_accidental_hits (bool): A flag indicating whether to remove + accidental hits when sampling. If True and if a sample[i, j] + accidentally hits true labels, then the corresponding + sampled_logits[i, j] is minus by 1e20 to make its softmax result + close to zero. Default is True. 
+ use_customized_samples (bool): Whether to use custom samples and probabities to sample + logits. + customized_samples (Variable): User defined samples, which is a 2-D tensor + with shape [N, T + S]. S is the num_samples, and T is the number of true + labels per example. + customized_probabilities (Variable): User defined probabilities of samples, + a 2-D tensor which has the same shape with customized_samples. + seed (int): The random seed for generating random number, which is used + in the process of sampling. Default is 0. + + Returns: + Variable: Return the cross entropy loss which is a 2-D tensor with shape + [N x 1]. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + + input = fluid.layers.data(name='data', shape=[256], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + fc = fluid.layers.fc(input=input, size=100) + out = fluid.layers.sampled_softmax_with_cross_entropy( + logits=fc, label=label, num_samples=25) + """ + helper = LayerHelper('sample_logits', **locals()) + samples = customized_samples if use_customized_samples else helper.create_variable_for_type_inference( + dtype='int64') + probabilities = customized_probabilities if use_customized_samples else helper.create_variable_for_type_inference( + dtype=logits.dtype) + sampled_logits \ + = helper.create_variable_for_type_inference(dtype=logits.dtype) + sampled_label = helper.create_variable_for_type_inference(dtype='int64') + sampled_softlabel = helper.create_variable_for_type_inference( + dtype=logits.dtype) + logits_dim = helper.create_variable_for_type_inference(dtype=logits.dtype) + labels_dim = helper.create_variable_for_type_inference(dtype=label.type) + + helper.append_op( + type='sample_logits', + inputs={ + 'Logits': logits, + 'Labels': label, + 'CustomizedSamples': customized_samples, + 'CustomizedProbabilities': customized_probabilities + }, + outputs={ + 'Samples': samples, + 'Probabilities': probabilities, + 'SampledLabels': sampled_label, + 'SampledLogits': sampled_logits, + 'LogitsDim': logits_dim, + 'LabelsDim': labels_dim + }, + attrs={ + 'use_customized_samples': use_customized_samples, + 'uniq': True, + 'remove_accidental_hits': remove_accidental_hits, + 'num_samples': num_samples, + 'seed': seed + }) + loss = helper.create_variable_for_type_inference(dtype=logits.dtype) + softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) + helper.append_op( + type='one_hot', + inputs={'X': sampled_label}, + attrs={'depth': num_samples + 1}, + outputs={'Out': sampled_softlabel}) + + helper.append_op( + type='softmax_with_cross_entropy', + inputs={'Logits': sampled_logits, + 'Label': sampled_softlabel}, + outputs={'Softmax': softmax, + 'Loss': loss}, + attrs={ + 'soft_label': True, + 'ignore_index': False, + 'numeric_stable_mode': False + }) + return loss / num_true + + +def softmax_with_cross_entropy(logits, + label, + soft_label=False, + ignore_index=kIgnoreIndex, + numeric_stable_mode=True, + return_softmax=False, + axis=-1): + r""" + + This operator implements the cross entropy loss function with softmax. This function + combines the calculation of the softmax operation and the cross entropy loss function + to provide a more numerically stable gradient. + + Because this operator performs a softmax on logits internally, it expects + unscaled logits. This operator should not be used with the output of + softmax operator since that would produce incorrect results. 
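    The numerically stable path documented below boils down to the log-sum-exp
    trick. A minimal NumPy sketch of the hard-label case, assuming hypothetical
    arrays ``logits_np`` of shape [N, K] (unscaled logits) and integer class ids
    ``labels_np`` of shape [N]; ``stable_softmax_ce`` is only an illustrative name,
    and the actual computation is performed by the C++ operator:

        import numpy as np

        def stable_softmax_ce(logits_np, labels_np):
            # subtract the per-row max before exponentiating (max_j in the equations below)
            max_j = logits_np.max(axis=1, keepdims=True)
            log_max_sum = np.log(np.exp(logits_np - max_j).sum(axis=1, keepdims=True))
            log_softmax = logits_np - max_j - log_max_sum
            # hard-label loss: negative log-probability of the true class, per sample
            return -log_softmax[np.arange(logits_np.shape[0]), labels_np]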
+ + When the attribute :attr:`soft_label` is set :attr:`False`, this operators + expects mutually exclusive hard labels, each sample in a batch is in exactly + one class with a probability of 1.0. Each sample in the batch will have a + single label. + + The equation is as follows: + + 1) Hard label (one-hot label, so every sample has exactly one class) + + .. math:: + + loss_j = -\\text{logits}_{label_j} + + \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{logits}_i)\\right), j = 1,..., K + + 2) Soft label (each sample can have a distribution over all classes) + + .. math:: + + loss_j = -\\sum_{i=0}^{K}\\text{label}_i + \\left(\\text{logits}_i - \\log\\left(\\sum_{i=0}^{K} + \\exp(\\text{logits}_i)\\right)\\right), j = 1,...,K + + 3) If :attr:`numeric_stable_mode` is :attr:`True`, softmax is calculated first by: + + .. math:: + + max_j &= \\max_{i=0}^{K}{\\text{logits}_i} + + log\\_max\\_sum_j &= \\log\\sum_{i=0}^{K}\\exp(logits_i - max_j) + + softmax_j &= \\exp(logits_j - max_j - {log\\_max\\_sum}_j) + + and then cross entropy loss is calculated by softmax and label. + + Args: + logits (Tensor): A multi-dimension ``Tensor`` , and the data type is float32 or float64. The input tensor of unscaled log probabilities. + label (Tensor): The ground truth ``Tensor`` , data type is the same + as the ``logits`` . If :attr:`soft_label` is set to :attr:`True`, + Label is a ``Tensor`` in the same shape with :attr:`logits`. + If :attr:`soft_label` is set to :attr:`True`, Label is a ``Tensor`` + in the same shape with :attr:`logits` expect shape in dimension :attr:`axis` as 1. + soft_label (bool, optional): A flag to indicate whether to interpretant the given + labels as soft labels. Default False. + ignore_index (int, optional): Specifies a target value that is ignored and does + not contribute to the input gradient. Only valid + if :attr:`soft_label` is set to :attr:`False`. + Default: kIgnoreIndex(-100). + numeric_stable_mode (bool, optional): A flag to indicate whether to use a more + numerically stable algorithm. Only valid + when :attr:`soft_label` is :attr:`False` + and GPU is used. When :attr:`soft_label` + is :attr:`True` or CPU is used, the + algorithm is always numerically stable. + Note that the speed may be slower when use + stable algorithm. Default: True. + return_softmax (bool, optional): A flag indicating whether to return the softmax + along with the cross entropy loss. Default: False. + axis (int, optional): The index of dimension to perform softmax calculations. It + should be in range :math:`[-1, rank - 1]`, while :math:`rank` + is the rank of input :attr:`logits`. Default: -1. + + Returns: + ``Tensor`` or Tuple of two ``Tensor`` : Return the cross entropy loss if \ + `return_softmax` is False, otherwise the tuple \ + (loss, softmax), softmax is in the same shape \ + with input logits and cross entropy loss is in \ + the same shape with input logits except shape \ + in dimension :attr:`axis` as 1. + + Examples: + .. 
code-block:: python + + import paddle + import numpy as np + + data = np.random.rand(128).astype("float32") + label = np.random.rand(1).astype("int64") + data = paddle.to_tensor(data) + label = paddle.to_tensor(label) + linear = paddle.nn.Linear(128, 100) + x = linear(data) + out = paddle.nn.functional.softmax_with_cross_entropy(logits=x, label=label) + print(out) + """ + if in_dygraph_mode(): + if core.is_compiled_with_npu(): + softmax, backprop, loss = _C_ops.softmax_with_cross_entropy( + logits, label, 'soft_label', soft_label, 'ignore_index', + ignore_index, 'numeric_stable_mode', numeric_stable_mode, + 'axis', axis) + else: + softmax, loss = _C_ops.softmax_with_cross_entropy( + logits, label, 'soft_label', soft_label, 'ignore_index', + ignore_index, 'numeric_stable_mode', numeric_stable_mode, + 'axis', axis) + if not return_softmax: + return loss + else: + return loss, softmax + + attrs = { + 'soft_label': soft_label, + 'ignore_index': ignore_index, + 'numeric_stable_mode': numeric_stable_mode, + 'axis': axis + } + helper = LayerHelper('softmax_with_cross_entropy', **locals()) + softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) + loss = helper.create_variable_for_type_inference(dtype=logits.dtype) + + outputs = {'Softmax': softmax, 'Loss': loss} + if core.is_compiled_with_npu(): + backprop = helper.create_variable_for_type_inference(dtype=logits.dtype) + outputs['Backprop'] = backprop + helper.append_op( + type='softmax_with_cross_entropy', + inputs={'Logits': logits, + 'Label': label}, + outputs=outputs, + attrs=attrs) + + if return_softmax: + return loss, softmax + + return loss + + +def rank_loss(label, left, right, name=None): + r""" + + This operator implements the sort loss layer in the RankNet model. RankNet is a pairwise ranking model + with a training sample consisting of a pair of documents (A and B), The label (P) + indicates whether A is ranked higher than B or not. Please refer to more details: + `RankNet `_ + + Rank loss layer takes three inputs: left ( :math:`o_i` ), right ( :math:`o_j` ) and + label ( :math:`P_{i,j}` ). The inputs respectively represent RankNet's output scores + for documents A and B and the value of label P. Rank loss layer takes batch inputs + with size batch_size (batch_size >= 1), P = {0, 1} or {0, 0.5, 1}, + where 0.5 means that there is no information about the rank of the input pair. + The following equation computes rank loss C_{i,j} from the inputs: + + .. math:: + C_{i,j} &= -\\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\\\ + .. math:: + o_{i,j} &= o_i - o_j \\\\ + .. math:: + \\tilde{P_{i,j}} &= \\left \{0, 0.5, 1 \\right \} \ or \ \\left \{0, 1 \\right \} + + Parameters: + label (Variable): 2-D ``Tensor`` with the shape of :math:`[batch,1]`, the data type is float32, batch indicates the size of the data. Indicats whether A ranked higher than B or not. + left (Variable): 2-D ``Tensor`` with the shape of :math:`[batch,1]`, the data type is float32. RankNet's output score for doc A. + right (Variable): 2-D ``Tensor`` with the shape of :math:`[batch,1]`, the data type is float32. RankNet's output score for doc B. + name(str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Variable: ``Tensor`` indicating the output value of the sort loss layer, the data type is float32, and the return value's shape is :math:`[batch,1]` . + + Raises: + ValueError: Any of label, left, and right is not a ``Variable`` . 
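    A quick NumPy check of the equation above, using hypothetical scores
    ``left = 2.0`` and ``right = 1.0`` (``rank_loss_ref`` is only an illustrative
    reference, not part of the API):

        import numpy as np

        def rank_loss_ref(p, left, right):
            o_ij = left - right
            # C_{i,j} = -P~ * o_{i,j} + log(1 + exp(o_{i,j}))
            return -p * o_ij + np.log1p(np.exp(o_ij))

        print(rank_loss_ref(1.0, 2.0, 1.0))  # ~0.313, label agrees with the score ordering
        print(rank_loss_ref(0.0, 2.0, 1.0))  # ~1.313, label disagrees, larger loss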
+ + Examples: + + .. code-block:: python + + import paddle.fluid as fluid + import paddle + paddle.enable_static() + label = fluid.data(name="label", shape=[-1, 1], dtype="float32") + left = fluid.data(name="left", shape=[-1, 1], dtype="float32") + right = fluid.data(name="right", shape=[-1, 1], dtype="float32") + out = fluid.layers.rank_loss(label, left, right) + + """ + helper = LayerHelper('rank_loss', **locals()) + check_variable_and_dtype(label, 'label', ['float32'], "rank_loss") + check_variable_and_dtype(left, 'left', ['float32'], "rank_loss") + check_variable_and_dtype(right, 'right', ['float32'], "rank_loss") + + out = helper.create_variable_for_type_inference("float32") + + helper.append_op( + type='rank_loss', + inputs={"Label": label, + "Left": left, + "Right": right}, + outputs={'Out': out}) + return out + + +def margin_rank_loss(label, left, right, margin=0.1, name=None): + r""" + Margin Ranking Loss Layer for ranking problem, + which compares left score and right score passed in. + The ranking loss can be defined as following equation: + + .. math:: + + rank\_loss = max(0, -label * (left - right) + margin) + + Args: + label (Variable): Indicates whether the left is ranked higher than the right or not. + Data type is float32. + left (Variable): Ranking score for left. Data type float32. + right (Variable): Ranking score for right. Data type float32. + margin (float): Indicates the given margin. + name(str|None): For detailed information, please refer to + :ref:`api_guide_Name` . Usually name is no need to set and None by default. + + Returns: + Variable: The ranking loss. + + Raises: + ValueError: Any of label, left, and right is not a Variable. + + Examples: + + .. code-block:: python + + import paddle.fluid as fluid + label = fluid.data(name="label", shape=[-1, 1], dtype="float32") + left = fluid.data(name="left", shape=[-1, 1], dtype="float32") + right = fluid.data(name="right", shape=[-1, 1], dtype="float32") + out = fluid.layers.margin_rank_loss(label, left, right) + """ + helper = LayerHelper('margin_rank_loss', **locals()) + check_variable_and_dtype(label, 'label', ['float32'], 'margin_rank_loss') + check_variable_and_dtype(label, 'left', ['float32'], 'margin_rank_loss') + check_variable_and_dtype(label, 'right', ['float32'], 'margin_rank_loss') + out = helper.create_variable_for_type_inference(left.dtype) + act = helper.create_variable_for_type_inference(left.dtype) + helper.append_op( + type='margin_rank_loss', + inputs={"Label": label, + "X1": left, + "X2": right}, + outputs={'Out': out, + 'Activated': act}, + attrs={'margin': margin}) + return out + + +@templatedoc() +def sigmoid_cross_entropy_with_logits(x, + label, + ignore_index=kIgnoreIndex, + name=None, + normalize=False): + """ + + ${comment} + + Args: + x(Tensor): a 2-D tensor with shape N x D, where N is the batch size and + D is the number of classes. This input is a tensor of logits computed + by the previous operator. Logits are unscaled log probabilities given + as log(p/(1-p)) The data type should be float32 or float64. + label (Tensor): a 2-D tensor of the same type and shape as X. + This input is a tensor of probabalistic labels for each logit. + ignore_index(int): Specifies a target value that is ignored and + does not contribute to the input gradient. + name(str|None): The default value is None. Normally there is + no need for user to set this property. 
For more information, + please refer to :ref:`api_guide_Name` + normalize(bool): If true, divide the output by the number of + targets != ignore_index. + + Returns: + out(Tensor): ${out_comment} + + Examples: + .. code-block:: python + + + import paddle + + input = paddle.rand(shape=[10], dtype='float32') + label = paddle.rand(shape=[10], dtype='float32') + loss = paddle.fluid.layers.sigmoid_cross_entropy_with_logits(input, label, + ignore_index=-1, normalize=True) + print(loss) + """ + check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], + 'sigmoid_cross_entropy_with_logits') + + helper = LayerHelper("sigmoid_cross_entropy_with_logits", **locals()) + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type="sigmoid_cross_entropy_with_logits", + inputs={"X": x, + "Label": label}, + attrs={"ignore_index": ignore_index, + 'normalize': normalize}, + outputs={"Out": out}) + return out + + +def teacher_student_sigmoid_loss(input, + label, + soft_max_up_bound=15.0, + soft_max_lower_bound=-15.0): + """ + + **Teacher Student Log Loss Layer** + + This layer accepts input predictions and target label and returns the + teacher_student loss. Z is click or not, z' is value of teacher loss, label = {-2, -1, [0, 2]} + when z' is not exist, clk = 0 : label = -2; when z' is not exist, clk = 1 : label = -1; + when z' is exist , clk = 0 : label = 0 + z'; when z' is exist , clk = 1 : label = 1 + z' + + .. math:: + loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x))) + + Args: + input (Variable|list): a 2-D tensor with shape [N x 1], where N is the + batch size. This input is a probability computed + by the previous operator. + label (Variable|list): the ground truth which is a 2-D tensor with + shape [N x 1], where N is the batch size. + soft_max_up_bound (float): if input > soft_max_up_bound, will be bound + soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound + + Returns: + Variable: A 2-D tensor with shape [N x 1], the teacher_student_sigmoid_loss. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import paddle + paddle.enable_static() + batch_size = 64 + label = fluid.data( + name="label", shape=[batch_size, 1], dtype="int64") + similarity = fluid.data( + name="similarity", shape=[batch_size, 1], dtype="float32") + cost = fluid.layers.teacher_student_sigmoid_loss(input=similarity, label=label) + + """ + check_variable_and_dtype(input, "input", + ['float32', 'float64', 'int32', 'int64'], + 'teacher_student_sigmoid_loss') + check_variable_and_dtype(label, "label", + ['float32', 'float64', 'int32', 'int64'], + 'teacher_student_sigmoid_loss') + + helper = LayerHelper('teacher_student_sigmoid_loss', **locals()) + out = helper.create_variable(dtype=input.dtype) + helper.append_op( + type='teacher_student_sigmoid_loss', + inputs={'X': [input], + 'Label': [label]}, + outputs={'Y': [out]}, + attrs={"soft_max_lower_bound": float(soft_max_lower_bound), \ + "soft_max_up_bound": float(soft_max_up_bound)}) + return out + + +def huber_loss(input, label, delta): + r""" + This operator computes the Huber loss between input and label. + Huber loss is commonly used in regression tasks. Compared to square_error_cost, Huber loss is more robust and less sensitivity to outliers. + + When the absolute difference between input and label is greater than delta, the linear error is calculated: + + .. 
math:: + huber\_loss = delta * (label - input) - 0.5 * delta * delta + + When the absolute difference between input and label is greater than delta, the square error is calculated: + + .. math:: + huber\_loss = 0.5 * (label - input) * (label - input) + + + Args: + input (Variable): Predicted data, 2D-Tensor with the shape of [batch_size, 1]. The data type should be float32. + label (Variable): Ground truth label, 2D-Tensor with the shape of [batch_size, 1]. The data type should be float32. + delta (float): The threshold for Huber loss, which is used to control the balance between the linear error and square error. The data type should be float32. + + Returns: + Variable: The huber loss, a tensor with the same shape and data type as input. + + + Examples: + + .. code-block:: python + + import paddle.fluid as fluid + import numpy as np + + DATATYPE='float32' + input_data = np.array([[1.],[2.],[3.],[4.]]).astype(DATATYPE) + label_data = np.array([[3.],[3.],[4.],[4.]]).astype(DATATYPE) + + x = fluid.data(name='input', shape=[None, 1], dtype=DATATYPE) + y = fluid.data(name='label', shape=[None, 1], dtype=DATATYPE) + loss = fluid.layers.huber_loss(input=x, label=y, delta=1.0) + + place = fluid.CPUPlace() + #place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + HuberLoss, = exe.run(feed={'input':input_data ,'label':label_data}, fetch_list=[loss.name]) + print(HuberLoss) #[[1.5], [0.5], [0.5], [0. ]], dtype=float32 + """ + helper = LayerHelper('huber_loss', **locals()) + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'huber_loss') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'huber_loss') + residual = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + helper.append_op( + type='huber_loss', + inputs={'X': input, + 'Y': label}, + outputs={'Out': out, + 'Residual': residual}, + attrs={'delta': delta}) + return out + + +@deprecated(since="2.0.0", update_to="paddle.nn.functional.kl_div") +@templatedoc() +def kldiv_loss(x, target, reduction='mean', name=None): + """ + + ${comment} + + Args: + x (Tensor): ${x_comment} + target (Tensor): ${target_comment} + reduction (Tensor): ${reduction_comment} + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: The KL divergence loss. The data type is same as input tensor + + Examples: + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + + x = paddle.rand(shape=[3,4,2,2], dtype='float32') + target = paddle.rand(shape=[3,4,2,2], dtype='float32') + + # 'batchmean' reduction, loss shape will be [1] + loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='batchmean') + print(loss.shape) # shape=[1] + + # 'mean' reduction, loss shape will be [1] + loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='mean') + print(loss.shape) # shape=[1] + + # 'sum' reduction, loss shape will be [1] + loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='sum') + print(loss.shape) # shape=[1] + + # 'none' reduction, loss shape is same with X shape + loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='none') + print(loss.shape) # shape=[3, 4, 2, 2] + + """ + helper = LayerHelper('kldiv_loss', **locals()) + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'kldiv_loss') + check_variable_and_dtype(target, 'target', ['float32', 'float64'], + 'kldiv_loss') + check_type(reduction, 'reduction', str, 'kldiv_loss') + loss = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='kldiv_loss', + inputs={'X': x, + 'Target': target}, + outputs={'Loss': loss}, + attrs={'reduction': reduction}) + return loss + + +from .ops import square +from .control_flow import equal + + +def npair_loss(anchor, positive, labels, l2_reg=0.002): + """ + + Npair loss requires paired data. Npair loss has two parts: the first part is L2 + regularizer on the embedding vector; the second part is cross entropy loss which + takes the similarity matrix of anchor and positive as logits. + + For more information, please refer to: + `Improved Deep Metric Learning with Multi class N pair Loss Objective `_ + + Args: + anchor(Tensor): embedding vector for the anchor image. shape=[batch_size, embedding_dims], + the data type is float32 or float64. + positive(Tensor): embedding vector for the positive image. shape=[batch_size, embedding_dims], + the data type is float32 or float64. + labels(Tensor): 1-D tensor. shape=[batch_size], the data type is float32 or float64 or int64. + l2_reg(float32): L2 regularization term on embedding vector, default: 0.002. + + + Returns: + A Tensor representing the npair loss, the data type is the same as anchor, the shape is [1]. + + Examples: + + .. 
code-block:: python + + import paddle + + DATATYPE = "float32" + + anchor = paddle.rand(shape=(18, 6), dtype=DATATYPE) + positive = paddle.rand(shape=(18, 6), dtype=DATATYPE) + labels = paddle.rand(shape=(18,), dtype=DATATYPE) + + npair_loss = paddle.nn.functional.npair_loss(anchor, positive, labels, l2_reg = 0.002) + print(npair_loss) + + """ + check_variable_and_dtype(anchor, 'anchor', ['float32', 'float64'], + 'npair_loss') + check_variable_and_dtype(positive, 'positive', ['float32', 'float64'], + 'positive') + check_variable_and_dtype(labels, 'labels', ['float32', 'float64', 'int64'], + 'labels') + Beta = 0.25 + batch_size = labels.shape[0] + + labels = nn.reshape(labels, shape=[batch_size, 1]) + labels = paddle.tile(labels, repeat_times=[1, batch_size]) + + labels = equal(labels, nn.transpose(labels, perm=[1, 0])).astype('float32') + labels = labels / nn.reduce_sum(labels, dim=1, keep_dim=True) + + l2loss = nn.reduce_mean(nn.reduce_sum(square(anchor), 1)) \ + + nn.reduce_mean(nn.reduce_sum(square(positive), 1)) + l2loss = l2loss * Beta * l2_reg + + similarity_matrix = paddle.matmul( + anchor, positive, transpose_x=False, transpose_y=True) + softmax_ce = softmax_with_cross_entropy( + logits=similarity_matrix, label=labels, soft_label=True) + cross_entropy = nn.reduce_sum(labels * softmax_ce, 0) + celoss = nn.reduce_mean(cross_entropy) + + return l2loss + celoss + + +def mse_loss(input, label): + """ + + This op accepts input predications and target label and returns the mean square error. + + The loss can be described as: + + .. math:: + + Out = MEAN((input - label)^2) + + Parameters: + input (Tensor): Input tensor, the data type should be float32. + label (Tensor): Label tensor, the data type should be float32. + + Returns: + Tensor: The tensor storing the mean square error difference of input and label. + + Return type: Tensor. + + Examples: + .. code-block:: python + + import paddle + input = paddle.to_tensor([1.1, 1.9]) + label = paddle.to_tensor([1.0, 2.0]) + output = paddle.fluid.layers.mse_loss(input, label) + print(output.numpy()) + # [0.01] + """ + check_variable_and_dtype(input, "input", ['float32', 'float64'], 'mse_loss') + check_variable_and_dtype(label, "label", ['float32', 'float64'], 'mse_loss') + return nn.reduce_mean(square_error_cost(input, label)) From b83a1e21846b2bcffd455397c45ed0c29928b972 Mon Sep 17 00:00:00 2001 From: skrBang Date: Thu, 25 Nov 2021 20:17:02 +0800 Subject: [PATCH 09/23] fix Examples bug in English doc --- python/paddle/nn/functional/loss.py | 24 ++++++++++++++++-------- python/paddle/nn/layer/loss.py | 19 +++++++++++++------ 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index f8e1ab447fcb2..dfd2c86f841a5 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -2113,16 +2113,24 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): .. 
code-block:: python import paddle - import bumpy as np - import paddle.nn.functional as F + import numpy as np + import paddle.nn as nn + + input = paddle.to_tensor([[1, -2, 3], [0, -1, 2], [1, 0, 1]], dtype=paddle.float32) + # label elements in {1., -1.} + label = paddle.to_tensor([[-1, 1, -1], [1, 1, 1], [1, -1, 1]], dtype=paddle.float32) + + hinge_embedding_loss = nn.HingeEmbeddingLoss(delta=1.0, reduction='none') + loss = hinge_embedding_loss(input, label) + print(loss) + # Tensor([[0., -2., 0.], + # [0., -1., 2.], + # [1., 1., 1.]]) - input_np = np.random.random(size=(10, 10, 5)).astype(np.float32) - # get label with elements in {1., -1.} - label_np = 2 * np.random.randint(0, 2, size=(10, 10, 5)) - 1. - input = paddle.to_tensor(input_np) - label = paddle.to_tensor(label_np, dtype=paddle.float32) - loss = F.hinge_embedding_loss(input, label, delta=1.0, reduction='mean') + hinge_embedding_loss = nn.HingeEmbeddingLoss(delta=1.0, reduction='mean') + loss = hinge_embedding_loss(input, label) print(loss) + # Tensor([0.22222222]) """ if reduction not in ['sum', 'mean', 'none']: diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index ae6383a392c86..b1177967723ae 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -1267,17 +1267,24 @@ class HingeEmbeddingLoss(Layer): .. code-block:: python import paddle - import bumpy as np + import numpy as np import paddle.nn as nn - input_np = np.random.random(size=(10, 10, 5)).astype(np.float32) - # get label with elements in {1., -1.} - label_np = 2 * np.random.randint(0, 2, size=(10, 10, 5)) - 1. - input = paddle.to_tensor(input_np) - label = paddle.to_tensor(label_np, dtype=paddle.float32) + input = paddle.to_tensor([[1, -2, 3], [0, -1, 2], [1, 0, 1]], dtype=paddle.float32) + # label elements in {1., -1.} + label = paddle.to_tensor([[-1, 1, -1], [1, 1, 1], [1, -1, 1]], dtype=paddle.float32) + + hinge_embedding_loss = nn.HingeEmbeddingLoss(delta=1.0, reduction='none') + loss = hinge_embedding_loss(input, label) + print(loss) + # Tensor([[0., -2., 0.], + # [0., -1., 2.], + # [1., 1., 1.]]) + hinge_embedding_loss = nn.HingeEmbeddingLoss(delta=1.0, reduction='mean') loss = hinge_embedding_loss(input, label) print(loss) + # Tensor([0.22222222]) """ def __init__(self, delta=1.0, reduction="mean", name=None): From 2988c7641692667dfe6be0a675b817214b14bd2f Mon Sep 17 00:00:00 2001 From: skrBang Date: Thu, 25 Nov 2021 22:22:45 +0800 Subject: [PATCH 10/23] unique -> flatten --- python/paddle/nn/functional/loss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index dfd2c86f841a5..dce4b71317c44 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -2144,14 +2144,14 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): check_variable_and_dtype(label, 'label', ['float32', 'float64'], 'hinge_embedding_loss') - if set(label.unique().numpy()) <= {1., -1.}: + if set(label.flatten().numpy()) <= {1., -1.}: loss = paddle.where( label == 1., input, paddle.maximum(paddle.to_tensor(0.), delta - input)) else: raise ValueError("'label' should contain 1. 
or -1., " "but received label containing {}.".format( - label.unique().numpy())) + label.flatten().numpy())) if reduction == 'mean': return paddle.mean(loss, name=name) From 04cf9857907d4ab5d92f065edd9e9d22173646e7 Mon Sep 17 00:00:00 2001 From: skrBang Date: Fri, 26 Nov 2021 09:34:45 +0800 Subject: [PATCH 11/23] fix api code --- python/paddle/nn/functional/loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index dce4b71317c44..5dd23902c804f 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -2151,7 +2151,7 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): else: raise ValueError("'label' should contain 1. or -1., " "but received label containing {}.".format( - label.flatten().numpy())) + set(label.flatten().numpy()))) if reduction == 'mean': return paddle.mean(loss, name=name) From a3bfd3e4362d0f901dc1ae8502f311af54cef60d Mon Sep 17 00:00:00 2001 From: skrBang Date: Fri, 26 Nov 2021 14:54:30 +0800 Subject: [PATCH 12/23] fix English doc --- python/paddle/nn/functional/loss.py | 34 ++++++++++++++--------------- python/paddle/nn/layer/loss.py | 32 +++++++++++++-------------- 2 files changed, 32 insertions(+), 34 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 5dd23902c804f..a2e26c913fcde 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -2055,11 +2055,9 @@ def sigmoid_focal_loss(logit, def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): r""" - This operator calculates hinge_embedding_loss. Measures the loss given an input - tensor :math:`x` and a labels tensor :math:`y`(containing 1 or -1). - This is usually used for measuring whether two inputs are similar or - dissimilar, e.g. using the L1 pairwise distance as :math:`x`, and is typically - used for learning nonlinear embeddings or semi-supervised learning. + This operator calculates hinge_embedding_loss. Measures the loss given an input tensor :math:`x` and a labels tensor :math:`y`(containing 1 or -1). + This is usually used for measuring whether two inputs are similar or dissimilar, e.g. using the L1 pairwise distance as :math:`x`, + and is typically used for learning nonlinear embeddings or semi-supervised learning. The loss function for :math:`n`-th sample in the mini-batch is @@ -2080,11 +2078,7 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): where :math:`L = \{l_1,\dots,l_N\}^\top`. Parameters: - input (Tensor): Input tensor, the data type is float32 or float64. Shape is - (N, C), where C is number of classes, and if shape is more than 2D, this - is (N, C, D1, D2,..., Dk), k >= 1. - label (Tensor): Label tensor containing 1 or -1, the data type is float32 or float64. - The shape of labelis the same as the shape of input. + delta (float, optional): Specifies the hyperparameter delta to be used. The value determines how large the input need to be to calculate in hinge_embedding_loss. When label is -1, Input smaller than delta are minimized with hinge_embedding_loss. @@ -2095,25 +2089,31 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`reduction` is ``'sum'``, the summed loss is returned. Default: ``'mean'`` - name (str, optional): Name for the operation (optional, default is - None). 
For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Call Parameters: + + input (Tensor): Input tensor, the data type is float32 or float64. Shape is (N, C), where C is number of classes, and if shape is more than 2D, this is (N, C, D1, D2,..., Dk), k >= 1. + + label (Tensor): Label tensor containing 1 or -1, the data type is float32 or float64. The shape of label is the same as the shape of input. Shape: - input: N-D Tensor, the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, - available dtype is float32, float64.. The sum operationoperates over all the elements. + + input: N-D Tensor, the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, available dtype is float32, float64. The sum operationoperates over all the elements. + label: N-D Tensor, same shape as the input. + output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input. Returns: - The tensor variable storing the hinge_embedding_loss of input and label. - Return type: Tensor. + Tensor, The tensor variable storing the hinge_embedding_loss of input and label. Examples: .. code-block:: python import paddle - import numpy as np import paddle.nn as nn input = paddle.to_tensor([[1, -2, 3], [0, -1, 2], [1, 0, 1]], dtype=paddle.float32) diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index b1177967723ae..c9330ec61ef25 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -1207,11 +1207,9 @@ def forward(self, input, label): class HingeEmbeddingLoss(Layer): r""" - This operator calculates hinge_embedding_loss. Measures the loss given an input - tensor :math:`x` and a labels tensor :math:`y`(containing 1 or -1). - This is usually used for measuring whether two inputs are similar or - dissimilar, e.g. using the L1 pairwise distance as :math:`x`, and is typically - used for learning nonlinear embeddings or semi-supervised learning. + This operator calculates hinge_embedding_loss. Measures the loss given an input tensor :math:`x` and a labels tensor :math:`y`(containing 1 or -1). + This is usually used for measuring whether two inputs are similar or dissimilar, e.g. using the L1 pairwise distance as :math:`x`, + and is typically used for learning nonlinear embeddings or semi-supervised learning. The loss function for :math:`n`-th sample in the mini-batch is @@ -1232,6 +1230,7 @@ class HingeEmbeddingLoss(Layer): where :math:`L = \{l_1,\dots,l_N\}^\top`. Parameters: + delta (float, optional): Specifies the hyperparameter delta to be used. The value determines how large the input need to be to calculate in hinge_embedding_loss. When label is -1, Input smaller than delta are minimized with hinge_embedding_loss. @@ -1242,32 +1241,31 @@ class HingeEmbeddingLoss(Layer): If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`reduction` is ``'sum'``, the summed loss is returned. Default: ``'mean'`` - name (str, optional): Name for the operation (optional, default is - None). For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. Call Parameters: - input (Tensor): Input tensor, the data type is float32 or float64. 
Shape is - (N, C), where C is number of classes, and if shape is more than 2D, this - is (N, C, D1, D2,..., Dk), k >= 1. - label (Tensor): Label tensor containing 1 or -1, the data type is float32 or float64. - The shape of labelis the same as the shape of input. + + input (Tensor): Input tensor, the data type is float32 or float64. Shape is (N, C), where C is number of classes, and if shape is more than 2D, this is (N, C, D1, D2,..., Dk), k >= 1. + + label (Tensor): Label tensor containing 1 or -1, the data type is float32 or float64. The shape of label is the same as the shape of input. Shape: - input: N-D Tensor, the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, - available dtype is float32, float64.. The sum operationoperates over all the elements. + + input: N-D Tensor, the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, available dtype is float32, float64. The sum operationoperates over all the elements. + label: N-D Tensor, same shape as the input. + output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input. Returns: - The tensor variable storing the hinge_embedding_loss of input and label. - Return type: Tensor. + Tensor, The tensor variable storing the hinge_embedding_loss of input and label. Examples: .. code-block:: python import paddle - import numpy as np import paddle.nn as nn input = paddle.to_tensor([[1, -2, 3], [0, -1, 2], [1, 0, 1]], dtype=paddle.float32) From 562aa9cdd3c118b837c17923a96c11fcce72c117 Mon Sep 17 00:00:00 2001 From: skrBang Date: Fri, 26 Nov 2021 15:21:27 +0800 Subject: [PATCH 13/23] fix functional loss English doc --- python/paddle/nn/functional/loss.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index a2e26c913fcde..3a0daabd08f19 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -2078,12 +2078,15 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): where :math:`L = \{l_1,\dots,l_N\}^\top`. Parameters: - + input (Tensor): Input tensor, the data type is float32 or float64. + the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, available dtype is float32, float64. + label (Tensor): Label tensor containing 1 or -1, the data type is float32 or float64. + The shape of label is the same as the shape of input. delta (float, optional): Specifies the hyperparameter delta to be used. The value determines how large the input need to be to calculate in hinge_embedding_loss. When label is -1, Input smaller than delta are minimized with hinge_embedding_loss. Default = 1.0 - reduction (str, optional): Indicate how to average the loss by batch_size, + reduction (str, optional): Indicate how to average the loss by batch_size. the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; @@ -2092,23 +2095,16 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - Call Parameters: - - input (Tensor): Input tensor, the data type is float32 or float64. 
Shape is (N, C), where C is number of classes, and if shape is more than 2D, this is (N, C, D1, D2,..., Dk), k >= 1. - - label (Tensor): Label tensor containing 1 or -1, the data type is float32 or float64. The shape of label is the same as the shape of input. - Shape: input: N-D Tensor, the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, available dtype is float32, float64. The sum operationoperates over all the elements. - label: N-D Tensor, same shape as the input. + label: N-D Tensor, same shape as the input. tensor elements should containing 1 or -1, the data type is float32 or float64. output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input. Returns: - - Tensor, The tensor variable storing the hinge_embedding_loss of input and label. + Tensor. The tensor variable storing the hinge_embedding_loss of input and label. Examples: .. code-block:: python From c658354a1987fbe2e2065844dbaa612bdfdb89b3 Mon Sep 17 00:00:00 2001 From: skrBang Date: Fri, 26 Nov 2021 15:26:22 +0800 Subject: [PATCH 14/23] fix Example doc --- python/paddle/nn/functional/loss.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 3a0daabd08f19..11e622ef61e0c 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -2110,21 +2110,19 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): .. code-block:: python import paddle - import paddle.nn as nn + import paddle.nn.functional as F input = paddle.to_tensor([[1, -2, 3], [0, -1, 2], [1, 0, 1]], dtype=paddle.float32) # label elements in {1., -1.} label = paddle.to_tensor([[-1, 1, -1], [1, 1, 1], [1, -1, 1]], dtype=paddle.float32) - hinge_embedding_loss = nn.HingeEmbeddingLoss(delta=1.0, reduction='none') - loss = hinge_embedding_loss(input, label) + loss = F.hinge_embedding_loss(input, label, delta=1.0, reduction='none') print(loss) # Tensor([[0., -2., 0.], # [0., -1., 2.], # [1., 1., 1.]]) - hinge_embedding_loss = nn.HingeEmbeddingLoss(delta=1.0, reduction='mean') - loss = hinge_embedding_loss(input, label) + loss = F.hinge_embedding_loss(input, label, delta=1.0, reduction='mean') print(loss) # Tensor([0.22222222]) """ From b47d4119ddf9125fa3b30ed24615ff49d3af4bcc Mon Sep 17 00:00:00 2001 From: skrBang Date: Thu, 2 Dec 2021 17:10:32 +0800 Subject: [PATCH 15/23] .numpy() -> paddle.unique() --- python/paddle/nn/functional/loss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 11e622ef61e0c..ad2f806696d25 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -2138,14 +2138,14 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): check_variable_and_dtype(label, 'label', ['float32', 'float64'], 'hinge_embedding_loss') - if set(label.flatten().numpy()) <= {1., -1.}: + if {i.item() for i in paddle.unique(label)} <= {-1., 1.}: loss = paddle.where( label == 1., input, paddle.maximum(paddle.to_tensor(0.), delta - input)) else: raise ValueError("'label' should contain 1. 
or -1., " "but received label containing {}.".format( - set(label.flatten().numpy()))) + paddle.unique(label))) if reduction == 'mean': return paddle.mean(loss, name=name) From ebbe89e02e125402057f70d8281f82b50ef5ba02 Mon Sep 17 00:00:00 2001 From: skrBang Date: Thu, 2 Dec 2021 20:07:44 +0800 Subject: [PATCH 16/23] fix unique --- python/paddle/nn/functional/loss.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index ad2f806696d25..0eb116498ac8c 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -2138,14 +2138,16 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): check_variable_and_dtype(label, 'label', ['float32', 'float64'], 'hinge_embedding_loss') - if {i.item() for i in paddle.unique(label)} <= {-1., 1.}: + label_item_set = {i.item() for i in label.flatten()} + + if label_item_set <= {-1., 1.}: loss = paddle.where( label == 1., input, paddle.maximum(paddle.to_tensor(0.), delta - input)) else: raise ValueError("'label' should contain 1. or -1., " "but received label containing {}.".format( - paddle.unique(label))) + label_item_set)) if reduction == 'mean': return paddle.mean(loss, name=name) From aa9f9c667e8c88f9f3ab114465473c2173d86491 Mon Sep 17 00:00:00 2001 From: skrBang Date: Sun, 5 Dec 2021 09:38:16 +0800 Subject: [PATCH 17/23] fix label_item_set --- python/paddle/nn/functional/loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 0eb116498ac8c..be71d561dcea9 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -2138,7 +2138,7 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): check_variable_and_dtype(label, 'label', ['float32', 'float64'], 'hinge_embedding_loss') - label_item_set = {i.item() for i in label.flatten()} + label_item_set = {i.item() for i in label.cpu().flatten()} if label_item_set <= {-1., 1.}: loss = paddle.where( From c0b31b3b7d64c3831f4939d232fb897fae9a68e0 Mon Sep 17 00:00:00 2001 From: skrBang Date: Sun, 5 Dec 2021 10:46:42 +0800 Subject: [PATCH 18/23] modified judgment equation --- .../unittests/test_hinge_embedding_loss.py | 47 ++++++------------- python/paddle/nn/functional/loss.py | 13 ++--- 2 files changed, 17 insertions(+), 43 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py index 0e1a8cff750ea..1e982f9890daf 100644 --- a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py +++ b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py @@ -36,24 +36,24 @@ def run_dynamic_check(self): label = paddle.to_tensor(self.label_np, dtype=paddle.float32) dy_result = paddle.nn.functional.hinge_embedding_loss(input, label) expected = np.mean( - np.where(label.numpy() == 1., - input.numpy(), np.maximum(0., self.delta - input.numpy()))) + np.where(label.numpy() == -1., + np.maximum(0., self.delta - input.numpy()), input.numpy())) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, [1]) dy_result = paddle.nn.functional.hinge_embedding_loss( input, label, reduction='sum') expected = np.sum( - np.where(label.numpy() == 1., - input.numpy(), np.maximum(0., self.delta - input.numpy()))) + np.where(label.numpy() == -1., + np.maximum(0., self.delta - input.numpy()), 
input.numpy())) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, [1]) dy_result = paddle.nn.functional.hinge_embedding_loss( input, label, reduction='none') - expected = np.where(label.numpy() == 1., - input.numpy(), - np.maximum(0., self.delta - input.numpy())) + expected = np.where(label.numpy() == -1., + np.maximum(0., self.delta - input.numpy()), + input.numpy()) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, self.shape) @@ -76,15 +76,6 @@ def test_value_error(): self.assertRaises(ValueError, test_value_error) - def test_label_errors(self): - paddle.disable_static() - - def test_value_error(): - loss = paddle.nn.functional.hinge_embedding_loss( - paddle.to_tensor(self.input_np), self.wrong_label) - - self.assertRaises(ValueError, test_value_error) - class TestClassHingeEmbeddingLoss(unittest.TestCase): def setUp(self): @@ -102,8 +93,8 @@ def run_dynamic_check(self): hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss() dy_result = hinge_embedding_loss(input, label) expected = np.mean( - np.where(label.numpy() == 1., - input.numpy(), np.maximum(0., self.delta - input.numpy()))) + np.where(label.numpy() == -1., + np.maximum(0., self.delta - input.numpy()), input.numpy())) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, [1]) @@ -111,17 +102,17 @@ def run_dynamic_check(self): reduction='sum') dy_result = hinge_embedding_loss(input, label) expected = np.sum( - np.where(label.numpy() == 1., - input.numpy(), np.maximum(0., self.delta - input.numpy()))) + np.where(label.numpy() == -1., + np.maximum(0., self.delta - input.numpy()), input.numpy())) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, [1]) hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( reduction='none') dy_result = hinge_embedding_loss(input, label) - expected = np.where(label.numpy() == 1., - input.numpy(), - np.maximum(0., self.delta - input.numpy())) + expected = np.where(label.numpy() == -1., + np.maximum(0., self.delta - input.numpy()), + input.numpy()) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, self.shape) @@ -145,16 +136,6 @@ def test_value_error(): self.assertRaises(ValueError, test_value_error) - def test_label_errors(self): - paddle.disable_static() - - def test_value_error(): - hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss() - loss = hinge_embedding_loss( - paddle.to_tensor(self.input_np), self.wrong_label) - - self.assertRaises(ValueError, test_value_error) - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index be71d561dcea9..1176b760a8f58 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -2138,16 +2138,9 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): check_variable_and_dtype(label, 'label', ['float32', 'float64'], 'hinge_embedding_loss') - label_item_set = {i.item() for i in label.cpu().flatten()} - - if label_item_set <= {-1., 1.}: - loss = paddle.where( - label == 1., input, - paddle.maximum(paddle.to_tensor(0.), delta - input)) - else: - raise ValueError("'label' should contain 1. 
or -1., " - "but received label containing {}.".format( - label_item_set)) + loss = paddle.where(label == -1., + paddle.maximum(paddle.to_tensor(0.), delta - input), + input) if reduction == 'mean': return paddle.mean(loss, name=name) From f7c49b789aa41afbfaae1600c9baf8935e704e30 Mon Sep 17 00:00:00 2001 From: skrBang Date: Mon, 6 Dec 2021 20:49:28 +0800 Subject: [PATCH 19/23] Got a beautiful loss equation --- python/paddle/nn/functional/loss.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 1176b760a8f58..c5e68a2bfc803 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -2138,9 +2138,9 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): check_variable_and_dtype(label, 'label', ['float32', 'float64'], 'hinge_embedding_loss') - loss = paddle.where(label == -1., - paddle.maximum(paddle.to_tensor(0.), delta - input), - input) + zero = fluid.dygraph.base.to_variable([0.], dtype=paddle.float32) + loss = paddle.where(label == 1, input, zero) + \ + paddle.where(label == -1, delta - input, zero) if reduction == 'mean': return paddle.mean(loss, name=name) From 3e94ec46333d160e317f340be514057634f84d77 Mon Sep 17 00:00:00 2001 From: skrBang Date: Tue, 7 Dec 2021 10:51:51 +0800 Subject: [PATCH 20/23] use paddle.to_tensor --- python/paddle/nn/functional/loss.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index c5e68a2bfc803..2f7cf5b760b48 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -2138,9 +2138,8 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): check_variable_and_dtype(label, 'label', ['float32', 'float64'], 'hinge_embedding_loss') - zero = fluid.dygraph.base.to_variable([0.], dtype=paddle.float32) - loss = paddle.where(label == 1, input, zero) + \ - paddle.where(label == -1, delta - input, zero) + loss = paddle.where(label == 1., input, paddle.to_tensor(0.)) + \ + paddle.where(label == -1., delta - input, paddle.to_tensor(0.)) if reduction == 'mean': return paddle.mean(loss, name=name) From b6137660268e0ddcdb2db9463c9e7006060eff10 Mon Sep 17 00:00:00 2001 From: skrBang Date: Tue, 7 Dec 2021 16:50:24 +0800 Subject: [PATCH 21/23] fix loss and add static check --- .../unittests/test_hinge_embedding_loss.py | 112 ++++++++++++------ python/paddle/nn/functional/loss.py | 5 +- 2 files changed, 81 insertions(+), 36 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py index 1e982f9890daf..fe2a5b19047f6 100644 --- a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py +++ b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py @@ -17,56 +17,84 @@ import paddle import numpy as np import unittest +from paddle.static import Program, program_guard np.random.seed(42) +def calc_hinge_embedding_loss(input, label, delta=1.0, reduction='mean'): + result = np.where(label == -1., np.maximum(0., delta - input), 0.) + \ + np.where(label == 1., input, 0.) 
+ if reduction == 'none': + return result + elif reduction == 'sum': + return np.sum(result) + elif reduction == 'mean': + return np.mean(result) + + class TestFunctionalHingeEmbeddingLoss(unittest.TestCase): def setUp(self): self.delta = 1.0 self.shape = (10, 10, 5) - self.input_np = np.random.random(size=self.shape).astype(np.float32) + self.input_np = np.random.random(size=self.shape).astype(np.float64) # get label elem in {1., -1.} self.label_np = 2 * np.random.randint(0, 2, size=self.shape) - 1. # get wrong label elem not in {1., -1.} self.wrong_label = paddle.randint(-3, 3, shape=self.shape) - def run_dynamic_check(self): + def run_dynamic_check(self, place=paddle.CPUPlace()): + paddle.disable_static(place=place) input = paddle.to_tensor(self.input_np) - label = paddle.to_tensor(self.label_np, dtype=paddle.float32) + label = paddle.to_tensor(self.label_np, dtype=paddle.float64) + dy_result = paddle.nn.functional.hinge_embedding_loss(input, label) - expected = np.mean( - np.where(label.numpy() == -1., - np.maximum(0., self.delta - input.numpy()), input.numpy())) + expected = calc_hinge_embedding_loss(self.input_np, self.label_np) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, [1]) dy_result = paddle.nn.functional.hinge_embedding_loss( input, label, reduction='sum') - expected = np.sum( - np.where(label.numpy() == -1., - np.maximum(0., self.delta - input.numpy()), input.numpy())) + expected = calc_hinge_embedding_loss( + self.input_np, self.label_np, reduction='sum') self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, [1]) dy_result = paddle.nn.functional.hinge_embedding_loss( input, label, reduction='none') - expected = np.where(label.numpy() == -1., - np.maximum(0., self.delta - input.numpy()), - input.numpy()) + expected = calc_hinge_embedding_loss( + self.input_np, self.label_np, reduction='none') self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, self.shape) + def run_static_check(self, place=paddle.CPUPlace): + paddle.enable_static() + for reduction in ['none', 'mean', 'sum']: + expected = calc_hinge_embedding_loss( + self.input_np, self.label_np, reduction=reduction) + with program_guard(Program(), Program()): + input = paddle.static.data( + name="input", shape=self.shape, dtype=paddle.float64) + label = paddle.static.data( + name="label", shape=self.shape, dtype=paddle.float64) + st_result = paddle.nn.functional.hinge_embedding_loss( + input, label, reduction=reduction) + exe = paddle.static.Executor(place) + result_numpy, = exe.run( + feed={"input": self.input_np, + "label": self.label_np}, + fetch_list=[st_result]) + self.assertTrue(np.allclose(result_numpy, expected)) + def test_cpu(self): - paddle.disable_static(place=paddle.CPUPlace()) - self.run_dynamic_check() + self.run_dynamic_check(place=paddle.CPUPlace()) + self.run_static_check(place=paddle.CPUPlace()) def test_gpu(self): if not paddle.is_compiled_with_cuda(): return - - paddle.disable_static(place=paddle.CUDAPlace(0)) - self.run_dynamic_check() + self.run_dynamic_check(place=paddle.CUDAPlace(0)) + self.run_static_check(place=paddle.CUDAPlace(0)) # test case the raise message def test_reduce_errors(self): @@ -81,51 +109,67 @@ class TestClassHingeEmbeddingLoss(unittest.TestCase): def setUp(self): self.delta = 1.0 self.shape = (10, 10, 5) - self.input_np = np.random.random(size=self.shape).astype(np.float32) + self.input_np = np.random.random(size=self.shape).astype(np.float64) # get label elem in 
{1., -1.} self.label_np = 2 * np.random.randint(0, 2, size=self.shape) - 1. # get wrong label elem not in {1., -1.} self.wrong_label = paddle.randint(-3, 3, shape=self.shape) - def run_dynamic_check(self): + def run_dynamic_check(self, place=paddle.CPUPlace()): + paddle.disable_static(place=place) input = paddle.to_tensor(self.input_np) - label = paddle.to_tensor(self.label_np, dtype=paddle.float32) + label = paddle.to_tensor(self.label_np, dtype=paddle.float64) hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss() dy_result = hinge_embedding_loss(input, label) - expected = np.mean( - np.where(label.numpy() == -1., - np.maximum(0., self.delta - input.numpy()), input.numpy())) + expected = calc_hinge_embedding_loss(self.input_np, self.label_np) self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, [1]) hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( reduction='sum') dy_result = hinge_embedding_loss(input, label) - expected = np.sum( - np.where(label.numpy() == -1., - np.maximum(0., self.delta - input.numpy()), input.numpy())) + expected = calc_hinge_embedding_loss( + self.input_np, self.label_np, reduction='sum') self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, [1]) hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( reduction='none') dy_result = hinge_embedding_loss(input, label) - expected = np.where(label.numpy() == -1., - np.maximum(0., self.delta - input.numpy()), - input.numpy()) + expected = calc_hinge_embedding_loss( + self.input_np, self.label_np, reduction='none') self.assertTrue(np.allclose(dy_result.numpy(), expected)) self.assertTrue(dy_result.shape, self.shape) + def run_static_check(self, place=paddle.CPUPlace): + paddle.enable_static() + for reduction in ['none', 'mean', 'sum']: + expected = calc_hinge_embedding_loss( + self.input_np, self.label_np, reduction=reduction) + with program_guard(Program(), Program()): + input = paddle.static.data( + name="input", shape=self.shape, dtype=paddle.float64) + label = paddle.static.data( + name="label", shape=self.shape, dtype=paddle.float64) + hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( + reduction=reduction) + st_result = hinge_embedding_loss(input, label) + exe = paddle.static.Executor(place) + result_numpy, = exe.run( + feed={"input": self.input_np, + "label": self.label_np}, + fetch_list=[st_result]) + self.assertTrue(np.allclose(result_numpy, expected)) + def test_cpu(self): - paddle.disable_static(place=paddle.CPUPlace()) - self.run_dynamic_check() + self.run_dynamic_check(place=paddle.CPUPlace()) + self.run_static_check(place=paddle.CPUPlace()) def test_gpu(self): if not paddle.is_compiled_with_cuda(): return - - paddle.disable_static(place=paddle.CUDAPlace(0)) - self.run_dynamic_check() + self.run_dynamic_check(place=paddle.CUDAPlace(0)) + self.run_static_check(place=paddle.CUDAPlace(0)) # test case the raise message def test_reduce_errors(self): diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 2f7cf5b760b48..c76baa10615b9 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -2138,8 +2138,9 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): check_variable_and_dtype(label, 'label', ['float32', 'float64'], 'hinge_embedding_loss') - loss = paddle.where(label == 1., input, paddle.to_tensor(0.)) + \ - paddle.where(label == -1., delta - input, paddle.to_tensor(0.)) + zero_ = paddle.zeros([1], 
dtype=input.dtype) + loss = paddle.where(label == 1., input, zero_) + \ + paddle.where(label == -1., paddle.nn.functional.relu(delta - input), zero_) if reduction == 'mean': return paddle.mean(loss, name=name) From 0e323aa59d464c353635d23646c0de4515596be5 Mon Sep 17 00:00:00 2001 From: skrBang Date: Tue, 7 Dec 2021 16:51:48 +0800 Subject: [PATCH 22/23] fix loss and add static check --- .../paddle/fluid/tests/unittests/test_hinge_embedding_loss.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py index fe2a5b19047f6..fcd227a3e89c3 100644 --- a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py +++ b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py @@ -40,8 +40,6 @@ def setUp(self): self.input_np = np.random.random(size=self.shape).astype(np.float64) # get label elem in {1., -1.} self.label_np = 2 * np.random.randint(0, 2, size=self.shape) - 1. - # get wrong label elem not in {1., -1.} - self.wrong_label = paddle.randint(-3, 3, shape=self.shape) def run_dynamic_check(self, place=paddle.CPUPlace()): paddle.disable_static(place=place) @@ -112,8 +110,6 @@ def setUp(self): self.input_np = np.random.random(size=self.shape).astype(np.float64) # get label elem in {1., -1.} self.label_np = 2 * np.random.randint(0, 2, size=self.shape) - 1. - # get wrong label elem not in {1., -1.} - self.wrong_label = paddle.randint(-3, 3, shape=self.shape) def run_dynamic_check(self, place=paddle.CPUPlace()): paddle.disable_static(place=place) From f92f098898fb2c99f1664ed432acc1a6e24eb543 Mon Sep 17 00:00:00 2001 From: skrBang Date: Tue, 14 Dec 2021 13:24:24 +0800 Subject: [PATCH 23/23] delta -> margin --- .../tests/unittests/test_hinge_embedding_loss.py | 8 ++++---- python/paddle/nn/functional/loss.py | 12 ++++++------ python/paddle/nn/layer/loss.py | 14 +++++++------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py index fcd227a3e89c3..91c1b45cbca41 100644 --- a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py +++ b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py @@ -22,8 +22,8 @@ np.random.seed(42) -def calc_hinge_embedding_loss(input, label, delta=1.0, reduction='mean'): - result = np.where(label == -1., np.maximum(0., delta - input), 0.) + \ +def calc_hinge_embedding_loss(input, label, margin=1.0, reduction='mean'): + result = np.where(label == -1., np.maximum(0., margin - input), 0.) + \ np.where(label == 1., input, 0.) 
if reduction == 'none': return result @@ -35,7 +35,7 @@ def calc_hinge_embedding_loss(input, label, delta=1.0, reduction='mean'): class TestFunctionalHingeEmbeddingLoss(unittest.TestCase): def setUp(self): - self.delta = 1.0 + self.margin = 1.0 self.shape = (10, 10, 5) self.input_np = np.random.random(size=self.shape).astype(np.float64) # get label elem in {1., -1.} @@ -105,7 +105,7 @@ def test_value_error(): class TestClassHingeEmbeddingLoss(unittest.TestCase): def setUp(self): - self.delta = 1.0 + self.margin = 1.0 self.shape = (10, 10, 5) self.input_np = np.random.random(size=self.shape).astype(np.float64) # get label elem in {1., -1.} diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index c76baa10615b9..328eb07b5e960 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -2053,7 +2053,7 @@ def sigmoid_focal_loss(logit, return loss -def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): +def hinge_embedding_loss(input, label, margin=1.0, reduction='mean', name=None): r""" This operator calculates hinge_embedding_loss. Measures the loss given an input tensor :math:`x` and a labels tensor :math:`y`(containing 1 or -1). This is usually used for measuring whether two inputs are similar or dissimilar, e.g. using the L1 pairwise distance as :math:`x`, @@ -2082,9 +2082,9 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, available dtype is float32, float64. label (Tensor): Label tensor containing 1 or -1, the data type is float32 or float64. The shape of label is the same as the shape of input. - delta (float, optional): Specifies the hyperparameter delta to be used. + margin (float, optional): Specifies the hyperparameter margin to be used. The value determines how large the input need to be to calculate in - hinge_embedding_loss. When label is -1, Input smaller than delta are minimized with hinge_embedding_loss. + hinge_embedding_loss. When label is -1, Input smaller than margin are minimized with hinge_embedding_loss. Default = 1.0 reduction (str, optional): Indicate how to average the loss by batch_size. the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. 
@@ -2116,13 +2116,13 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): # label elements in {1., -1.} label = paddle.to_tensor([[-1, 1, -1], [1, 1, 1], [1, -1, 1]], dtype=paddle.float32) - loss = F.hinge_embedding_loss(input, label, delta=1.0, reduction='none') + loss = F.hinge_embedding_loss(input, label, margin=1.0, reduction='none') print(loss) # Tensor([[0., -2., 0.], # [0., -1., 2.], # [1., 1., 1.]]) - loss = F.hinge_embedding_loss(input, label, delta=1.0, reduction='mean') + loss = F.hinge_embedding_loss(input, label, margin=1.0, reduction='mean') print(loss) # Tensor([0.22222222]) """ @@ -2140,7 +2140,7 @@ def hinge_embedding_loss(input, label, delta=1.0, reduction='mean', name=None): zero_ = paddle.zeros([1], dtype=input.dtype) loss = paddle.where(label == 1., input, zero_) + \ - paddle.where(label == -1., paddle.nn.functional.relu(delta - input), zero_) + paddle.where(label == -1., paddle.nn.functional.relu(margin - input), zero_) if reduction == 'mean': return paddle.mean(loss, name=name) diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index c9330ec61ef25..9da41f26969c8 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -1231,9 +1231,9 @@ class HingeEmbeddingLoss(Layer): Parameters: - delta (float, optional): Specifies the hyperparameter delta to be used. + margin (float, optional): Specifies the hyperparameter margin to be used. The value determines how large the input need to be to calculate in - hinge_embedding_loss. When label is -1, Input smaller than delta are minimized with hinge_embedding_loss. + hinge_embedding_loss. When label is -1, Input smaller than margin are minimized with hinge_embedding_loss. Default = 1.0 reduction (str, optional): Indicate how to average the loss by batch_size, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. @@ -1272,22 +1272,22 @@ class HingeEmbeddingLoss(Layer): # label elements in {1., -1.} label = paddle.to_tensor([[-1, 1, -1], [1, 1, 1], [1, -1, 1]], dtype=paddle.float32) - hinge_embedding_loss = nn.HingeEmbeddingLoss(delta=1.0, reduction='none') + hinge_embedding_loss = nn.HingeEmbeddingLoss(margin=1.0, reduction='none') loss = hinge_embedding_loss(input, label) print(loss) # Tensor([[0., -2., 0.], # [0., -1., 2.], # [1., 1., 1.]]) - hinge_embedding_loss = nn.HingeEmbeddingLoss(delta=1.0, reduction='mean') + hinge_embedding_loss = nn.HingeEmbeddingLoss(margin=1.0, reduction='mean') loss = hinge_embedding_loss(input, label) print(loss) # Tensor([0.22222222]) """ - def __init__(self, delta=1.0, reduction="mean", name=None): + def __init__(self, margin=1.0, reduction="mean", name=None): super(HingeEmbeddingLoss, self).__init__() - self.delta = delta + self.margin = margin self.reduction = reduction self.name = name @@ -1296,5 +1296,5 @@ def forward(self, input, label): input, label, reduction=self.reduction, - delta=self.delta, + margin=self.margin, name=self.name)
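A minimal end-to-end usage sketch of the API as it stands after this final patch, where `margin` replaces the earlier `delta` keyword. This is illustrative only and assumes a Paddle build with these patches applied; the tensor values below are made up, and the NumPy reference mirrors the `calc_hinge_embedding_loss` helper from the unit tests above.

    import numpy as np
    import paddle
    import paddle.nn.functional as F

    paddle.disable_static()

    # Input scores and labels restricted to {1., -1.}, as in the unit tests.
    input = paddle.to_tensor([[0.1, 0.6, 0.9], [0.3, 0.2, 0.8]], dtype='float32')
    label = paddle.to_tensor([[1., -1., 1.], [-1., 1., -1.]], dtype='float32')

    # Functional form: loss is x where label == 1 and relu(margin - x) where label == -1.
    loss_none = F.hinge_embedding_loss(input, label, margin=1.0, reduction='none')
    loss_mean = F.hinge_embedding_loss(input, label, margin=1.0, reduction='mean')

    # Layer form, as exercised in test_hinge_embedding_loss.py.
    hinge_loss = paddle.nn.loss.HingeEmbeddingLoss(margin=1.0, reduction='sum')
    loss_sum = hinge_loss(input, label)

    # NumPy reference, matching the calc_hinge_embedding_loss helper in the tests.
    x, y = input.numpy(), label.numpy()
    expected = np.where(y == -1., np.maximum(0., 1.0 - x), 0.) + np.where(y == 1., x, 0.)
    assert np.allclose(loss_none.numpy(), expected)
    assert np.allclose(loss_mean.numpy(), expected.mean())
    assert np.allclose(loss_sum.numpy(), expected.sum())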