diff --git a/python/mxnet/gluon/loss.py b/python/mxnet/gluon/loss.py
index 7b5832e1ace6..29d0105ae8dd 100644
--- a/python/mxnet/gluon/loss.py
+++ b/python/mxnet/gluon/loss.py
@@ -99,11 +99,11 @@ def hybrid_forward(self, F, x, *args, **kwargs):
 
 
 class L2Loss(Loss):
-    r"""Calculates the mean squared error between `pred` and `label`.
+    r"""Calculates the mean squared error between `label` and `pred`.
 
-    .. math:: L = \frac{1}{2} \sum_i \vert {pred}_i - {label}_i \vert^2.
+    .. math:: L = \frac{1}{2} \sum_i \vert {label}_i - {pred}_i \vert^2.
 
-    `pred` and `label` can have arbitrary shape as long as they have the same
+    `label` and `pred` can have arbitrary shape as long as they have the same
     number of elements.
 
     Parameters
@@ -131,17 +131,17 @@ def __init__(self, weight=1., batch_axis=0, **kwargs):
 
     def hybrid_forward(self, F, pred, label, sample_weight=None):
         label = _reshape_like(F, label, pred)
-        loss = F.square(pred - label)
+        loss = F.square(label - pred)
         loss = _apply_weighting(F, loss, self._weight/2, sample_weight)
         return F.mean(loss, axis=self._batch_axis, exclude=True)
 
 
 class L1Loss(Loss):
-    r"""Calculates the mean absolute error between `pred` and `label`.
+    r"""Calculates the mean absolute error between `label` and `pred`.
 
-    .. math:: L = \sum_i \vert {pred}_i - {label}_i \vert.
+    .. math:: L = \sum_i \vert {label}_i - {pred}_i \vert.
 
-    `pred` and `label` can have arbitrary shape as long as they have the same
+    `label` and `pred` can have arbitrary shape as long as they have the same
     number of elements.
 
     Parameters
@@ -169,7 +169,7 @@ def __init__(self, weight=None, batch_axis=0, **kwargs):
 
     def hybrid_forward(self, F, pred, label, sample_weight=None):
         label = _reshape_like(F, label, pred)
-        loss = F.abs(pred - label)
+        loss = F.abs(label - pred)
         loss = _apply_weighting(F, loss, self._weight, sample_weight)
         return F.mean(loss, axis=self._batch_axis, exclude=True)
 
@@ -195,7 +195,7 @@ class SigmoidBinaryCrossEntropyLoss(Loss):
            (1 - {label}_i) * \log(1 - {pred}_i)
 
-    `pred` and `label` can have arbitrary shape as long as they have the same
+    `label` and `pred` can have arbitrary shape as long as they have the same
     number of elements.
 
     Parameters
@@ -344,7 +344,7 @@ class KLDivLoss(Loss):
 
         L = \sum_i {label}_i * \big[\log({label}_i) - log({pred}_i)\big]
 
-    `pred` and `label` can have arbitrary shape as long as they have the same
+    `label` and `pred` can have arbitrary shape as long as they have the same
     number of elements.
 
     Parameters
@@ -481,13 +481,13 @@ class HuberLoss(Loss):
     exceeds rho but is equal to L2 loss otherwise. Also called SmoothedL1 loss.
 
     .. math::
-        L = \sum_i \begin{cases} \frac{1}{2 {rho}} ({pred}_i - {label}_i)^2 &
-                           \text{ if } |{pred}_i - {label}_i| < {rho} \\
-                           |{pred}_i - {label}_i| - \frac{{rho}}{2} &
+        L = \sum_i \begin{cases} \frac{1}{2 {rho}} ({label}_i - {pred}_i)^2 &
+                           \text{ if } |{label}_i - {pred}_i| < {rho} \\
+                           |{label}_i - {pred}_i| - \frac{{rho}}{2} &
                            \text{ otherwise }
             \end{cases}
 
-    `pred` and `label` can have arbitrary shape as long as they have the same
+    `label` and `pred` can have arbitrary shape as long as they have the same
     number of elements.
 
     Parameters
@@ -518,7 +518,7 @@ def __init__(self, rho=1, weight=None, batch_axis=0, **kwargs):
 
     def hybrid_forward(self, F, pred, label, sample_weight=None):
         label = _reshape_like(F, label, pred)
-        loss = F.abs(pred - label)
+        loss = F.abs(label - pred)
         loss = F.where(loss > self._rho, loss - 0.5 * self._rho,
                        (0.5/self._rho) * F.square(loss))
         loss = _apply_weighting(F, loss, self._weight, sample_weight)
@@ -532,7 +532,7 @@ class HingeLoss(Loss):
         L = \sum_i max(0, {margin} - {pred}_i \cdot {label}_i)
 
     where `pred` is the classifier prediction and `label` is the target tensor
-    containing values -1 or 1. `pred` and `label` must have the same number of
+    containing values -1 or 1. `label` and `pred` must have the same number of
     elements.
 
     Parameters
@@ -576,7 +576,7 @@ class SquaredHingeLoss(Loss):
         L = \sum_i max(0, {margin} - {pred}_i \cdot {label}_i)^2
 
     where `pred` is the classifier prediction and `label` is the target tensor
-    containing values -1 or 1. `pred` and `label` can have arbitrary shape as
+    containing values -1 or 1. `label` and `pred` can have arbitrary shape as
     long as they have the same number of elements.
 
     Parameters
@@ -621,7 +621,7 @@ class LogisticLoss(Loss):
 
     where `pred` is the classifier prediction and `label` is the target tensor
     containing values -1 or 1 (0 or 1 if `label_format` is binary).
-    `pred` and `label` can have arbitrary shape as long as they have the same number of elements.
+    `label` and `pred` can have arbitrary shape as long as they have the same number of elements.
 
     Parameters
     ----------
@@ -666,14 +666,14 @@ def hybrid_forward(self, F, pred, label, sample_weight=None):
 
 class TripletLoss(Loss):
     r"""Calculates triplet loss given three input tensors and a positive margin.
-    Triplet loss measures the relative similarity between prediction, a positive
-    example and a negative example:
+    Triplet loss measures the relative similarity between a positive
+    example, a negative example, and the prediction:
 
     .. math::
-        L = \sum_i \max(\Vert {pred}_i - {pos_i} \Vert_2^2 -
-                        \Vert {pred}_i - {neg_i} \Vert_2^2 + {margin}, 0)
+        L = \sum_i \max(\Vert {pos_i} - {pred}_i \Vert_2^2 -
+                        \Vert {neg_i} - {pred}_i \Vert_2^2 + {margin}, 0)
 
-    `pred`, `positive` and `negative` can have arbitrary shape as long as they
+    `positive`, `negative`, and `pred` can have arbitrary shape as long as they
     have the same number of elements.
 
     Parameters
@@ -703,7 +703,7 @@ def __init__(self, margin=1, weight=None, batch_axis=0, **kwargs):
     def hybrid_forward(self, F, pred, positive, negative):
        positive = _reshape_like(F, positive, pred)
        negative = _reshape_like(F, negative, pred)
-        loss = F.sum(F.square(pred-positive) - F.square(pred-negative),
+        loss = F.sum(F.square(positive-pred) - F.square(negative-pred),
                      axis=self._batch_axis, exclude=True)
         loss = F.relu(loss + self._margin)
         return _apply_weighting(F, loss, self._weight, None)
@@ -717,7 +717,7 @@ class PoissonNLLLoss(Loss):
 
     .. math:: L = \text{pred} - \text{target} * \log(\text{pred}) +\log(\text{target!})
 
-    `pred`, `target` can have arbitrary shape as long as they have the same number of elements.
+    `target` and `pred` can have arbitrary shape as long as they have the same number of elements.
 
     Parameters
     ----------
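Note: every flipped subtraction above feeds into F.square() or F.abs(), which are symmetric in the sign of their argument, so the patch only reorders operands and changes no computed loss values; the docstrings are updated to match the label-first wording. A quick illustrative check of that claim (not part of the patch; shapes and values below are made up):

# Illustrative check: (label - pred) and (pred - label) give identical
# L1/L2/Huber/Triplet losses, since only their magnitude is used.
import mxnet as mx
from mxnet.gluon.loss import L1Loss, L2Loss, HuberLoss, TripletLoss

pred = mx.nd.random.uniform(shape=(4, 8))
label = mx.nd.random.uniform(shape=(4, 8))

# Element-wise identity behind the patch: |label - pred| == |pred - label|.
lhs = mx.nd.abs(label - pred)
rhs = mx.nd.abs(pred - label)
assert mx.nd.max(mx.nd.abs(lhs - rhs)).asscalar() == 0.0

# The Gluon loss blocks therefore return the same per-sample values as before.
for loss_fn in (L1Loss(), L2Loss(), HuberLoss(rho=1)):
    print(loss_fn(pred, label))          # shape (4,): one loss per batch row

# TripletLoss takes an anchor (`pred`) plus positive/negative examples; the
# reordered subtraction sits inside F.square, so it is likewise a no-op.
positive = mx.nd.random.uniform(shape=(4, 8))
negative = mx.nd.random.uniform(shape=(4, 8))
print(TripletLoss(margin=1)(pred, positive, negative))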