diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index ab7dadb17a54..b69d0c9af0dc 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -1,3 +1,4 @@
+# coding: utf-8
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
@@ -15,7 +16,6 @@
 # specific language governing permissions and limitations
 # under the License.
 
-# coding: utf-8
 # pylint: disable=too-many-lines
 """Weight updating functions."""
 import logging
@@ -548,7 +548,7 @@ def update_multi_precision(self, index, weight, grad, state):
 
 @register
 class Signum(Optimizer):
-    """The Signum optimizer that takes the sign of gradient or momentum.
+    r"""The Signum optimizer that takes the sign of gradient or momentum.
 
     The optimizer updates the weight by::
 
@@ -556,7 +556,11 @@ class Signum(Optimizer):
         state = momentum * state + (1-momentum)*rescaled_grad
         weight = (1 - lr * wd_lh) * weight - lr * sign(state)
 
-    See the original paper at: https://jeremybernste.in/projects/amazon/signum.pdf
+    Reference:
+    Jeremy Bernstein, Yu-Xiang Wang, Kamyar Azizzadenesheli & Anima Anandkumar (2018).
+    signSGD: Compressed Optimisation for Non-Convex Problems. In ICML'18.
+
+    See: https://arxiv.org/abs/1802.04434
 
     For details of the update algorithm see
     :class:`~mxnet.ndarray.signsgd_update` and :class:`~mxnet.ndarray.signum_update`.