diff --git a/contrib/clojure-package/src/dev/generator.clj b/contrib/clojure-package/src/dev/generator.clj
index 34210bef63d0..864c67ff6bcd 100644
--- a/contrib/clojure-package/src/dev/generator.clj
+++ b/contrib/clojure-package/src/dev/generator.clj
@@ -123,7 +123,11 @@
     (.write w "\n\n")
     (.write w "\n\n")
   (doseq [f functions]
-    (clojure.pprint/pprint f w)
+    (let [fstr (-> f
+                   clojure.pprint/pprint
+                   with-out-str
+                   (clojure.string/replace #"\\n\\n" "\n"))]
+      (.write w fstr))
     (.write w "\n"))))
 
 ;;;;;;; Common operations
@@ -447,7 +451,10 @@
                             :type "Map[String, String]"
                             :optional? true
                             :description "Attributes of the symbol"}))
-        doc (gen-symbol-api-doc fn-description params)
+        doc (clojure.string/join
+             "\n\n  "
+             (-> (gen-symbol-api-doc fn-description params)
+                 (clojure.string/split #"\n")))
         default-call (gen-symbol-api-default-arity op-name params)]
     `(~'defn ~(symbol fn-name)
       ~doc
@@ -520,7 +527,10 @@
                                  :type "NDArray-or-Symbol"
                                  :optional? true
                                  :description "Output array."}))
-        doc (gen-ndarray-api-doc fn-description params)
+        doc (clojure.string/join
+             "\n\n  "
+             (-> (gen-ndarray-api-doc fn-description params)
+                 (clojure.string/split #"\n")))
         opt-params (filter :optional? params)
         req-params (remove :optional? params)
         req-call (gen-ndarray-api-required-arity fn-name req-params)
diff --git a/contrib/clojure-package/test/good-test-ndarray-api.clj b/contrib/clojure-package/test/good-test-ndarray-api.clj
index 1b83a7beb7bc..7554089d0ba0 100644
--- a/contrib/clojure-package/test/good-test-ndarray-api.clj
+++ b/contrib/clojure-package/test/good-test-ndarray-api.clj
@@ -31,7 +31,23 @@
 
 (defn
  activation
- "Applies an activation function element-wise to the input.\n\nThe following activation functions are supported:\n\n- `relu`: Rectified Linear Unit, :math:`y = max(x, 0)`\n- `sigmoid`: :math:`y = \\frac{1}{1 + exp(-x)}`\n- `tanh`: Hyperbolic tangent, :math:`y = \\frac{exp(x) - exp(-x)}{exp(x) + exp(-x)}`\n- `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))`\n- `softsign`: :math:`y = \\frac{x}{1 + abs(x)}`\n\n\n\nDefined in src/operator/nn/activation.cc:L167\n\n`data`: The input array.\n`act-type`: Activation function to be applied.\n`out`: Output array. (optional)\n"
+ "Applies an activation function element-wise to the input.
+  
+  The following activation functions are supported:
+  
+  - `relu`: Rectified Linear Unit, :math:`y = max(x, 0)`
+  - `sigmoid`: :math:`y = \\frac{1}{1 + exp(-x)}`
+  - `tanh`: Hyperbolic tangent, :math:`y = \\frac{exp(x) - exp(-x)}{exp(x) + exp(-x)}`
+  - `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))`
+  - `softsign`: :math:`y = \\frac{x}{1 + abs(x)}`
+  
+  
+  
+  Defined in src/operator/nn/activation.cc:L167
+  
+  `data`: The input array.
+  `act-type`: Activation function to be applied.
+  `out`: Output array. (optional)"
  ([data act-type] (activation {:data data, :act-type act-type}))
  ([{:keys [data act-type out], :or {out nil}, :as opts}]
   (util/coerce-return
@@ -39,7 +55,72 @@
 
 (defn
  batch-norm
- "Batch normalization.\n\nNormalizes a data batch by mean and variance, and applies a scale ``gamma`` as\nwell as offset ``beta``.\n\nAssume the input has more than one dimension and we normalize along axis 1.\nWe first compute the mean and variance along this axis:\n\n.. math::\n\n  data\\_mean[i] = mean(data[:,i,:,...]) \\\\\n  data\\_var[i] = var(data[:,i,:,...])\n\nThen compute the normalized output, which has the same shape as input, as following:\n\n.. math::\n\n  out[:,i,:,...] = \\frac{data[:,i,:,...] - data\\_mean[i]}{\\sqrt{data\\_var[i]+\\epsilon}} * gamma[i] + beta[i]\n\nBoth *mean* and *var* returns a scalar by treating the input as a vector.\n\nAssume the input has size *k* on axis 1, then both ``gamma`` and ``beta``\nhave shape *(k,)*. If ``output_mean_var`` is set to be true, then outputs both ``data_mean`` and\nthe inverse of ``data_var``, which are needed for the backward pass. Note that gradient of these\ntwo outputs are blocked.\n\nBesides the inputs and the outputs, this operator accepts two auxiliary\nstates, ``moving_mean`` and ``moving_var``, which are *k*-length\nvectors. They are global statistics for the whole dataset, which are updated\nby::\n\n  moving_mean = moving_mean * momentum + data_mean * (1 - momentum)\n  moving_var = moving_var * momentum + data_var * (1 - momentum)\n\nIf ``use_global_stats`` is set to be true, then ``moving_mean`` and\n``moving_var`` are used instead of ``data_mean`` and ``data_var`` to compute\nthe output. It is often used during inference.\n\nThe parameter ``axis`` specifies which axis of the input shape denotes\nthe 'channel' (separately normalized groups).  The default is 1.  Specifying -1 sets the channel\naxis to be the last item in the input shape.\n\nBoth ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is true,\nthen set ``gamma`` to 1 and its gradient to 0.\n\n.. Note::\n  When ``fix_gamma`` is set to True, no sparse support is provided. If ``fix_gamma is`` set to False,\n  the sparse tensors will fallback.\n\n\n\nDefined in src/operator/nn/batch_norm.cc:L574\n\n`data`: Input data to batch normalization\n`gamma`: gamma array\n`beta`: beta array\n`moving-mean`: running mean of input\n`moving-var`: running variance of input\n`eps`: Epsilon to prevent div 0. Must be no less than CUDNN_BN_MIN_EPSILON defined in cudnn.h when using cudnn (usually 1e-5) (optional)\n`momentum`: Momentum for moving average (optional)\n`fix-gamma`: Fix gamma while training (optional)\n`use-global-stats`: Whether use global moving statistics instead of local batch-norm. This will force change batch-norm into a scale shift operator. (optional)\n`output-mean-var`: Output the mean and inverse std  (optional)\n`axis`: Specify which shape axis the channel is specified (optional)\n`cudnn-off`: Do not select CUDNN operator, if available (optional)\n`out`: Output array. (optional)\n"
+ "Batch normalization.
+  
+  Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as
+  well as offset ``beta``.
+  
+  Assume the input has more than one dimension and we normalize along axis 1.
+  We first compute the mean and variance along this axis:
+  
+  .. math::
+  
+    data\\_mean[i] = mean(data[:,i,:,...]) \\\\
+    data\\_var[i] = var(data[:,i,:,...])
+  
+  Then compute the normalized output, which has the same shape as input, as following:
+  
+  .. math::
+  
+    out[:,i,:,...] = \\frac{data[:,i,:,...] - data\\_mean[i]}{\\sqrt{data\\_var[i]+\\epsilon}} * gamma[i] + beta[i]
+  
+  Both *mean* and *var* returns a scalar by treating the input as a vector.
+  
+  Assume the input has size *k* on axis 1, then both ``gamma`` and ``beta``
+  have shape *(k,)*. If ``output_mean_var`` is set to be true, then outputs both ``data_mean`` and
+  the inverse of ``data_var``, which are needed for the backward pass. Note that gradient of these
+  two outputs are blocked.
+  
+  Besides the inputs and the outputs, this operator accepts two auxiliary
+  states, ``moving_mean`` and ``moving_var``, which are *k*-length
+  vectors. They are global statistics for the whole dataset, which are updated
+  by::
+  
+    moving_mean = moving_mean * momentum + data_mean * (1 - momentum)
+    moving_var = moving_var * momentum + data_var * (1 - momentum)
+  
+  If ``use_global_stats`` is set to be true, then ``moving_mean`` and
+  ``moving_var`` are used instead of ``data_mean`` and ``data_var`` to compute
+  the output. It is often used during inference.
+  
+  The parameter ``axis`` specifies which axis of the input shape denotes
+  the 'channel' (separately normalized groups).  The default is 1.  Specifying -1 sets the channel
+  axis to be the last item in the input shape.
+  
+  Both ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is true,
+  then set ``gamma`` to 1 and its gradient to 0.
+  
+  .. Note::
+    When ``fix_gamma`` is set to True, no sparse support is provided. If ``fix_gamma is`` set to False,
+    the sparse tensors will fallback.
+  
+  
+  
+  Defined in src/operator/nn/batch_norm.cc:L574
+  
+  `data`: Input data to batch normalization
+  `gamma`: gamma array
+  `beta`: beta array
+  `moving-mean`: running mean of input
+  `moving-var`: running variance of input
+  `eps`: Epsilon to prevent div 0. Must be no less than CUDNN_BN_MIN_EPSILON defined in cudnn.h when using cudnn (usually 1e-5) (optional)
+  `momentum`: Momentum for moving average (optional)
+  `fix-gamma`: Fix gamma while training (optional)
+  `use-global-stats`: Whether use global moving statistics instead of local batch-norm. This will force change batch-norm into a scale shift operator. (optional)
+  `output-mean-var`: Output the mean and inverse std  (optional)
+  `axis`: Specify which shape axis the channel is specified (optional)
+  `cudnn-off`: Do not select CUDNN operator, if available (optional)
+  `out`: Output array. (optional)"
  ([data gamma beta moving-mean moving-var]
   (batch-norm
    {:data data,
diff --git a/contrib/clojure-package/test/good-test-symbol-api.clj b/contrib/clojure-package/test/good-test-symbol-api.clj
index a03088486ee8..c7450f8eb5c1 100644
--- a/contrib/clojure-package/test/good-test-symbol-api.clj
+++ b/contrib/clojure-package/test/good-test-symbol-api.clj
@@ -31,7 +31,24 @@
 
 (defn
  activation
- "Applies an activation function element-wise to the input.\n\nThe following activation functions are supported:\n\n- `relu`: Rectified Linear Unit, :math:`y = max(x, 0)`\n- `sigmoid`: :math:`y = \\frac{1}{1 + exp(-x)}`\n- `tanh`: Hyperbolic tangent, :math:`y = \\frac{exp(x) - exp(-x)}{exp(x) + exp(-x)}`\n- `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))`\n- `softsign`: :math:`y = \\frac{x}{1 + abs(x)}`\n\n\n\nDefined in src/operator/nn/activation.cc:L167\n\n`data`: The input array. (optional)\n`act-type`: Activation function to be applied.\n`name`: Name of the symbol (optional)\n`attr`: Attributes of the symbol (optional)\n"
+ "Applies an activation function element-wise to the input.
+  
+  The following activation functions are supported:
+  
+  - `relu`: Rectified Linear Unit, :math:`y = max(x, 0)`
+  - `sigmoid`: :math:`y = \\frac{1}{1 + exp(-x)}`
+  - `tanh`: Hyperbolic tangent, :math:`y = \\frac{exp(x) - exp(-x)}{exp(x) + exp(-x)}`
+  - `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))`
+  - `softsign`: :math:`y = \\frac{x}{1 + abs(x)}`
+  
+  
+  
+  Defined in src/operator/nn/activation.cc:L167
+  
+  `data`: The input array. (optional)
+  `act-type`: Activation function to be applied.
+  `name`: Name of the symbol (optional)
+  `attr`: Attributes of the symbol (optional)"
  [{:keys [data act-type name attr],
    :or {data nil, name nil, attr nil},
    :as opts}]
@@ -51,7 +68,73 @@
 
 (defn
  batch-norm
- "Batch normalization.\n\nNormalizes a data batch by mean and variance, and applies a scale ``gamma`` as\nwell as offset ``beta``.\n\nAssume the input has more than one dimension and we normalize along axis 1.\nWe first compute the mean and variance along this axis:\n\n.. math::\n\n  data\\_mean[i] = mean(data[:,i,:,...]) \\\\\n  data\\_var[i] = var(data[:,i,:,...])\n\nThen compute the normalized output, which has the same shape as input, as following:\n\n.. math::\n\n  out[:,i,:,...] = \\frac{data[:,i,:,...] - data\\_mean[i]}{\\sqrt{data\\_var[i]+\\epsilon}} * gamma[i] + beta[i]\n\nBoth *mean* and *var* returns a scalar by treating the input as a vector.\n\nAssume the input has size *k* on axis 1, then both ``gamma`` and ``beta``\nhave shape *(k,)*. If ``output_mean_var`` is set to be true, then outputs both ``data_mean`` and\nthe inverse of ``data_var``, which are needed for the backward pass. Note that gradient of these\ntwo outputs are blocked.\n\nBesides the inputs and the outputs, this operator accepts two auxiliary\nstates, ``moving_mean`` and ``moving_var``, which are *k*-length\nvectors. They are global statistics for the whole dataset, which are updated\nby::\n\n  moving_mean = moving_mean * momentum + data_mean * (1 - momentum)\n  moving_var = moving_var * momentum + data_var * (1 - momentum)\n\nIf ``use_global_stats`` is set to be true, then ``moving_mean`` and\n``moving_var`` are used instead of ``data_mean`` and ``data_var`` to compute\nthe output. It is often used during inference.\n\nThe parameter ``axis`` specifies which axis of the input shape denotes\nthe 'channel' (separately normalized groups).  The default is 1.  Specifying -1 sets the channel\naxis to be the last item in the input shape.\n\nBoth ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is true,\nthen set ``gamma`` to 1 and its gradient to 0.\n\n.. Note::\n  When ``fix_gamma`` is set to True, no sparse support is provided. If ``fix_gamma is`` set to False,\n  the sparse tensors will fallback.\n\n\n\nDefined in src/operator/nn/batch_norm.cc:L574\n\n`data`: Input data to batch normalization (optional)\n`gamma`: gamma array (optional)\n`beta`: beta array (optional)\n`moving-mean`: running mean of input (optional)\n`moving-var`: running variance of input (optional)\n`eps`: Epsilon to prevent div 0. Must be no less than CUDNN_BN_MIN_EPSILON defined in cudnn.h when using cudnn (usually 1e-5) (optional)\n`momentum`: Momentum for moving average (optional)\n`fix-gamma`: Fix gamma while training (optional)\n`use-global-stats`: Whether use global moving statistics instead of local batch-norm. This will force change batch-norm into a scale shift operator. (optional)\n`output-mean-var`: Output the mean and inverse std  (optional)\n`axis`: Specify which shape axis the channel is specified (optional)\n`cudnn-off`: Do not select CUDNN operator, if available (optional)\n`name`: Name of the symbol (optional)\n`attr`: Attributes of the symbol (optional)\n"
+ "Batch normalization.
+  
+  Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as
+  well as offset ``beta``.
+  
+  Assume the input has more than one dimension and we normalize along axis 1.
+  We first compute the mean and variance along this axis:
+  
+  .. math::
+  
+    data\\_mean[i] = mean(data[:,i,:,...]) \\\\
+    data\\_var[i] = var(data[:,i,:,...])
+  
+  Then compute the normalized output, which has the same shape as input, as following:
+  
+  .. math::
+  
+    out[:,i,:,...] = \\frac{data[:,i,:,...] - data\\_mean[i]}{\\sqrt{data\\_var[i]+\\epsilon}} * gamma[i] + beta[i]
+  
+  Both *mean* and *var* returns a scalar by treating the input as a vector.
+  
+  Assume the input has size *k* on axis 1, then both ``gamma`` and ``beta``
+  have shape *(k,)*. If ``output_mean_var`` is set to be true, then outputs both ``data_mean`` and
+  the inverse of ``data_var``, which are needed for the backward pass. Note that gradient of these
+  two outputs are blocked.
+  
+  Besides the inputs and the outputs, this operator accepts two auxiliary
+  states, ``moving_mean`` and ``moving_var``, which are *k*-length
+  vectors. They are global statistics for the whole dataset, which are updated
+  by::
+  
+    moving_mean = moving_mean * momentum + data_mean * (1 - momentum)
+    moving_var = moving_var * momentum + data_var * (1 - momentum)
+  
+  If ``use_global_stats`` is set to be true, then ``moving_mean`` and
+  ``moving_var`` are used instead of ``data_mean`` and ``data_var`` to compute
+  the output. It is often used during inference.
+  
+  The parameter ``axis`` specifies which axis of the input shape denotes
+  the 'channel' (separately normalized groups).  The default is 1.  Specifying -1 sets the channel
+  axis to be the last item in the input shape.
+  
+  Both ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is true,
+  then set ``gamma`` to 1 and its gradient to 0.
+  
+  .. Note::
+    When ``fix_gamma`` is set to True, no sparse support is provided. If ``fix_gamma is`` set to False,
+    the sparse tensors will fallback.
+  
+  
+  
+  Defined in src/operator/nn/batch_norm.cc:L574
+  
+  `data`: Input data to batch normalization (optional)
+  `gamma`: gamma array (optional)
+  `beta`: beta array (optional)
+  `moving-mean`: running mean of input (optional)
+  `moving-var`: running variance of input (optional)
+  `eps`: Epsilon to prevent div 0. Must be no less than CUDNN_BN_MIN_EPSILON defined in cudnn.h when using cudnn (usually 1e-5) (optional)
+  `momentum`: Momentum for moving average (optional)
+  `fix-gamma`: Fix gamma while training (optional)
+  `use-global-stats`: Whether use global moving statistics instead of local batch-norm. This will force change batch-norm into a scale shift operator. (optional)
+  `output-mean-var`: Output the mean and inverse std  (optional)
+  `axis`: Specify which shape axis the channel is specified (optional)
+  `cudnn-off`: Do not select CUDNN operator, if available (optional)
+  `name`: Name of the symbol (optional)
+  `attr`: Attributes of the symbol (optional)"
  [{:keys
    [data
     gamma