diff --git a/ci/windows/test_py2_cpu.ps1 b/ci/windows/test_py2_cpu.ps1
index 1c4a72682ae5..26b02d890fe3 100644
--- a/ci/windows/test_py2_cpu.ps1
+++ b/ci/windows/test_py2_cpu.ps1
@@ -27,3 +27,7 @@ C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 -
 if (! $?) { Throw ("Error running unittest") }
 C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_train.xml tests\python\train
 if (! $?) { Throw ("Error running train tests") }
+# Adding this extra test since it's not possible to set an env var on the fly in Windows.
+$env:MXNET_SAFE_ACCUMULATION=1
+C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest\test_operator.py:test_norm
+if (! $?) { Throw ("Error running unittest") }
diff --git a/ci/windows/test_py2_gpu.ps1 b/ci/windows/test_py2_gpu.ps1
index 8a6c8e9b44f9..8418bd0b490d 100644
--- a/ci/windows/test_py2_gpu.ps1
+++ b/ci/windows/test_py2_gpu.ps1
@@ -31,3 +31,7 @@ C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 -
 if (! $?) { Throw ("Error running tests") }
 C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error tests\python\train
 if (! $?) { Throw ("Error running tests") }
+# Adding this extra test since it's not possible to set an env var on the fly in Windows.
+$env:MXNET_SAFE_ACCUMULATION=1
+C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_operator.xml tests\python\gpu\test_operator_gpu.py:test_norm
+if (! $?) { Throw ("Error running tests") }
diff --git a/ci/windows/test_py3_cpu.ps1 b/ci/windows/test_py3_cpu.ps1
index a7067f9f3f83..56bf5c6d4595 100644
--- a/ci/windows/test_py3_cpu.ps1
+++ b/ci/windows/test_py3_cpu.ps1
@@ -27,3 +27,7 @@ C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 -
 if (! $?) { Throw ("Error running unittest") }
 C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_train.xml tests\python\train
 if (! $?) { Throw ("Error running train tests") }
+# Adding this extra test since it's not possible to set an env var on the fly in Windows.
+$env:MXNET_SAFE_ACCUMULATION=1
+C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest\test_operator.py:test_norm
+if (! $?) { Throw ("Error running unittest") }
diff --git a/ci/windows/test_py3_gpu.ps1 b/ci/windows/test_py3_gpu.ps1
index 5fbc9f2f8036..ef07b428fca2 100644
--- a/ci/windows/test_py3_gpu.ps1
+++ b/ci/windows/test_py3_gpu.ps1
@@ -31,3 +31,7 @@ C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 -
 if (! $?) { Throw ("Error running tests") }
 C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_train.xml tests\python\train
 if (! $?) { Throw ("Error running tests") }
+# Adding this extra test since it's not possible to set an env var on the fly in Windows.
+$env:MXNET_SAFE_ACCUMULATION=1
+C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_operator.xml tests\python\gpu\test_operator_gpu.py:test_norm
+if (! $?) { Throw ("Error running tests") }
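A note on the CI change above: per the scripts' own comment, the variable cannot be toggled on the fly on Windows, so it is set once at the PowerShell session level and `test_norm` gets a dedicated extra nose invocation. On other platforms the updated test toggles it per run via `os.environ`, along these lines (an illustrative sketch, not part of the patch):

```python
import os

# Must be set before the operator under test executes: the backend
# consults the variable at operator dispatch time, not at import time.
os.environ["MXNET_SAFE_ACCUMULATION"] = "1"

import mxnet as mx

x = mx.nd.ones((10,), dtype='float16')
print(mx.nd.norm(x, ord=2))  # norm now accumulates in float32
```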
diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md
index c5ebd54c55a1..c1c23ba969d2 100644
--- a/docs/faq/env_var.md
+++ b/docs/faq/env_var.md
@@ -280,6 +280,14 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca
   - Values: Int ```(default=4)```
   - This variable controls how many CuDNN dropout state resources to create for each GPU context for use in operator.
 
+* MXNET_SAFE_ACCUMULATION
+  - Values: 0(false) or 1(true) ```(default=0)```
+  - If this variable is set, accumulation enters safe mode: the accumulation is done in a data type of higher precision than
+    the input data type, giving more accurate results at the cost of some performance and of numerical compatibility with
+    earlier releases. For example, when the variable is set to 1(true), float16 input data is accumulated
+    in float32.
+  - Model accuracies do not necessarily improve with this environment variable turned on.
+
 Settings for Minimum Memory Usage
 ---------------------------------
 - Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1```
diff --git a/src/operator/tensor/broadcast_reduce_op.h b/src/operator/tensor/broadcast_reduce_op.h
index 9fec6cd1255a..f7d9f13fd869 100644
--- a/src/operator/tensor/broadcast_reduce_op.h
+++ b/src/operator/tensor/broadcast_reduce_op.h
@@ -1183,12 +1183,23 @@ void LpNormCompute(const nnvm::NodeAttrs& attrs,
   } else {
     small = ReduceAxesShapeImpl(inputs[0].shape_, param.axis, true, false);
   }
+
   if (param.ord == 1) {
-    ReduceAxesComputeImpl<xpu, mshadow_op::sum, false, mshadow_op::abs>(
+    if (dmlc::GetEnv("MXNET_SAFE_ACCUMULATION", false)) {
+      ReduceAxesComputeImpl<xpu, mshadow_op::sum, true, mshadow_op::abs>(
+          ctx, inputs, req, outputs, small);
+    } else {
+      ReduceAxesComputeImpl<xpu, mshadow_op::sum, false, mshadow_op::abs>(
        ctx, inputs, req, outputs, small);
+    }
   } else if (param.ord == 2) {
-    ReduceAxesComputeImpl<xpu, mshadow_op::nrm2, false, mshadow_op::identity>(
+    if (dmlc::GetEnv("MXNET_SAFE_ACCUMULATION", false)) {
+      ReduceAxesComputeImpl<xpu, mshadow_op::nrm2, true, mshadow_op::identity>(
        ctx, inputs, req, outputs, small);
+    } else {
+      ReduceAxesComputeImpl<xpu, mshadow_op::nrm2, false, mshadow_op::identity>(
+          ctx, inputs, req, outputs, small);
+    }
   }
 }
 
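The dispatch above selects between two instantiations of `ReduceAxesComputeImpl`: with the flag enabled, float16 inputs are accumulated in a wider type and only the final result is narrowed. The numerical effect is easy to reproduce in plain NumPy (a toy illustration of the idea, not MXNet's actual kernel):

```python
import numpy as np

x = np.full(10000, 0.1, dtype=np.float16)  # exact sum would be ~999.76

# Unsafe: keep the running total in float16. Once the total reaches 256,
# adding 0.1 falls below half the float16 spacing and is rounded away.
acc16 = np.float16(0.0)
for v in x:
    acc16 = np.float16(acc16 + v)

# Safe: accumulate in float32, cast once at the end.
acc32 = np.float32(0.0)
for v in x:
    acc32 += np.float32(v)

print(acc16, np.float16(acc32))  # roughly 256.0 vs 1000.0
```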
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 7db07596d7f8..01798f58f16e 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -3482,51 +3482,61 @@ def l2norm(input_data, axis=0, keepdims=True):
     epsilon = 1e-3
     acc_type = {np.float16: np.float32, np.float32: np.float32, np.float64: np.float64,
                 np.int32: np.int32, np.int64: np.int64}
+    dtype_to_str = {np.float16: 'float16', np.float32: 'float32', np.float64: 'float64',
+                    np.int32: 'int32', np.int64: 'int64'}
     is_windows = sys.platform.startswith('win')
-    for order in [1, 2]:
-        for dtype in [np.float16, np.float32, np.float64, np.int32, np.int64]:
-            for i in range(in_data_dim):
-                for out_dtype in ['float32', 'float64', 'int32', 'int64']:
-                    if (dtype == np.int32 or dtype == np.int64) and ('int' not in out_dtype or is_windows):
-                        continue
-                    if dtype != np.int32 and dtype != np.int64 and 'int' in out_dtype:
-                        continue
-                    backward_dtype = np.float32 if out_dtype == 'float32' else np.float64
-                    skip_backward = 'int' in out_dtype
-                    print(order, dtype, i, out_dtype, in_shape)
-                    in_data = np.random.uniform(-1, 1, in_shape).astype(acc_type[dtype])
-                    in_data[abs(in_data) < epsilon] = 2 * epsilon
-                    norm_sym = mx.symbol.norm(data=data, ord=order, axis=i, out_dtype=out_dtype, keepdims=True)
-                    npy_out = l1norm(in_data, i) if order is 1 else l2norm(in_data, i)
-                    npy_out_backward = np.sign(in_data) if order is 1 else in_data/npy_out
-                    check_symbolic_forward(norm_sym, [in_data.astype(dtype)], [npy_out.astype(out_dtype)],
-                                           rtol=1e-3, atol=1e-5, ctx=ctx)
-                    if not skip_backward:
-                        check_symbolic_backward(norm_sym, [in_data.astype(dtype)],
-                                                [np.ones(npy_out.shape).astype(out_dtype)],
-                                                [npy_out_backward], rtol=1e-3, atol=1e-5, ctx=ctx,
-                                                dtype=backward_dtype)
-                    # Disable numeric gradient https://github.com/apache/incubator-mxnet/issues/11509
-                    # check gradient
-                    if dtype is not np.float16 and not skip_backward:
-                        check_numeric_gradient(norm_sym, [in_data], numeric_eps=epsilon,
-                                               rtol=1e-1, atol=1e-3, dtype=backward_dtype)
-                    if i < in_data_dim-1:
-                        norm_sym = mx.symbol.norm(data=data, ord=order, axis=(i, i+1), keepdims=True)
-                        npy_out = l1norm(in_data, (i, i+1)) if order is 1 else l2norm(in_data, (i, i+1))
+    for enforce_safe_acc in ["1", "0"]:
+        if is_windows:
+            if enforce_safe_acc == "0":
+                break
+            enforce_safe_acc = "0" if "MXNET_SAFE_ACCUMULATION" not in os.environ else os.environ["MXNET_SAFE_ACCUMULATION"]
+        else:
+            os.environ["MXNET_SAFE_ACCUMULATION"] = enforce_safe_acc
+        for order in [1, 2]:
+            for dtype in [np.float16, np.float32, np.float64]:
+                for i in range(in_data_dim):
+                    for out_dtype in ['float32', 'float64']:
+                        backward_dtype = np.float32 if out_dtype == 'float32' else np.float64
+                        accumulation_type = acc_type[dtype]
+                        if enforce_safe_acc == "0":
+                            backward_dtype = dtype
+                            out_dtype = dtype_to_str[dtype]
+                            accumulation_type = dtype
+                        skip_backward = 'int' in out_dtype
+                        in_data = np.random.uniform(-1, 1, in_shape).astype(accumulation_type)
+                        in_data[abs(in_data) < epsilon] = 2 * epsilon
+                        norm_sym = mx.symbol.norm(data=data, ord=order, axis=i, out_dtype=out_dtype, keepdims=True)
+                        npy_out = l1norm(in_data, i) if order is 1 else l2norm(in_data, i)
                         npy_out_backward = np.sign(in_data) if order is 1 else in_data/npy_out
-                        check_symbolic_forward(norm_sym, [in_data], [npy_out.astype(dtype)],
-                                               rtol=1e-3 if dtype is np.float16 else 1e-3,
-                                               atol=1e-5 if dtype is np.float16 else 1e-5, ctx=ctx)
-                        if not skip_backward:
-                            check_symbolic_backward(norm_sym, [in_data],
+                        check_symbolic_forward(norm_sym, [in_data.astype(dtype)], [npy_out.astype(out_dtype)],
+                                               rtol=1e-2 if dtype == np.float16 else 1e-3,
+                                               atol=1e-4 if dtype == np.float16 else 1e-5, ctx=ctx, dtype=dtype)
+                        if dtype is not np.float16 and not skip_backward:
+                            check_symbolic_backward(norm_sym, [in_data.astype(dtype)],
                                                     [np.ones(npy_out.shape).astype(out_dtype)],
-                                                    [npy_out_backward.astype(out_dtype)],
-                                                    rtol=1e-3, atol=1e-5, ctx=ctx, dtype=backward_dtype)
+                                                    [npy_out_backward], rtol=1e-3, atol=1e-5, ctx=ctx,
+                                                    dtype=backward_dtype)
+                        # Disable numeric gradient https://github.com/apache/incubator-mxnet/issues/11509
                         # check gradient
                         if dtype is not np.float16 and not skip_backward:
                             check_numeric_gradient(norm_sym, [in_data], numeric_eps=epsilon,
                                                    rtol=1e-1, atol=1e-3, dtype=backward_dtype)
+                        if i < in_data_dim-1:
+                            norm_sym = mx.symbol.norm(data=data, ord=order, axis=(i, i+1), keepdims=True)
+                            npy_out = l1norm(in_data, (i, i+1)) if order is 1 else l2norm(in_data, (i, i+1))
+                            npy_out_backward = np.sign(in_data) if order is 1 else in_data/npy_out
+                            check_symbolic_forward(norm_sym, [in_data], [npy_out.astype(dtype)],
+                                                   rtol=1e-2 if dtype is np.float16 else 1e-3,
+                                                   atol=1e-4 if dtype is np.float16 else 1e-5, ctx=ctx)
+                            if dtype is not np.float16 and not skip_backward:
+                                check_symbolic_backward(norm_sym, [in_data],
+                                                        [np.ones(npy_out.shape).astype(out_dtype)],
+                                                        [npy_out_backward.astype(out_dtype)],
+                                                        rtol=1e-3, atol=1e-5, ctx=ctx, dtype=backward_dtype)
+                            # check gradient
+                            if dtype is not np.float16 and not skip_backward:
+                                check_numeric_gradient(norm_sym, [in_data], numeric_eps=epsilon,
+                                                       rtol=1e-1, atol=1e-3, dtype=backward_dtype)
 
 
 def test_layer_norm():
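End to end, the documented behavior can be checked from Python once MXNet is built from this branch. A quick hand-written demo of the overflow case the safe mode is meant to fix (not one of the tests above; assumes the default CPU context):

```python
import os
os.environ["MXNET_SAFE_ACCUMULATION"] = "1"  # set before the op runs

import mxnet as mx
import numpy as np

# 2000 * 100 = 200000 overflows float16 (max ~65504), so a float16
# running sum saturates to inf; safe accumulation carries the sum in
# float32, and out_dtype returns it without narrowing back to float16.
x = mx.nd.full((2000,), 100.0, dtype=np.float16)
print(mx.nd.norm(x, ord=1, out_dtype='float32'))  # ~200000, not inf
```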