diff --git a/src/operator/softmax_output-inl.h b/src/operator/softmax_output-inl.h
index 5f8203e824a3..e19dbe259dff 100644
--- a/src/operator/softmax_output-inl.h
+++ b/src/operator/softmax_output-inl.h
@@ -53,6 +53,7 @@ struct SoftmaxOutputParam : public dmlc::Parameter<SoftmaxOutputParam> {
   bool preserve_shape;
   int normalization;
   bool out_grad;
+  float smooth_alpha;
   DMLC_DECLARE_PARAMETER(SoftmaxOutputParam) {
     DMLC_DECLARE_FIELD(grad_scale).set_default(1.0f)
     .describe("Scales the gradient by a float factor.");
@@ -78,6 +79,10 @@ struct SoftmaxOutputParam : public dmlc::Parameter<SoftmaxOutputParam> {
     DMLC_DECLARE_FIELD(out_grad)
     .set_default(false)
     .describe("Multiplies gradient with output gradient element-wise.");
+    DMLC_DECLARE_FIELD(smooth_alpha)
+    .set_default(0.0f)
+    .set_range(0.0f, 1.0f)
+    .describe("Constant for smoothed cross-entropy gradients.");
   };
 };
 
@@ -215,9 +220,17 @@ class SoftmaxOutputOp : public Operator {
           in_grad[softmaxout_enum::kData].get_with_shape<xpu, 2, DType>(data_shape, s);
       index_t valid_cnt = label.shape_.Size();
       if (param_.use_ignore) {
-        SoftmaxGrad(grad, out, label, static_cast<DType>(param_.ignore_label));
+        if (param_.smooth_alpha == 0.0f) {
+          SoftmaxGrad(grad, out, label, static_cast<DType>(param_.ignore_label));
+        } else {
+          SmoothSoftmaxGrad(grad, out, label, static_cast<DType>(param_.ignore_label), param_.smooth_alpha);
+        }
       } else {
-        SoftmaxGrad(grad, out, label);
+        if (param_.smooth_alpha == 0.0f) {
+          SoftmaxGrad(grad, out, label);
+        } else {
+          SmoothSoftmaxGrad(grad, out, label, param_.smooth_alpha);
+        }
       }
       if (param_.normalization == softmaxout_enum::kBatch) {
         valid_cnt = label.size(0);
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 0c528e0b180f..fd88bb42ea75 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -235,6 +235,55 @@ def test_regression():
                      lambda x, y : x - y)
 
 
+def check_softmax_grad():
+    x = mx.sym.Variable('x')
+    label = mx.sym.Variable('label')
+    x_nd = mx.nd.array([[1, 6, 4, 2]], ctx=default_context())
+    grad_x = mx.nd.zeros((1,4), ctx=default_context())
+    label_nd = mx.nd.array([1], ctx=default_context())
+
+    sym = mx.sym.SoftmaxOutput(data=x, label=label, ignore_label=0, use_ignore=False)
+    ex = sym.bind(ctx=default_context(), args={'x': x_nd, 'label': label_nd}, args_grad={'x': grad_x})
+
+    ex.forward(is_train=True)
+    softmax_out = ex.outputs[0].asnumpy()
+    expected_softmax_out = mx.nd.softmax(x_nd).asnumpy()
+    print(softmax_out)
+    assert np.isclose(softmax_out, expected_softmax_out).all()
+
+    ex.backward(is_train=True)
+    grad_out = ex.grad_arrays[0].asnumpy()
+    k = int(label_nd[0].asscalar())
+    expected_grad_out = np.zeros((1,4))
+    expected_grad_out[0, k] = -1
+    assert np.isclose(grad_out - softmax_out, expected_grad_out).all()
+
+
+def check_smoothed_softmax_grad():
+    alpha = 0.2
+    x = mx.sym.Variable('x')
+    label = mx.sym.Variable('label')
+    x_nd = mx.nd.array([[1, 6, 4, 2]], ctx=default_context())
+    grad_x = mx.nd.zeros((1,4), ctx=default_context())
+    label_nd = mx.nd.array([1], ctx=default_context())
+
+    sym = mx.sym.SoftmaxOutput(data=x, label=label, ignore_label=0, use_ignore=False, smooth_alpha=alpha)
+    ex = sym.bind(ctx=default_context(), args={'x': x_nd, 'label': label_nd}, args_grad={'x': grad_x})
+
+    ex.forward(is_train=True)
+    softmax_out = ex.outputs[0].asnumpy()
+    expected_softmax_out = mx.nd.softmax(x_nd).asnumpy()
+    print(softmax_out)
+    assert np.isclose(softmax_out, expected_softmax_out).all()
+
+    ex.backward(is_train=True)
+    grad_out = ex.grad_arrays[0].asnumpy()
+    k = int(label_nd[0].asscalar())
+    expected_grad_out = np.full((1,4), fill_value=-alpha/(4-1))
+    expected_grad_out[0, k] = -(1 - alpha)
+    assert np.isclose(grad_out - softmax_out, expected_grad_out).all()
+
+
 def check_softmax_with_ignore_label(xpu):
     X = mx.symbol.Variable('X')
     L = mx.symbol.Variable('L')
@@ -289,6 +338,8 @@ def test_softmax():
     check_softmax_with_shape((3, 4), default_context(), preserve_shape=False)
     check_softmax_with_shape((3, 4), default_context(), preserve_shape=True)
     check_softmax_with_shape((3, 4, 2), default_context(), preserve_shape=True)
+    check_softmax_grad()
+    check_smoothed_softmax_grad()
 
 
 def test_python_op():
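
Note (not part of the patch): below is a minimal NumPy sketch of the gradient that the new SmoothSoftmaxGrad path is expected to produce, assuming the usual label-smoothing formulation implied by check_smoothed_softmax_grad, i.e. the one-hot target is replaced by (1 - smooth_alpha) on the true class and smooth_alpha / (K - 1) on the other K - 1 classes, with the gradient being softmax(x) - target. The function name smooth_softmax_grad_ref and the ignore_label handling shown here are illustrative assumptions, not the actual kernel code.

    import numpy as np

    def smooth_softmax_grad_ref(logits, labels, smooth_alpha=0.0, ignore_label=None):
        """Reference gradient of softmax cross-entropy with label smoothing.

        Illustrative sketch only: it mirrors the expectations asserted in
        check_smoothed_softmax_grad, not the mshadow SmoothSoftmaxGrad kernel.
        """
        # Row-wise softmax of the logits (shape: batch x K).
        z = logits - logits.max(axis=1, keepdims=True)
        p = np.exp(z)
        p /= p.sum(axis=1, keepdims=True)

        # Smoothed target: (1 - alpha) on the true class, alpha/(K - 1) elsewhere.
        batch, K = logits.shape
        target = np.full((batch, K), smooth_alpha / (K - 1))
        target[np.arange(batch), labels.astype(int)] = 1.0 - smooth_alpha

        grad = p - target
        if ignore_label is not None:
            # Assumed behaviour of the use_ignore=True branch: rows whose label
            # equals ignore_label contribute no gradient.
            grad[labels.astype(int) == ignore_label] = 0.0
        return grad

    # With smooth_alpha=0.2 and labels=[1], grad - softmax equals -alpha/(K-1)
    # off the true class and -(1 - alpha) on it, matching expected_grad_out in
    # check_smoothed_softmax_grad above.
    x = np.array([[1., 6., 4., 2.]])
    print(smooth_softmax_grad_ref(x, np.array([1]), smooth_alpha=0.2))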