diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h
index 5a150eb06ef8..78e1fa1d9c6a 100644
--- a/src/operator/tensor/matrix_op-inl.h
+++ b/src/operator/tensor/matrix_op-inl.h
@@ -122,7 +122,7 @@ inline TShape InferReshapeShape(const nnvm::Tuple<int>& shape,
       CHECK(d1 != -1 || d2 != -1) << "Split dims cannot both be -1.";
       if (d1 == -1) d1 = d0 / d2;
       if (d2 == -1) d2 = d0 / d1;
-      CHECK_EQ(d1 * d2, static_cast<IType>(d0)) <<
+      CHECK(d1 * d2 == static_cast<IType>(d0) || static_cast<IType>(d0) == IType(0)) <<
        "Split dims " << d1 << ", " << d2 << " do not divide original dim " << d0;
       tmp.push_back(d1);
       tmp.push_back(d2);
@@ -180,7 +180,7 @@ inline bool ReshapeShape(const nnvm::NodeAttrs& attrs,
   const ReshapeParam& param_ = nnvm::get<ReshapeParam>(attrs.parsed);
   CHECK_EQ(in_attrs->size(), 1U) << "Input: [data]";
   CHECK_EQ(out_attrs->size(), 1U);
-  const TShape &dshape = (*in_attrs)[0];
+  TShape &dshape = (*in_attrs)[0];
   if (dshape.ndim() == 0) return false;
   TShape oshape;
   if (param_.shape.ndim() != 0) {
@@ -205,11 +205,9 @@ inline bool ReshapeShape(const nnvm::NodeAttrs& attrs,
       oshape[inf_idx] = dshape.Size() / oshape.Size();
     }
   } else {
-    if ((*out_attrs)[0].ndim()) {
-      return ReverseReshapeInferShape(&(*in_attrs)[0], oshape);
-    }
-    return false;
+    return (*out_attrs)[0].ndim() && ReverseReshapeInferShape(&(*in_attrs)[0], (*out_attrs)[0]);
   }
+  ReverseReshapeInferShape(&dshape, oshape);
   CHECK_EQ(oshape.Size(), dshape.Size())
     << "Target shape size is different to source. "
     << "Target: " << oshape
diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py
new file mode 100644
index 000000000000..42d65dab5fdc
--- /dev/null
+++ b/tests/python/gpu/test_gluon_gpu.py
@@ -0,0 +1,203 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import print_function
+import sys
+import os
+import time
+import multiprocessing as mp
+import unittest
+import mxnet as mx
+import numpy as np
+import unittest
+from nose.tools import assert_raises
+from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal
+from mxnet.base import MXNetError
+from mxnet import autograd
+from numpy.testing import assert_allclose
+
+curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+sys.path.insert(0, os.path.join(curr_path, '../unittest'))
+from common import setup_module, with_seed, teardown, assert_raises_cudnn_disabled
+from test_gluon import *
+from test_loss import *
+from test_gluon_rnn import *
+
+set_default_context(mx.gpu(0))
+
+def check_rnn_layer(layer):
+    layer.collect_params().initialize(ctx=[mx.cpu(0), mx.gpu(0)])
+    with mx.gpu(0):
+        x = mx.nd.ones((10, 16, 30))
+        states = layer.begin_state(16)
+        go, gs = layer(x, states)
+
+    with mx.cpu(0):
+        x = mx.nd.ones((10, 16, 30))
+        states = layer.begin_state(16)
+        co, cs = layer(x, states)
+
+    # atol of 1e-6 required, as exposed by seed 2124685726
+    assert_almost_equal(go.asnumpy(), co.asnumpy(), rtol=1e-2, atol=1e-6)
+    for g, c in zip(gs, cs):
+        assert_almost_equal(g.asnumpy(), c.asnumpy(), rtol=1e-2, atol=1e-6)
+
+
+def check_rnn_layer_w_rand_inputs(layer):
+    layer.collect_params().initialize(ctx=[mx.cpu(0), mx.gpu(0)])
+    x = mx.nd.uniform(shape=(10, 16, 30))
+    with mx.gpu(0):
+        x = x.copyto(mx.gpu(0))
+        states = layer.begin_state(16)
+        go, gs = layer(x, states)
+
+    with mx.cpu(0):
+        x = x.copyto(mx.cpu(0))
+        states = layer.begin_state(16)
+        co, cs = layer(x, states)
+
+    assert_almost_equal(go.asnumpy(), co.asnumpy(), rtol=1e-2, atol=1e-6)
+    for g, c in zip(gs, cs):
+        assert_almost_equal(g.asnumpy(), c.asnumpy(), rtol=1e-2, atol=1e-6)
+
+
+@with_seed()
+@assert_raises_cudnn_disabled()
+def test_rnn_layer():
+    check_rnn_layer(gluon.rnn.RNN(100, num_layers=3))
+    check_rnn_layer(gluon.rnn.RNN(100, activation='tanh', num_layers=3))
+    check_rnn_layer(gluon.rnn.LSTM(100, num_layers=3))
+    check_rnn_layer(gluon.rnn.GRU(100, num_layers=3))
+
+    check_rnn_layer(gluon.rnn.LSTM(100, num_layers=3, bidirectional=True))
+    check_rnn_layer_w_rand_inputs(gluon.rnn.LSTM(100, num_layers=3, bidirectional=True))
+
+
+@with_seed()
+def test_gluon_ctc_consistency():
+    loss = mx.gluon.loss.CTCLoss()
+    data = mx.nd.arange(0, 4, repeat=40, ctx=mx.gpu(0)).reshape((2,20,4)).flip(axis=0)
+    cpu_label = mx.nd.array([[2,1,-1,-1],[3,2,2,-1]], ctx=mx.cpu(0))
+    gpu_label = mx.nd.array([[2,1,-1,-1],[3,2,2,-1]], ctx=mx.gpu(0))
+
+    cpu_data = data.copy().as_in_context(mx.cpu(0))
+    cpu_data.attach_grad()
+    with mx.autograd.record():
+        l_cpu = loss(cpu_data, cpu_label)
+        l_cpu.backward()
+
+    gpu_data = data.copyto(mx.gpu(0))
+    gpu_data.attach_grad()
+    with mx.autograd.record():
+        l_gpu = loss(gpu_data, gpu_label)
+        l_gpu.backward()
+
+    assert_almost_equal(cpu_data.grad.asnumpy(), gpu_data.grad.asnumpy(), atol=1e-3, rtol=1e-3)
+
+
+@with_seed()
+def test_global_norm_clip_multi_device():
+    x1 = mx.nd.ones((3,3), ctx=mx.gpu(0))
+    x2 = mx.nd.ones((4,4), ctx=mx.cpu(0))
+    norm = gluon.utils.clip_global_norm([x1, x2], 1.0)
+    assert norm == 5.0
+    assert_almost_equal(x1.asnumpy(), np.ones((3,3))/5)
+    assert_almost_equal(x2.asnumpy(), np.ones((4,4))/5)
+
+
+def _check_batchnorm_result(input, num_devices=1, cuda=False):
+    from mxnet.gluon.utils import split_and_load
+    def _find_bn(module):
+        if isinstance(module, (mx.gluon.nn.BatchNorm, mx.gluon.contrib.nn.SyncBatchNorm)):
+            return module
+        elif isinstance(module.module, (mx.gluon.nn.BatchNorm, mx.gluon.contrib.nn.SyncBatchNorm)):
+            return module.module
+
+        raise RuntimeError('BN not found')
+
+    def _syncParameters(bn1, bn2, ctx):
+        ctx = input.context
+        bn2.gamma.set_data(bn1.gamma.data(ctx))
+        bn2.beta.set_data(bn1.beta.data(ctx))
+        bn2.running_mean.set_data(bn1.running_mean.data(ctx))
+        bn2.running_var.set_data(bn1.running_var.data(ctx))
+
+    input1 = input.copy()
+    input2 = input.copy()
+
+    if cuda:
+        input1 = input.as_in_context(mx.gpu(0))
+        ctx_list = [mx.gpu(i) for i in range(num_devices)]
+    else:
+        ctx_list = [mx.cpu(0) for _ in range(num_devices)]
+
+    nch = input.shape[1]
+    bn1 = mx.gluon.nn.BatchNorm(in_channels=nch)
+    bn2 = mx.gluon.contrib.nn.SyncBatchNorm(in_channels=nch, num_devices=num_devices)
+
+    bn1.initialize(ctx=ctx_list[0])
+    bn2.initialize(ctx=ctx_list)
+
+    # using the same values for gamma and beta
+    #_syncParameters(_find_bn(bn1), _find_bn(bn2), ctx_list[0])
+
+    input1.attach_grad()
+    inputs2 = split_and_load(input2, ctx_list, batch_axis=0)
+    for xi in inputs2:
+        xi.attach_grad()
+
+    with mx.autograd.record():
+        output1 = bn1(input1)
+        output2 = [bn2(xi) for xi in inputs2]
+        loss1 = (output1 ** 2).sum()
+        loss2 = [(output ** 2).sum() for output in output2]
+        mx.autograd.backward(loss1)
+        mx.autograd.backward(loss2)
+
+    output2 = mx.nd.concat(*[output.as_in_context(input.context) for output in output2], dim=0)
+    # assert forwarding
+    assert_almost_equal(input1.asnumpy(), input2.asnumpy(), atol=1e-3, rtol=1e-3)
+    assert_almost_equal(output1.asnumpy(), output2.asnumpy(), atol=1e-3, rtol=1e-3)
+    assert_almost_equal(_find_bn(bn1).running_mean.data(ctx_list[0]).asnumpy(),
+                        _find_bn(bn2).running_mean.data(ctx_list[0]).asnumpy(),
+                        atol=1e-3, rtol=1e-3)
+    assert_almost_equal(_find_bn(bn1).running_var.data(ctx_list[0]).asnumpy(),
+                        _find_bn(bn2).running_var.data(ctx_list[0]).asnumpy(),
+                        atol=1e-3, rtol=1e-3)
+    input2grad = mx.nd.concat(*[output.grad.as_in_context(input.context) for output in inputs2], dim=0)
+    assert_almost_equal(input1.grad.asnumpy(), input2grad.asnumpy(), atol=1e-3, rtol=1e-3)
+
+
+def test_sync_batchnorm():
+    def get_num_devices():
+        for i in range(100):
+            try:
+                mx.nd.zeros((1,), ctx=mx.gpu(i))
+            except:
+                return i
+    # no need to use SyncBN with 1 gpu
+    if get_num_devices() < 2:
+        return
+    ndev = 2
+    # check with unsync version
+    for i in range(10):
+        _check_batchnorm_result(mx.nd.random.uniform(shape=(4, 1, 4, 4)),
+                                num_devices=ndev, cuda=True)
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule()
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 99d8d09805e2..637130010737 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -36,11 +36,8 @@
 from test_operator import *
 from test_optimizer import *
 from test_random import *
-from test_gluon import *
-from test_loss import *
 from test_exc_handling import *
 #from test_rnn import *
-from test_gluon_rnn import *
 from test_sparse_ndarray import *
 from test_sparse_operator import *
 from test_ndarray import *
@@ -1600,17 +1597,6 @@ def check_rnn_layer_w_rand_inputs(layer):
     for g, c in zip(gs, cs):
         assert_almost_equal(g.asnumpy(), c.asnumpy(), rtol=1e-2, atol=1e-6)
 
-@with_seed()
-@assert_raises_cudnn_disabled()
-def test_rnn_layer():
-    check_rnn_layer(gluon.rnn.RNN(100, num_layers=3))
-    check_rnn_layer(gluon.rnn.RNN(100, activation='tanh', num_layers=3))
-    check_rnn_layer(gluon.rnn.LSTM(100, num_layers=3))
-    check_rnn_layer(gluon.rnn.GRU(100, num_layers=3))
-
-    check_rnn_layer(gluon.rnn.LSTM(100, num_layers=3, bidirectional=True))
-    check_rnn_layer_w_rand_inputs(gluon.rnn.LSTM(100, num_layers=3, bidirectional=True))
-
 @with_seed()
 def test_sequence_reverse():
     check_sequence_reverse(mx.gpu(0))
@@ -1628,28 +1614,6 @@ def test_autograd_save_memory():
     x.backward()
 
 
-@with_seed()
-def test_gluon_ctc_consistency():
-    loss = mx.gluon.loss.CTCLoss()
-    data = mx.nd.arange(0, 4, repeat=40, ctx=mx.gpu(0)).reshape((2,20,4)).flip(axis=0)
-    cpu_label = mx.nd.array([[2,1,-1,-1],[3,2,2,-1]], ctx=mx.cpu(0))
-    gpu_label = mx.nd.array([[2,1,-1,-1],[3,2,2,-1]], ctx=mx.gpu(0))
-
-    cpu_data = data.copy().as_in_context(mx.cpu(0))
-    cpu_data.attach_grad()
-    with mx.autograd.record():
-        l_cpu = loss(cpu_data, cpu_label)
-        l_cpu.backward()
-
-    gpu_data = data.copyto(mx.gpu(0))
-    gpu_data.attach_grad()
-    with mx.autograd.record():
-        l_gpu = loss(gpu_data, gpu_label)
-        l_gpu.backward()
-
-    assert_almost_equal(cpu_data.grad.asnumpy(), gpu_data.grad.asnumpy(), atol=1e-3, rtol=1e-3)
-
-
 @with_seed()
 def test_cuda_rtc():
     source = r'''
@@ -1680,16 +1644,6 @@ def test_cuda_rtc():
     assert (y.asnumpy() == 12).all()
 
 
-@with_seed()
-def test_global_norm_clip_multi_device():
-    x1 = mx.nd.ones((3,3), ctx=mx.gpu(0))
-    x2 = mx.nd.ones((4,4), ctx=mx.cpu(0))
-    norm = gluon.utils.clip_global_norm([x1, x2], 1.0)
-    assert norm == 5.0
-    assert_almost_equal(x1.asnumpy(), np.ones((3,3))/5)
-    assert_almost_equal(x2.asnumpy(), np.ones((4,4))/5)
-
-
 @with_seed()
 def test_cross_device_autograd():
     x = mx.nd.random.uniform(shape=(10,))
@@ -1844,10 +1798,14 @@ def test_kernel_error_checking():
             p.join()
             assert p.exitcode != 0,\
                 "Expected a synchronous kernel error from %s(), none seen." % f.__name__
+    finally:
+        mx.nd.waitall()
 
 def test_incorrect_gpu():
     # Try setting dev_id to a really big number
-    assert_raises(MXNetError, mx.nd.ones, (2,2), ctx=mx.gpu(100001))
+    def check_throw_error(shape, ctx):
+        mx.nd.ones(shape, ctx=ctx).wait_to_read()
+    assert_raises(MXNetError, check_throw_error, shape=(2,2), ctx=mx.gpu(100001))
 
 @with_seed()
 def test_batchnorm_backwards_notrain():
@@ -1908,84 +1866,6 @@ def test_context_num_gpus():
     # Test that num_gpus reports at least one GPU, as the test is run on a GPU host.
     assert mx.context.num_gpus() > 0
 
-def _check_batchnorm_result(input, num_devices=1, cuda=False):
-    from mxnet.gluon.utils import split_and_load
-    def _find_bn(module):
-        if isinstance(module, (mx.gluon.nn.BatchNorm, mx.gluon.contrib.nn.SyncBatchNorm)):
-            return module
-        elif isinstance(module.module, (mx.gluon.nn.BatchNorm, mx.gluon.contrib.nn.SyncBatchNorm)):
-            return module.module
-
-        raise RuntimeError('BN not found')
-
-    def _syncParameters(bn1, bn2, ctx):
-        ctx = input.context
-        bn2.gamma.set_data(bn1.gamma.data(ctx))
-        bn2.beta.set_data(bn1.beta.data(ctx))
-        bn2.running_mean.set_data(bn1.running_mean.data(ctx))
-        bn2.running_var.set_data(bn1.running_var.data(ctx))
-
-    input1 = input.copy()
-    input2 = input.copy()
-
-    if cuda:
-        input1 = input.as_in_context(mx.gpu(0))
-        ctx_list = [mx.gpu(i) for i in range(num_devices)]
-    else:
-        ctx_list = [mx.cpu(0) for _ in range(num_devices)]
-
-    nch = input.shape[1]
-    bn1 = mx.gluon.nn.BatchNorm(in_channels=nch)
-    bn2 = mx.gluon.contrib.nn.SyncBatchNorm(in_channels=nch, num_devices=num_devices)
-
-    bn1.initialize(ctx=ctx_list[0])
-    bn2.initialize(ctx=ctx_list)
-
-    # using the same values for gamma and beta
-    #_syncParameters(_find_bn(bn1), _find_bn(bn2), ctx_list[0])
-
-    input1.attach_grad()
-    inputs2 = split_and_load(input2, ctx_list, batch_axis=0)
-    for xi in inputs2:
-        xi.attach_grad()
-
-    with mx.autograd.record():
-        output1 = bn1(input1)
-        output2 = [bn2(xi) for xi in inputs2]
-        loss1 = (output1 ** 2).sum()
-        loss2 = [(output ** 2).sum() for output in output2]
-        mx.autograd.backward(loss1)
-        mx.autograd.backward(loss2)
-
-    output2 = mx.nd.concat(*[output.as_in_context(input.context) for output in output2], dim=0)
-    # assert forwarding
-    assert_almost_equal(input1.asnumpy(), input2.asnumpy(), atol=1e-3, rtol=1e-3)
-    assert_almost_equal(output1.asnumpy(), output2.asnumpy(), atol=1e-3, rtol=1e-3)
-    assert_almost_equal(_find_bn(bn1).running_mean.data(ctx_list[0]).asnumpy(),
-                        _find_bn(bn2).running_mean.data(ctx_list[0]).asnumpy(),
-                        atol=1e-3, rtol=1e-3)
-    assert_almost_equal(_find_bn(bn1).running_var.data(ctx_list[0]).asnumpy(),
-                        _find_bn(bn2).running_var.data(ctx_list[0]).asnumpy(),
-                        atol=1e-3, rtol=1e-3)
-    input2grad = mx.nd.concat(*[output.grad.as_in_context(input.context) for output in inputs2], dim=0)
-    assert_almost_equal(input1.grad.asnumpy(), input2grad.asnumpy(), atol=1e-3, rtol=1e-3)
-
-def test_sync_batchnorm():
-    def get_num_devices():
-        for i in range(100):
-            try:
-                mx.nd.zeros((1,), ctx=mx.gpu(i))
-            except:
-                return i
-    # no need to use SyncBN with 1 gpu
-    if get_num_devices() < 2:
-        return
-    ndev = 2
-    # check with unsync version
-    for i in range(10):
-        _check_batchnorm_result(mx.nd.random.uniform(shape=(4, 1, 4, 4)),
-                                num_devices=ndev, cuda=True)
-
 if __name__ == '__main__':
     import nose
     nose.runmodule()
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index fa5de0c68c7d..938cffefab5d 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -1943,11 +1943,11 @@ def test_bxor(a, b):
     test_bmul(a, b)
     test_bdiv(a, b)
     '''
-    Flaky Test Disabled due to master build failure: 
-    http://jenkins.mxnet-ci.amazon-ml.com/blue/organizations/jenkins/incubator-mxnet/detail/master/1248/pipeline 
+    Flaky Test Disabled due to master build failure:
+    http://jenkins.mxnet-ci.amazon-ml.com/blue/organizations/jenkins/incubator-mxnet/detail/master/1248/pipeline
     Github Issue: https://github.com/apache/incubator-mxnet/issues/11838
-    
-    test_bmod(a, b) 
+
+    test_bmod(a, b)
     '''
     test_bmod_int(a, b)
     test_bpow(a, b)
@@ -2065,6 +2065,23 @@ def test_reshape_new(src_shape, shape_args, reverse, dst_shape):
         assert np.square(exe.grad_dict['data'].asnumpy() - grad_npy.reshape(src_shape)).mean() < 1E-7, \
             'Src Shape = %s, Shape Arguments = %s, Reverse = %s, Dst Shape = %s'\
             %(str(src_shape), str(shape_args), str(reverse), str(dst_shape))
+
+        for i in range(len(src_shape)):
+            holdout_src_shape = list(src_shape)
+            holdout_src_shape[i] = 0
+            holdout_src_shape = tuple(holdout_src_shape)
+            net = mx.sym.Variable('data')
+            net = mx.sym.elemwise_add(net.reshape(shape_args, reverse=reverse), mx.sym.ones(shape=dst_shape))
+            input_shape, output_shape, __ = net.infer_shape(data=holdout_src_shape)
+            assert output_shape[0] == dst_shape, \
+                'Holdout Src Shape = %s, Shape Arguments = %s, Reverse = %s, Dst Shape = %s, ' \
+                'Output Shape = %s' %(str(holdout_src_shape), str(shape_args), str(reverse),
+                                      str(dst_shape), str(output_shape[0]))
+            assert input_shape[0] == src_shape, \
+                'Holdout Src Shape = %s, Shape Arguments = %s, Reverse = %s, Dst Shape = %s, ' \
+                'Output Shape = %s' %(str(holdout_src_shape), str(shape_args), str(reverse),
+                                      str(dst_shape), str(output_shape[0]))
+
     # Test new api (Using shape)
     test_cases = [
         [(2, 3, 5, 5),  (0, -1),          False, (2, 75)],
@@ -6614,7 +6631,7 @@ def test_diag():
     w = np.random.randint(2,9)
     a_np = np.random.random((h, w)).astype(np.float32)
     a = mx.nd.array(a_np).astype('float32')
-    
+
     # k == 0
     r = mx.nd.diag(a)
     assert_almost_equal(r.asnumpy(), np.diag(a_np))
@@ -6657,7 +6674,7 @@ def test_diag():
     d = np.random.randint(2,9)
    a_np = np.random.random((d))
     a = mx.nd.array(a_np)
-    
+
     # k is random
     k = np.random.randint(-d,d)
     r = mx.nd.diag(a, k=k)
@@ -6724,7 +6741,7 @@ def test_invalid_block_size():
         invalid_shape_inp = (n , c, h, w)
         data = rand_ndarray(invalid_shape_inp, 'default')
         assertRaises(MXNetError, mx.nd.depth_to_space, data, block)
-    
+
     test_invalid_depth_dim()
     test_invalid_space_dim()
     test_invalid_block_size()
@@ -6770,12 +6787,12 @@ def test_invalid_block_size():
         invalid_shape_inp = (n, c, h, w)
         data = rand_ndarray(invalid_shape_inp, 'default')
         assertRaises(MXNetError, mx.nd.space_to_depth, data, block)
-    
+
     def test_invalid_depth_dim():
         invalid_shape_inp = (n, 0, h, w)
         data = rand_ndarray(invalid_shape_inp, 'default')
        assertRaises(MXNetError, mx.nd.space_to_depth, data, block)
-    
+
     test_invalid_space_dim()
     test_invalid_block_size()
     test_invalid_depth_dim()
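
Note: as a standalone illustration (not part of the patch), the holdout check added to test_reshape_new exercises the reverse shape inference enabled by the matrix_op-inl.h change above. The sketch below uses the first shape case from the new test list ((2, 3, 5, 5) reshaped with (0, -1) to (2, 75)) and assumes an MXNet build that includes this change:

    import mxnet as mx

    # Withhold one input dimension (0 == unknown) and let shape inference
    # recover it from the known output shape through Reshape.
    src_shape = (2, 3, 5, 5)
    holdout_src_shape = (0, 3, 5, 5)
    dst_shape = (2, 75)

    net = mx.sym.Variable('data')
    # Adding ones of the target shape pins the output shape, so inference
    # must propagate it backwards through reshape((0, -1)) to fill the 0 dim.
    net = mx.sym.elemwise_add(net.reshape((0, -1)), mx.sym.ones(shape=dst_shape))
    input_shape, output_shape, _ = net.infer_shape(data=holdout_src_shape)

    assert output_shape[0] == dst_shape   # (2, 75)
    assert input_shape[0] == src_shape    # (2, 3, 5, 5), withheld dim recovered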