[CI] run operator tests with naive engine (apache#18252)
* run operator tests with naive engine

* fix take tests

* update skip mark

* fix cuda error reset

* adjust tests

* disable parallel testing and naive engine for mkl/mkldnn apache#18244
szha authored and AntiZpvoh committed Jul 6, 2020
1 parent 1b73b11 commit ba0a265
Showing 45 changed files with 496 additions and 448 deletions.
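The heart of the change: CI now runs the operator tests (selected with pytest's `-k 'test_operator'`) under `MXNET_ENGINE_TYPE=NaiveEngine`, which executes every operation synchronously on the calling thread, so a CUDA failure surfaces at the op that caused it rather than at a later, unrelated check. Other tests keep the default asynchronous engine and 4-way parallelism (`-n 4`). Below is a minimal, standalone C++ sketch of the environment-variable dispatch — illustrative only; the real factory lives in src/engine/engine.cc:

```cpp
#include <cstdlib>
#include <iostream>
#include <string>

int main() {
  // MXNet reads MXNET_ENGINE_TYPE at startup; unset means the default
  // threaded engine (ThreadedEnginePerDevice).
  const char* type = std::getenv("MXNET_ENGINE_TYPE");
  const std::string engine = type ? type : "ThreadedEnginePerDevice";
  if (engine == "NaiveEngine") {
    // Synchronous: each op completes (and reports errors) before the next.
    std::cout << "naive engine: errors surface at the failing op\n";
  } else {
    // Asynchronous: ops are scheduled by dependency; an error can be
    // observed long after the op that triggered it.
    std::cout << engine << ": errors may surface later\n";
  }
  return 0;
}
```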
4 changes: 3 additions & 1 deletion .github/workflows/os_x_staticbuild.yml
@@ -22,4 +22,6 @@ jobs:
python3 -m pip install --user -e python
- name: Test project
run: |
python3 -m pytest --durations=50 --verbose tests/python/unittest/ -k 'not (test_subgraph or test_custom_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)'
python3 -m pytest -n 4 --durations=50 --verbose tests/python/unittest/ -k 'not test_operator and not (test_subgraph or test_custom_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'not serial'
MXNET_ENGINE_TYPE=NaiveEngine python3 -m pytest -n 4 --durations=50 --verbose tests/python/unittest/ -k 'test_operator and not (test_subgraph or test_custom_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'not serial'
python3 -m pytest --durations=50 --verbose tests/python/unittest/ -k 'not (test_subgraph or test_custom_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'serial'
2 changes: 1 addition & 1 deletion 3rdparty/mshadow/mshadow/base.h
100755 → 100644
@@ -251,7 +251,7 @@ extern "C" {
if (e == cudaErrorCudartUnloading) { \
throw dmlc::Error(cudaGetErrorString(e)); \
} \
CHECK(e == cudaSuccess) \
CHECK_EQ(e, cudaSuccess) \
<< "CUDA: " << cudaGetErrorString(e); \
}

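Aside on the `CHECK` → `CHECK_EQ` swap above: glog/dmlc-style `CHECK_EQ` captures both operands, so the failure log can show the numeric error code instead of only reporting that a boolean was false. A rough standalone sketch of the mechanism (not dmlc-core's actual macro):

```cpp
#include <iostream>
#include <sstream>
#include <stdexcept>

// Simplified stand-in: CHECK(e == cudaSuccess) can only say the comparison
// was false; CHECK_EQ(e, cudaSuccess) records both operands and can print
// "(77 vs. 0)", identifying the actual error value.
#define SKETCH_CHECK_EQ(a, b)                                   \
  do {                                                          \
    if ((a) != (b)) {                                           \
      std::ostringstream oss;                                   \
      oss << "Check failed: " #a " == " #b " ("                 \
          << (a) << " vs. " << (b) << ")";                      \
      throw std::runtime_error(oss.str());                      \
    }                                                           \
  } while (0)

int main() {
  int e = 77;  // stand-in for a nonzero cudaError_t
  try {
    SKETCH_CHECK_EQ(e, 0);
  } catch (const std::exception& ex) {
    std::cout << ex.what() << "\n";  // Check failed: e == 0 (77 vs. 0)
  }
  return 0;
}
```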
2 changes: 1 addition & 1 deletion 3rdparty/mshadow/mshadow/cuda/tensor_gpu-inl.cuh
100755 → 100644
@@ -35,7 +35,7 @@
#define MSHADOW_CUDA_POST_KERNEL_CHECK(x) \
/* Code block avoids redefinition of cudaError_t err */ \
do { \
cudaError err = cudaPeekAtLastError(); \
cudaError err = cudaGetLastError(); \
CHECK_EQ(err, cudaSuccess) << "Name: " << #x << " ErrStr:" << cudaGetErrorString(err); \
} while (0)
namespace mshadow {
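The substitution above recurs throughout this commit: `cudaPeekAtLastError()` returns the pending error but leaves it set, whereas `cudaGetLastError()` returns it and also resets the per-thread error state to `cudaSuccess` (for non-sticky errors such as a bad launch configuration). Under the synchronous naive engine the reset matters: a single failed launch would otherwise be re-reported by every subsequent check. A small demo, compilable with nvcc:

```cuda
#include <cstdio>
#include <cuda_runtime.h>

__global__ void noop() {}

int main() {
  // Force a launch error: 0 threads per block is an invalid configuration.
  noop<<<1, 0>>>();

  // Peek reports the error but leaves it set...
  printf("peek : %s\n", cudaGetErrorString(cudaPeekAtLastError()));
  // ...so the next check would see it again. Get both reports and clears
  // it (for non-sticky errors such as this one).
  printf("get  : %s\n", cudaGetErrorString(cudaGetLastError()));
  // The error state is now clean again.
  printf("after: %s\n", cudaGetErrorString(cudaGetLastError()));
  return 0;
}
```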
Mode changed 100755 → 100644 (no content changes): 3rdparty/mshadow/mshadow/half2.h
Mode changed 100755 → 100644 (no content changes): 3rdparty/mshadow/mshadow/tensor.h
Mode changed 100755 → 100644 (no content changes): 3rdparty/mshadow/mshadow/tensor_cpu-inl.h
Mode changed 100755 → 100644 (no content changes): 3rdparty/mshadow/mshadow/tensor_gpu-inl.h
File renamed without changes.
81 changes: 59 additions & 22 deletions ci/docker/runtime_functions.sh
@@ -944,11 +944,12 @@ cd_unittest_ubuntu() {
export MXNET_ENABLE_CYTHON=0
export CD_JOB=1 # signal this is a CD run so any unecessary tests can be skipped
export DMLC_LOG_STACK_TRACE_DEPTH=10
export MXNET_GPU_MEM_POOL_TYPE=Unpooled

local mxnet_variant=${1:?"This function requires a mxnet variant as the first argument"}

pytest -m 'not serial' -n 4 --durations=50 --verbose tests/python/unittest
pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --verbose tests/python/unittest
MXNET_ENGINE_TYPE=NaiveEngine \
pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --verbose tests/python/unittest
pytest -m 'serial' --durations=50 --verbose tests/python/unittest
pytest -n 4 --durations=50 --verbose tests/python/quantization

@@ -958,11 +959,16 @@ cd_unittest_ubuntu() {
# fi

if [[ ${mxnet_variant} = cu* ]]; then
pytest -m 'not serial' -n 4 --durations=50 --verbose tests/python/gpu
MXNET_GPU_MEM_POOL_TYPE=Unpooled \
MXNET_ENGINE_TYPE=NaiveEngine \
pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --verbose tests/python/gpu
MXNET_GPU_MEM_POOL_TYPE=Unpooled \
pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --verbose tests/python/gpu
pytest -m 'serial' --durations=50 --verbose tests/python/gpu

# Adding these here as CI doesn't test all CUDA environments
pytest -n 4 example/image-classification/test_score.py
MXNET_GPU_MEM_POOL_TYPE=Unpooled \
pytest -n 4 example/image-classification/test_score.py
# TODO(szha): fix and reenable the hanging issue. tracked in #18098
# integrationtest_ubuntu_gpu_dist_kvstore
fi
@@ -980,11 +986,26 @@ unittest_ubuntu_python3_cpu() {
export MXNET_SUBGRAPH_VERBOSE=0
export MXNET_ENABLE_CYTHON=0
export DMLC_LOG_STACK_TRACE_DEPTH=10
pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest
pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest
MXNET_ENGINE_TYPE=NaiveEngine \
pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest
pytest -m 'serial' --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest
pytest -n 4 --durations=50 --cov-report xml:tests_quantization.xml --verbose tests/python/quantization
}

unittest_ubuntu_python3_cpu_serial() {
# TODO(szha): delete this and switch to unittest_ubuntu_python3_cpu once #18244 is fixed
set -ex
export PYTHONPATH=./python/
export MXNET_MKLDNN_DEBUG=0 # Ignored if not present
export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
export MXNET_SUBGRAPH_VERBOSE=0
export MXNET_ENABLE_CYTHON=0
export DMLC_LOG_STACK_TRACE_DEPTH=10
pytest --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest
pytest --durations=50 --cov-report xml:tests_quantization.xml --verbose tests/python/quantization
}

unittest_ubuntu_python3_cpu_mkldnn() {
set -ex
export PYTHONPATH=./python/
Expand All @@ -993,9 +1014,9 @@ unittest_ubuntu_python3_cpu_mkldnn() {
export MXNET_SUBGRAPH_VERBOSE=0
export MXNET_ENABLE_CYTHON=0
export DMLC_LOG_STACK_TRACE_DEPTH=10
pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest
pytest -m 'serial' --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest
pytest -n 4 --durations=50 --cov-report xml:tests_mkl.xml --verbose tests/python/mkl
# TODO(szha): enable parallel testing and naive engine for ops once #18244 is fixed
pytest --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest
pytest --durations=50 --cov-report xml:tests_mkl.xml --verbose tests/python/mkl
}

unittest_ubuntu_python3_gpu() {
@@ -1007,8 +1028,11 @@
export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3}
export MXNET_ENABLE_CYTHON=0
export DMLC_LOG_STACK_TRACE_DEPTH=10
export MXNET_GPU_MEM_POOL_TYPE=Unpooled
pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu
MXNET_GPU_MEM_POOL_TYPE=Unpooled \
pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu
MXNET_GPU_MEM_POOL_TYPE=Unpooled \
MXNET_ENGINE_TYPE=NaiveEngine \
pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu
pytest -m 'serial' --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu
}

@@ -1022,9 +1046,12 @@ unittest_ubuntu_python3_gpu_cython() {
export MXNET_ENABLE_CYTHON=1
export MXNET_ENFORCE_CYTHON=1
export DMLC_LOG_STACK_TRACE_DEPTH=10
export MXNET_GPU_MEM_POOL_TYPE=Unpooled
check_cython
pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu
MXNET_GPU_MEM_POOL_TYPE=Unpooled \
pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu
MXNET_GPU_MEM_POOL_TYPE=Unpooled \
MXNET_ENGINE_TYPE=NaiveEngine \
pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu
pytest -m 'serial' --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu
}

@@ -1036,8 +1063,11 @@ unittest_ubuntu_python3_gpu_nocudnn() {
export CUDNN_OFF_TEST_ONLY=true
export MXNET_ENABLE_CYTHON=0
export DMLC_LOG_STACK_TRACE_DEPTH=10
export MXNET_GPU_MEM_POOL_TYPE=Unpooled
pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu
MXNET_GPU_MEM_POOL_TYPE=Unpooled \
pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu
MXNET_GPU_MEM_POOL_TYPE=Unpooled \
MXNET_ENGINE_TYPE=NaiveEngine \
pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu
pytest -m 'serial' --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu
}

@@ -1050,9 +1080,9 @@ unittest_ubuntu_tensorrt_gpu() {
export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3}
export MXNET_ENABLE_CYTHON=0
export DMLC_LOG_STACK_TRACE_DEPTH=10
export MXNET_GPU_MEM_POOL_TYPE=Unpooled
python3 tests/python/tensorrt/lenet5_train.py
pytest -n 4 --durations=50 --cov-report xml:tests_trt_gpu.xml --verbose --capture=no tests/python/tensorrt/test_ops.py
MXNET_GPU_MEM_POOL_TYPE=Unpooled \
pytest -n 4 --durations=50 --cov-report xml:tests_trt_gpu.xml --verbose --capture=no tests/python/tensorrt/test_ops.py
pytest -k 'not test_ops' --durations=50 --cov-report xml:tests_trt_gpu.xml --cov-append --verbose --capture=no tests/python/tensorrt/
}

@@ -1070,8 +1100,8 @@ unittest_ubuntu_python3_quantization_gpu() {
export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3}
export MXNET_ENABLE_CYTHON=0
export DMLC_LOG_STACK_TRACE_DEPTH=10
export MXNET_GPU_MEM_POOL_TYPE=Unpooled
pytest -n 4 --durations=50 --cov-report xml:tests_quantization_gpu.xml --verbose tests/python/quantization_gpu
MXNET_GPU_MEM_POOL_TYPE=Unpooled \
pytest -n 4 --durations=50 --cov-report xml:tests_quantization_gpu.xml --verbose tests/python/quantization_gpu
}

unittest_centos7_cpu_scala() {
@@ -1213,7 +1243,9 @@ unittest_centos7_cpu() {
set -ex
source /opt/rh/rh-python36/enable
cd /work/mxnet
python -m pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest
python -m pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest
MXNET_ENGINE_TYPE=NaiveEngine \
python -m pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest
python -m pytest -m 'serial' --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest
python -m pytest -n 4 --durations=50 --cov-report xml:tests_train.xml --verbose tests/python/train
}
@@ -1224,8 +1256,11 @@ unittest_centos7_gpu() {
cd /work/mxnet
export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3}
export DMLC_LOG_STACK_TRACE_DEPTH=10
export MXNET_GPU_MEM_POOL_TYPE=Unpooled
pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu
MXNET_GPU_MEM_POOL_TYPE=Unpooled \
pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu
MXNET_GPU_MEM_POOL_TYPE=Unpooled \
MXNET_ENGINE_TYPE=NaiveEngine \
pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu
pytest -m 'serial' --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu
}

@@ -1342,7 +1377,9 @@ test_ubuntu_cpu_python3() {
cd /work/mxnet/python
pip3 install -e .
cd /work/mxnet
python3 -m pytest -m 'not serial' -n 4 --durations=50 --verbose tests/python/unittest
python3 -m pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --verbose tests/python/unittest
MXNET_ENGINE_TYPE=NaiveEngine \
python3 -m pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --verbose tests/python/unittest
python3 -m pytest -m 'serial' --durations=50 --verbose tests/python/unittest

popd
8 changes: 7 additions & 1 deletion ci/jenkins/Jenkins_steps.groovy
@@ -50,6 +50,12 @@ def python3_ut(docker_container_name) {
}
}

def python3_ut_serial(docker_container_name) {
timeout(time: max_time, unit: 'MINUTES') {
utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu_serial', false)
}
}

def python3_ut_mkldnn(docker_container_name) {
timeout(time: max_time, unit: 'MINUTES') {
utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu_mkldnn', false)
@@ -803,7 +809,7 @@ def test_unix_python3_mkl_cpu(lib_name) {
ws('workspace/ut-python3-cpu') {
try {
utils.unpack_and_init(lib_name, mx_lib, true)
python3_ut('ubuntu_cpu')
python3_ut_serial('ubuntu_cpu')
utils.publish_test_coverage()
} finally {
utils.collect_test_results_unix('tests_unittest.xml', 'tests_python3_cpu_unittest.xml')
1 change: 1 addition & 0 deletions src/engine/naive_engine.cc
@@ -183,6 +183,7 @@ class NaiveEngine final : public Engine {
if (exec_ctx.dev_mask() == gpu::kDevMask) {
#if MXNET_USE_CUDA
size_t dev_id = static_cast<size_t>(exec_ctx.dev_id);
cudaGetLastError(); // reset cuda error
MSHADOW_CATCH_ERROR(mshadow::SetDevice<gpu>(exec_ctx.dev_id));
if (streams_.size() <= dev_id) {
streams_.resize(dev_id + 1, nullptr);
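The added `cudaGetLastError()` guards against misattribution: operator tests that deliberately provoke CUDA errors can leave a stale error in the thread-local state, and with the naive engine the very next op's checks would pick it up. Resetting first means each op only answers for its own failures. A sketch of the failure mode and the fix, compilable with nvcc:

```cuda
#include <cstdio>
#include <cuda_runtime.h>

__global__ void noop() {}

int main() {
  noop<<<1, 0>>>();      // op A fails (invalid configuration); nobody checks

  cudaGetLastError();    // the commit's fix: discard op A's stale error
  noop<<<1, 1>>>();      // op B is fine
  cudaError_t e = cudaGetLastError();           // op B's post-kernel check
  printf("op B: %s\n", cudaGetErrorString(e));  // "no error" — not blamed for A
  return 0;
}
```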
4 changes: 2 additions & 2 deletions src/operator/bilinear_sampler.cu
@@ -178,7 +178,7 @@ inline void BilinearSamplerForward(const Tensor<gpu, 4, DType> &output,
cuda::BilinearSamplerForwardKernel<DType> << <num_blocks, threads_per_block, 0, stream >> >(
i_c, i_h, i_w, data, grid, o_n, o_c, o_h, o_w, out);
// post kernel check
cudaError err = cudaPeekAtLastError();
cudaError err = cudaGetLastError();
CHECK_EQ(err, cudaSuccess) << cudaGetErrorString(err);
}

@@ -217,7 +217,7 @@ inline void BilinearSamplerBackward(const Tensor<gpu, 4, DType> &input_grad,
});
});
// post kernel check
cudaError err = cudaPeekAtLastError();
cudaError err = cudaGetLastError();
CHECK_EQ(err, cudaSuccess) << cudaGetErrorString(err);
}

4 changes: 2 additions & 2 deletions src/operator/contrib/deformable_psroi_pooling.cu
@@ -183,7 +183,7 @@ namespace cuda {
no_trans, trans_std, sample_per_part, output_dim,
group_size, part_size, num_classes,
channels_each_class, top_data, top_count_data);
DeformablePSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError());
DeformablePSROIPOOLING_CUDA_CHECK(cudaGetLastError());
}
@@ -354,7 +354,7 @@ namespace cuda {
bottom_data, bottom_rois, bottom_trans,
no_trans, trans_std, sample_per_part, group_size,
part_size, num_classes, channels_each_class);
DeformablePSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError());
DeformablePSROIPOOLING_CUDA_CHECK(cudaGetLastError());
}
} // namespace cuda
16 changes: 8 additions & 8 deletions src/operator/contrib/multi_proposal.cu
@@ -348,7 +348,7 @@ void _nms(mshadow::Stream<gpu> *s,
nms_overlap_thresh,
boxes_dev,
mask_dev);
FRCNN_CUDA_CHECK(cudaPeekAtLastError());
FRCNN_CUDA_CHECK(cudaGetLastError());
std::vector<uint64_t> mask_host(boxes_num * col_blocks);

cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s);
@@ -494,7 +494,7 @@ class MultiProposalGPUOp : public Operator{
ProposalGridKernel<<<dimGrid, dimBlock>>>(
count, num_anchors, height, width, param_.feature_stride,
scores.dptr_, workspace_proposals.dptr_);
FRCNN_CUDA_CHECK(cudaPeekAtLastError());
FRCNN_CUDA_CHECK(cudaGetLastError());

// Transform anchors and bbox_deltas into bboxes
CheckLaunchParam(dimGrid, dimBlock, "BBoxPred");
@@ -507,13 +507,13 @@ class MultiProposalGPUOp : public Operator{
count, num_anchors, height, width, param_.feature_stride, im_info.dptr_,
workspace_proposals.dptr_, bbox_deltas.dptr_, workspace_proposals.dptr_);
}
FRCNN_CUDA_CHECK(cudaPeekAtLastError());
FRCNN_CUDA_CHECK(cudaGetLastError());

// filter boxes with less than rpn_min_size
CheckLaunchParam(dimGrid, dimBlock, "FilterBox");
FilterBoxKernel<<<dimGrid, dimBlock>>>(
count, count_anchors, param_.rpn_min_size, im_info.dptr_, workspace_proposals.dptr_);
FRCNN_CUDA_CHECK(cudaPeekAtLastError());
FRCNN_CUDA_CHECK(cudaGetLastError());



@@ -541,15 +541,15 @@ class MultiProposalGPUOp : public Operator{
CopyScoreKernel << <dimGrid, dimBlock >> >(
count_anchors, workspace_proposals.dptr_ + b * count_anchors * 5,
score.dptr_, order.dptr_);
FRCNN_CUDA_CHECK(cudaPeekAtLastError());
FRCNN_CUDA_CHECK(cudaGetLastError());

// argsort score, save order
thrust::stable_sort_by_key(thrust::device,
score.dptr_,
score.dptr_ + score.size(0),
order.dptr_,
thrust::greater<real_t>());
FRCNN_CUDA_CHECK(cudaPeekAtLastError());
FRCNN_CUDA_CHECK(cudaGetLastError());

// Reorder proposals according to order

@@ -558,7 +558,7 @@ class MultiProposalGPUOp : public Operator{
ReorderProposalsKernel << <dimGrid, dimBlock >> >(
rpn_pre_nms_top_n, workspace_proposals.dptr_ + b * count_anchors * 5,
order.dptr_, workspace_ordered_proposals.dptr_);
FRCNN_CUDA_CHECK(cudaPeekAtLastError());
FRCNN_CUDA_CHECK(cudaGetLastError());

// perform nms
std::vector<int> _keep(workspace_ordered_proposals.size(0));
Expand All @@ -580,7 +580,7 @@ class MultiProposalGPUOp : public Operator{
param_.rpn_post_nms_top_n, workspace_ordered_proposals.dptr_, keep, out_size, b,
out.dptr_ + b * param_.rpn_post_nms_top_n * 5,
out_score.dptr_ + b * param_.rpn_post_nms_top_n);
FRCNN_CUDA_CHECK(cudaPeekAtLastError());
FRCNN_CUDA_CHECK(cudaGetLastError());
}
// free temporary memory
FRCNN_CUDA_CHECK(cudaFree(keep));
2 changes: 1 addition & 1 deletion src/operator/contrib/multibox_detection.cu
@@ -230,7 +230,7 @@ inline void MultiBoxDetectionForward(const Tensor<gpu, 3, DType> &out,
num_classes, num_anchors, threshold, clip,
variances[0], variances[1], variances[2], variances[3],
nms_threshold, force_suppress, nms_topk);
MULTIBOX_DETECTION_CUDA_CHECK(cudaPeekAtLastError());
MULTIBOX_DETECTION_CUDA_CHECK(cudaGetLastError());
}
} // namespace mshadow

4 changes: 2 additions & 2 deletions src/operator/contrib/multibox_prior.cu
@@ -90,7 +90,7 @@ inline void MultiBoxPriorForward(const Tensor<gpu, 2, DType> &out,
sizes[i], ratio, in_width, in_height, step_x, step_y, offset_y, offset_x, stride, offset);
++offset;
}
MULTIBOXPRIOR_CUDA_CHECK(cudaPeekAtLastError());
MULTIBOXPRIOR_CUDA_CHECK(cudaGetLastError());

// size = sizes[0], various ratios
for (int j = 1; j < num_ratios; ++j) {
@@ -99,7 +99,7 @@ inline void MultiBoxPriorForward(const Tensor<gpu, 2, DType> &out,
offset_y, offset_x, stride, offset);
++offset;
}
MULTIBOXPRIOR_CUDA_CHECK(cudaPeekAtLastError());
MULTIBOXPRIOR_CUDA_CHECK(cudaGetLastError());
}
} // namespace mshadow

(diff truncated; the remaining changed files are not shown)
