From f0d566f78a21528b9a660549fb35cf09be2085e3 Mon Sep 17 00:00:00 2001 From: Minghao Liu Date: Wed, 4 Dec 2019 08:37:25 +0000 Subject: [PATCH] retrigger --- src/operator/numpy/np_bincount_op.cu | 38 ++++++++++++---------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/src/operator/numpy/np_bincount_op.cu b/src/operator/numpy/np_bincount_op.cu index 14f9c09664a3..e37e9af435b5 100644 --- a/src/operator/numpy/np_bincount_op.cu +++ b/src/operator/numpy/np_bincount_op.cu @@ -78,9 +78,9 @@ void NumpyBincountForwardImpl(const OpContext &ctx, mshadow::Stream *s = ctx.get_stream(); MXNET_NO_FLOAT16_TYPE_SWITCH(data.dtype(), DType, { - DType* d_bin; - DType bin; + DType* h_ptr; DType* d_ptr; + int bin = minlength; d_ptr = data.data().dptr(); Tensor workspace = ctx.requested[0] .get_space_typed(Shape1(1), s); @@ -88,17 +88,14 @@ void NumpyBincountForwardImpl(const OpContext &ctx, bool is_valid = CheckInvalidInput(s, d_ptr, data_n, is_valid_ptr); CHECK(is_valid) << "Input should be nonnegative number"; // check invalid input - Tensor workspace1 = ctx.requested[0] - .get_space_typed(Shape1(1), s); - d_bin = reinterpret_cast(workspace1.dptr_); - thrust::device_ptr dptr_s = thrust::device_pointer_cast(d_ptr); - thrust::device_ptr dptr_e = thrust::device_pointer_cast(d_ptr + data_n); - d_bin = thrust::raw_pointer_cast(thrust::max_element(dptr_s, dptr_e)); - CUDA_CALL(cudaMemcpyAsync(&bin, d_bin, sizeof(DType), cudaMemcpyDeviceToHost, + h_ptr = reinterpret_cast(malloc(data_n*sizeof(DType))); + CUDA_CALL(cudaMemcpyAsync(h_ptr, d_ptr, data_n*sizeof(DType), cudaMemcpyDeviceToHost, mshadow::Stream::GetStream(s))); CUDA_CALL(cudaStreamSynchronize(mshadow::Stream::GetStream(s))); - - bin = std::max(static_cast(bin+1), minlength); + for (size_t i = 0; i < data_n; i++) { + if (h_ptr[i] + 1 > bin) bin = h_ptr[i] + 1; + } + free(h_ptr); mxnet::TShape s(1, bin); const_cast(out).Init(s); // set the output shape forcefully }); @@ -124,9 +121,9 @@ void NumpyBincountForwardImpl(const OpContext &ctx, mshadow::Stream *s = ctx.get_stream(); MXNET_NO_FLOAT16_TYPE_SWITCH(data.dtype(), DType, { - DType* d_bin; - DType bin; + DType* h_ptr; DType* d_ptr; + int bin = minlength; d_ptr = data.data().dptr(); Tensor workspace = ctx.requested[0] .get_space_typed(Shape1(1), s); @@ -134,17 +131,14 @@ void NumpyBincountForwardImpl(const OpContext &ctx, bool is_valid = CheckInvalidInput(s, d_ptr, data_n, is_valid_ptr); CHECK(is_valid) << "Input should be nonnegative number"; // check invalid input - Tensor workspace1 = ctx.requested[0] - .get_space_typed(Shape1(1), s); - d_bin = reinterpret_cast(workspace1.dptr_); - thrust::device_ptr dptr_s = thrust::device_pointer_cast(d_ptr); - thrust::device_ptr dptr_e = thrust::device_pointer_cast(d_ptr + data_n); - d_bin = thrust::raw_pointer_cast(thrust::max_element(dptr_s, dptr_e)); - CUDA_CALL(cudaMemcpyAsync(&bin, d_bin, sizeof(DType), cudaMemcpyDeviceToHost, + h_ptr = reinterpret_cast(malloc(data_n*sizeof(DType))); + CUDA_CALL(cudaMemcpyAsync(h_ptr, d_ptr, data_n*sizeof(DType), cudaMemcpyDeviceToHost, mshadow::Stream::GetStream(s))); CUDA_CALL(cudaStreamSynchronize(mshadow::Stream::GetStream(s))); - - bin = std::max(static_cast(bin+1), minlength); + for (size_t i = 0; i < data_n; i++) { + if (h_ptr[i] + 1 > bin) bin = h_ptr[i] + 1; + } + free(h_ptr); mxnet::TShape s(1, bin); const_cast(out).Init(s); // set the output shape forcefully });