
Commit f0d566f

retrigger

Tommliu committed Dec 6, 2019
1 parent a6df6e9 commit f0d566f

Showing 1 changed file with 16 additions and 22 deletions.

38 changes: 16 additions & 22 deletions  src/operator/numpy/np_bincount_op.cu
@@ -78,27 +78,24 @@ void NumpyBincountForwardImpl<gpu>(const OpContext &ctx,
   mshadow::Stream<gpu> *s = ctx.get_stream<gpu>();
 
   MXNET_NO_FLOAT16_TYPE_SWITCH(data.dtype(), DType, {
-    DType* d_bin;
-    DType bin;
+    DType* h_ptr;
     DType* d_ptr;
+    int bin = minlength;
     d_ptr = data.data().dptr<DType>();
     Tensor<gpu, 1, char> workspace = ctx.requested[0]
                                      .get_space_typed<gpu, 1, char>(Shape1(1), s);
     char* is_valid_ptr = reinterpret_cast<char*>(workspace.dptr_);
     bool is_valid = CheckInvalidInput(s, d_ptr, data_n, is_valid_ptr);
     CHECK(is_valid) << "Input should be nonnegative number";  // check invalid input
 
-    Tensor<gpu, 1, DType> workspace1 = ctx.requested[0]
-                                       .get_space_typed<gpu, 1, DType>(Shape1(1), s);
-    d_bin = reinterpret_cast<DType*>(workspace1.dptr_);
-    thrust::device_ptr<DType> dptr_s = thrust::device_pointer_cast(d_ptr);
-    thrust::device_ptr<DType> dptr_e = thrust::device_pointer_cast(d_ptr + data_n);
-    d_bin = thrust::raw_pointer_cast(thrust::max_element(dptr_s, dptr_e));
-    CUDA_CALL(cudaMemcpyAsync(&bin, d_bin, sizeof(DType), cudaMemcpyDeviceToHost,
+    h_ptr = reinterpret_cast<DType*>(malloc(data_n*sizeof(DType)));
+    CUDA_CALL(cudaMemcpyAsync(h_ptr, d_ptr, data_n*sizeof(DType), cudaMemcpyDeviceToHost,
                               mshadow::Stream<gpu>::GetStream(s)));
     CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
-
-    bin = std::max(static_cast<int>(bin+1), minlength);
+    for (size_t i = 0; i < data_n; i++) {
+      if (h_ptr[i] + 1 > bin) bin = h_ptr[i] + 1;
+    }
+    free(h_ptr);
     mxnet::TShape s(1, bin);
     const_cast<NDArray &>(out).Init(s);  // set the output shape forcefully
   });
@@ -124,27 +121,24 @@ void NumpyBincountForwardImpl<gpu>(const OpContext &ctx,
   mshadow::Stream<gpu> *s = ctx.get_stream<gpu>();
 
   MXNET_NO_FLOAT16_TYPE_SWITCH(data.dtype(), DType, {
-    DType* d_bin;
-    DType bin;
+    DType* h_ptr;
     DType* d_ptr;
+    int bin = minlength;
     d_ptr = data.data().dptr<DType>();
     Tensor<gpu, 1, char> workspace = ctx.requested[0]
                                      .get_space_typed<gpu, 1, char>(Shape1(1), s);
     char* is_valid_ptr = reinterpret_cast<char*>(workspace.dptr_);
     bool is_valid = CheckInvalidInput(s, d_ptr, data_n, is_valid_ptr);
     CHECK(is_valid) << "Input should be nonnegative number";  // check invalid input
 
-    Tensor<gpu, 1, DType> workspace1 = ctx.requested[0]
-                                       .get_space_typed<gpu, 1, DType>(Shape1(1), s);
-    d_bin = reinterpret_cast<DType*>(workspace1.dptr_);
-    thrust::device_ptr<DType> dptr_s = thrust::device_pointer_cast(d_ptr);
-    thrust::device_ptr<DType> dptr_e = thrust::device_pointer_cast(d_ptr + data_n);
-    d_bin = thrust::raw_pointer_cast(thrust::max_element(dptr_s, dptr_e));
-    CUDA_CALL(cudaMemcpyAsync(&bin, d_bin, sizeof(DType), cudaMemcpyDeviceToHost,
+    h_ptr = reinterpret_cast<DType*>(malloc(data_n*sizeof(DType)));
+    CUDA_CALL(cudaMemcpyAsync(h_ptr, d_ptr, data_n*sizeof(DType), cudaMemcpyDeviceToHost,
                               mshadow::Stream<gpu>::GetStream(s)));
     CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
-
-    bin = std::max(static_cast<int>(bin+1), minlength);
+    for (size_t i = 0; i < data_n; i++) {
+      if (h_ptr[i] + 1 > bin) bin = h_ptr[i] + 1;
+    }
+    free(h_ptr);
     mxnet::TShape s(1, bin);
     const_cast<NDArray &>(out).Init(s);  // set the output shape forcefully
   });
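In both overloads of NumpyBincountForwardImpl, the commit replaces a device-side thrust::max_element reduction with a host-side scan: the input is copied back with cudaMemcpyAsync, the stream is synchronized, and a loop computes max(data) + 1, clamped below by minlength, to size the output. Below is a minimal standalone sketch of that pattern, assuming plain CUDA with int data rather than MXNet's mshadow stream and DType switch; compute_num_bins is a hypothetical helper, not part of the patch.

// Sketch of the host-side sizing pattern this commit switches to, assuming
// plain CUDA and int data (compute_num_bins is a hypothetical helper).
#include <cuda_runtime.h>
#include <algorithm>
#include <cstdio>
#include <cstdlib>

// Copy the input to the host, then take bins = max(max(data) + 1, minlength),
// mirroring the loop added in this commit.
int compute_num_bins(const int* d_data, size_t n, int minlength, cudaStream_t stream) {
  int* h_data = static_cast<int*>(malloc(n * sizeof(int)));
  cudaMemcpyAsync(h_data, d_data, n * sizeof(int), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);  // the copy must complete before the host reads h_data
  int bins = minlength;
  for (size_t i = 0; i < n; ++i) {
    bins = std::max(bins, h_data[i] + 1);  // a value v occupies bin v, so v + 1 bins
  }
  free(h_data);
  return bins;
}

int main() {
  const int h_in[5] = {0, 1, 1, 3, 2};
  int* d_in;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  printf("%d\n", compute_num_bins(d_in, 5, 0, 0));  // prints 4: max value 3, bins 0..3
  cudaFree(d_in);
  return 0;
}

For comparison, a sketch of the device-side reduction the commit removes, using thrust::max_element as the deleted lines did (num_bins_thrust is likewise hypothetical; error checking omitted):

#include <thrust/device_ptr.h>
#include <thrust/extrema.h>
#include <algorithm>

int num_bins_thrust(const int* d_data, size_t n, int minlength) {
  thrust::device_ptr<const int> first = thrust::device_pointer_cast(d_data);
  int max_val = *thrust::max_element(first, first + n);  // dereference copies one element to host
  return std::max(max_val + 1, minlength);
}

The trade-off visible in the diff: the old path reduced on the device and moved a single element across the bus, while the new path copies the whole input to the host but drops the extra DType workspace and the thrust dependency for this step.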
