From f0d566f78a21528b9a660549fb35cf09be2085e3 Mon Sep 17 00:00:00 2001
From: Minghao Liu <tommliu96@gmail.com>
Date: Wed, 4 Dec 2019 08:37:25 +0000
Subject: [PATCH] retrigger: find bincount output size with a host-side scan instead of thrust::max_element

---
 src/operator/numpy/np_bincount_op.cu | 38 ++++++++++++----------------
 1 file changed, 16 insertions(+), 22 deletions(-)
diff --git a/src/operator/numpy/np_bincount_op.cu b/src/operator/numpy/np_bincount_op.cu
index 14f9c09664a3..e37e9af435b5 100644
--- a/src/operator/numpy/np_bincount_op.cu
+++ b/src/operator/numpy/np_bincount_op.cu
@@ -78,9 +78,9 @@ void NumpyBincountForwardImpl<gpu>(const OpContext &ctx,
   mshadow::Stream<gpu> *s = ctx.get_stream<gpu>();
 
   MXNET_NO_FLOAT16_TYPE_SWITCH(data.dtype(), DType, {
-    DType* d_bin;
-    DType bin;
+    DType* h_ptr;
     DType* d_ptr;
+    int bin = minlength;
     d_ptr = data.data().dptr<DType>();
     Tensor<gpu, 1, char> workspace = ctx.requested[0]
             .get_space_typed<gpu, 1, char>(Shape1(1), s);
@@ -88,17 +88,14 @@ void NumpyBincountForwardImpl<gpu>(const OpContext &ctx,
     bool is_valid = CheckInvalidInput(s, d_ptr, data_n, is_valid_ptr);
     CHECK(is_valid) << "Input should be nonnegative number";   // check invalid input
 
-    Tensor<gpu, 1, DType> workspace1 = ctx.requested[0]
-            .get_space_typed<gpu, 1, DType>(Shape1(1), s);
-    d_bin = reinterpret_cast<DType*>(workspace1.dptr_);
-    thrust::device_ptr<DType> dptr_s = thrust::device_pointer_cast(d_ptr);
-    thrust::device_ptr<DType> dptr_e = thrust::device_pointer_cast(d_ptr + data_n);
-    d_bin = thrust::raw_pointer_cast(thrust::max_element(dptr_s, dptr_e));
-    CUDA_CALL(cudaMemcpyAsync(&bin, d_bin, sizeof(DType), cudaMemcpyDeviceToHost,
+    h_ptr = reinterpret_cast<DType*>(malloc(data_n*sizeof(DType)));
+    CUDA_CALL(cudaMemcpyAsync(h_ptr, d_ptr, data_n*sizeof(DType), cudaMemcpyDeviceToHost,
                               mshadow::Stream<gpu>::GetStream(s)));
     CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
-
-    bin = std::max(static_cast<int>(bin+1), minlength);
+    for (size_t i = 0; i < data_n; i++) {
+      if (h_ptr[i] + 1 > bin) bin = h_ptr[i] + 1;
+    }
+    free(h_ptr);
     mxnet::TShape s(1, bin);
     const_cast<NDArray &>(out).Init(s);  // set the output shape forcefully
   });
@@ -124,9 +121,9 @@ void NumpyBincountForwardImpl<gpu>(const OpContext &ctx,
   mshadow::Stream<gpu> *s = ctx.get_stream<gpu>();
 
   MXNET_NO_FLOAT16_TYPE_SWITCH(data.dtype(), DType, {
-    DType* d_bin;
-    DType bin;
+    DType* h_ptr;
     DType* d_ptr;
+    int bin = minlength;
     d_ptr = data.data().dptr<DType>();
     Tensor<gpu, 1, char> workspace = ctx.requested[0]
             .get_space_typed<gpu, 1, char>(Shape1(1), s);
@@ -134,17 +131,14 @@ void NumpyBincountForwardImpl<gpu>(const OpContext &ctx,
     bool is_valid = CheckInvalidInput(s, d_ptr, data_n, is_valid_ptr);
     CHECK(is_valid) << "Input should be nonnegative number";   // check invalid input
 
-    Tensor<gpu, 1, DType> workspace1 = ctx.requested[0]
-            .get_space_typed<gpu, 1, DType>(Shape1(1), s);
-    d_bin = reinterpret_cast<DType*>(workspace1.dptr_);
-    thrust::device_ptr<DType> dptr_s = thrust::device_pointer_cast(d_ptr);
-    thrust::device_ptr<DType> dptr_e = thrust::device_pointer_cast(d_ptr + data_n);
-    d_bin = thrust::raw_pointer_cast(thrust::max_element(dptr_s, dptr_e));
-    CUDA_CALL(cudaMemcpyAsync(&bin, d_bin, sizeof(DType), cudaMemcpyDeviceToHost,
+    h_ptr = reinterpret_cast<DType*>(malloc(data_n*sizeof(DType)));
+    CUDA_CALL(cudaMemcpyAsync(h_ptr, d_ptr, data_n*sizeof(DType), cudaMemcpyDeviceToHost,
                               mshadow::Stream<gpu>::GetStream(s)));
     CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
-
-    bin = std::max(static_cast<int>(bin+1), minlength);
+    for (size_t i = 0; i < data_n; i++) {
+      if (h_ptr[i] + 1 > bin) bin = h_ptr[i] + 1;
+    }
+    free(h_ptr);
     mxnet::TShape s(1, bin);
     const_cast<NDArray &>(out).Init(s);  // set the output shape forcefully
   });