
Commit f0d566f

retrigger

Tommliu committed Dec 6, 2019
1 parent a6df6e9 commit f0d566f

Showing 1 changed file with 16 additions and 22 deletions.

38 changes: 16 additions & 22 deletions  src/operator/numpy/np_bincount_op.cu
@@ -78,27 +78,24 @@ void NumpyBincountForwardImpl<gpu>(const OpContext &ctx,
   mshadow::Stream<gpu> *s = ctx.get_stream<gpu>();
 
   MXNET_NO_FLOAT16_TYPE_SWITCH(data.dtype(), DType, {
-    DType* d_bin;
-    DType bin;
+    DType* h_ptr;
     DType* d_ptr;
+    int bin = minlength;
     d_ptr = data.data().dptr<DType>();
     Tensor<gpu, 1, char> workspace = ctx.requested[0]
                                      .get_space_typed<gpu, 1, char>(Shape1(1), s);
     char* is_valid_ptr = reinterpret_cast<char*>(workspace.dptr_);
     bool is_valid = CheckInvalidInput(s, d_ptr, data_n, is_valid_ptr);
     CHECK(is_valid) << "Input should be nonnegative number";  // check invalid input
 
-    Tensor<gpu, 1, DType> workspace1 = ctx.requested[0]
-                                       .get_space_typed<gpu, 1, DType>(Shape1(1), s);
-    d_bin = reinterpret_cast<DType*>(workspace1.dptr_);
-    thrust::device_ptr<DType> dptr_s = thrust::device_pointer_cast(d_ptr);
-    thrust::device_ptr<DType> dptr_e = thrust::device_pointer_cast(d_ptr + data_n);
-    d_bin = thrust::raw_pointer_cast(thrust::max_element(dptr_s, dptr_e));
-    CUDA_CALL(cudaMemcpyAsync(&bin, d_bin, sizeof(DType), cudaMemcpyDeviceToHost,
+    h_ptr = reinterpret_cast<DType*>(malloc(data_n*sizeof(DType)));
+    CUDA_CALL(cudaMemcpyAsync(h_ptr, d_ptr, data_n*sizeof(DType), cudaMemcpyDeviceToHost,
                               mshadow::Stream<gpu>::GetStream(s)));
     CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
-
-    bin = std::max(static_cast<int>(bin+1), minlength);
+    for (size_t i = 0; i < data_n; i++) {
+      if (h_ptr[i] + 1 > bin) bin = h_ptr[i] + 1;
+    }
+    free(h_ptr);
     mxnet::TShape s(1, bin);
     const_cast<NDArray &>(out).Init(s);  // set the output shape forcefully
   });
@@ -124,27 +121,24 @@ void NumpyBincountForwardImpl<gpu>(const OpContext &ctx,
   mshadow::Stream<gpu> *s = ctx.get_stream<gpu>();
 
   MXNET_NO_FLOAT16_TYPE_SWITCH(data.dtype(), DType, {
-    DType* d_bin;
-    DType bin;
+    DType* h_ptr;
     DType* d_ptr;
+    int bin = minlength;
     d_ptr = data.data().dptr<DType>();
     Tensor<gpu, 1, char> workspace = ctx.requested[0]
                                      .get_space_typed<gpu, 1, char>(Shape1(1), s);
     char* is_valid_ptr = reinterpret_cast<char*>(workspace.dptr_);
     bool is_valid = CheckInvalidInput(s, d_ptr, data_n, is_valid_ptr);
     CHECK(is_valid) << "Input should be nonnegative number";  // check invalid input
 
-    Tensor<gpu, 1, DType> workspace1 = ctx.requested[0]
-                                       .get_space_typed<gpu, 1, DType>(Shape1(1), s);
-    d_bin = reinterpret_cast<DType*>(workspace1.dptr_);
-    thrust::device_ptr<DType> dptr_s = thrust::device_pointer_cast(d_ptr);
-    thrust::device_ptr<DType> dptr_e = thrust::device_pointer_cast(d_ptr + data_n);
-    d_bin = thrust::raw_pointer_cast(thrust::max_element(dptr_s, dptr_e));
-    CUDA_CALL(cudaMemcpyAsync(&bin, d_bin, sizeof(DType), cudaMemcpyDeviceToHost,
+    h_ptr = reinterpret_cast<DType*>(malloc(data_n*sizeof(DType)));
+    CUDA_CALL(cudaMemcpyAsync(h_ptr, d_ptr, data_n*sizeof(DType), cudaMemcpyDeviceToHost,
                               mshadow::Stream<gpu>::GetStream(s)));
     CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
-
-    bin = std::max(static_cast<int>(bin+1), minlength);
+    for (size_t i = 0; i < data_n; i++) {
+      if (h_ptr[i] + 1 > bin) bin = h_ptr[i] + 1;
+    }
+    free(h_ptr);
     mxnet::TShape s(1, bin);
     const_cast<NDArray &>(out).Init(s);  // set the output shape forcefully
   });
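In both overloads of NumpyBincountForwardImpl, the commit replaces a device-side thrust::max_element reduction with a host-side scan: the input is copied back with cudaMemcpyAsync, the stream is synchronized, and a loop computes max(data) + 1, clamped below by minlength, to size the output. Below is a minimal standalone sketch of that pattern, assuming plain CUDA with int data rather than MXNet's mshadow stream and DType switch; compute_num_bins is a hypothetical helper, not part of the patch.

// Sketch of the host-side sizing pattern this commit switches to, assuming
// plain CUDA and int data (compute_num_bins is a hypothetical helper).
#include <cuda_runtime.h>
#include <algorithm>
#include <cstdio>
#include <cstdlib>

// Copy the input to the host, then take bins = max(max(data) + 1, minlength),
// mirroring the loop added in this commit.
int compute_num_bins(const int* d_data, size_t n, int minlength, cudaStream_t stream) {
  int* h_data = static_cast<int*>(malloc(n * sizeof(int)));
  cudaMemcpyAsync(h_data, d_data, n * sizeof(int), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);  // the copy must complete before the host reads h_data
  int bins = minlength;
  for (size_t i = 0; i < n; ++i) {
    bins = std::max(bins, h_data[i] + 1);  // a value v occupies bin v, so v + 1 bins
  }
  free(h_data);
  return bins;
}

int main() {
  const int h_in[5] = {0, 1, 1, 3, 2};
  int* d_in;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  printf("%d\n", compute_num_bins(d_in, 5, 0, 0));  // prints 4: max value 3, bins 0..3
  cudaFree(d_in);
  return 0;
}

For comparison, a sketch of the device-side reduction the commit removes, using thrust::max_element as the deleted lines did (num_bins_thrust is likewise hypothetical; error checking omitted):

#include <thrust/device_ptr.h>
#include <thrust/extrema.h>
#include <algorithm>

int num_bins_thrust(const int* d_data, size_t n, int minlength) {
  thrust::device_ptr<const int> first = thrust::device_pointer_cast(d_data);
  int max_val = *thrust::max_element(first, first + n);  // dereference copies one element to host
  return std::max(max_val + 1, minlength);
}

The trade-off visible in the diff: the old path reduced on the device and moved a single element across the bus, while the new path copies the whole input to the host but drops the extra DType workspace and the thrust dependency for this step.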
