Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

Permalink
address code reviews
Browse files Browse the repository at this point in the history
  • Loading branch information
Hao Jin committed Jul 22, 2018
1 parent d0482f9 commit 437da0e
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 21 deletions.
12 changes: 12 additions & 0 deletions src/common/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,18 @@ constexpr size_t MaxIntegerValue<mshadow::half::half_t>() {
return size_t(2) << 10;
}

/*!
 * \brief Returns the number of bits needed to represent \p a,
 *        i.e. floor(log2(a)) + 1 for a >= 1, and 1 when a == 0.
 *        Call sites pass (n - 1) to obtain the bit count for radix
 *        sorts (e.g. SortByKey) over keys in [0, n).
 */
MSHADOW_XINLINE int ilog2ul(size_t a) {
  int bits = 1;
  for (a >>= 1; a != 0; a >>= 1) {
    ++bits;
  }
  return bits;
}

/*!
 * \brief Returns the number of bits needed to represent \p a,
 *        i.e. floor(log2(a)) + 1 for a >= 1, and 1 when a == 0.
 *        Unsigned-int overload of ilog2ul; call sites pass (n - 1)
 *        to obtain the bit count for radix sorts over keys in [0, n).
 */
MSHADOW_XINLINE int ilog2ui(unsigned int a) {
  int bits = 1;
  for (a >>= 1; a != 0; a >>= 1) {
    ++bits;
  }
  return bits;
}

} // namespace common
} // namespace mxnet
#endif // MXNET_COMMON_UTILS_H_
14 changes: 4 additions & 10 deletions src/operator/tensor/dot-inl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,11 @@
#include "./sort_op.h"
#include "./util/tensor_util-inl.h"
#include "./util/tensor_util-inl.cuh"
#include "../../common/utils.h"

namespace mxnet {
namespace op {

// Returns integer log2(a) rounded up
inline int log2i(size_t a) {
int k = 1;
while (a >>= 1) k++;
return k;
}

/*!
* \brief GPU scalar kernel of dot(csr, dns1) = dns2
* Parallelization by output matrix elements: 1 thread/element
Expand Down Expand Up @@ -496,7 +490,7 @@ inline void DotCsrDnsDnsImpl(const OpContext& ctx,
Tensor<gpu, 1, IType> csc_cols(csc_cols_ptr, Shape1(nnz), s);
Tensor<gpu, 1, char> temp_storage(temp_storage_ptr, Shape1(temp_storage_bytes), s);

int num_bits = log2i(num_csr_cols - 1);
int num_bits = common::ilog2ul(num_csr_cols - 1);
SortByKey(csc_cols, original_idx, true, &temp_storage, 0, num_bits);

// Scatter csr indptr to row id
Expand Down Expand Up @@ -699,7 +693,7 @@ inline void DotCsrDnsRspImpl(const OpContext& ctx,
Tensor<gpu, 1, IType> original_idx(original_idx_ptr, Shape1(nnz), s);
Tensor<gpu, 1, char> temp_storage(temp_storage_ptr, Shape1(total_temp_bytes), s);

int num_bits = log2i(num_cols_l - 1);
int num_bits = common::ilog2ul(num_cols_l - 1);
SortByKey(col_idx_copy, original_idx, true, &temp_storage, 0, num_bits);

// over-allocate aux indices
Expand Down Expand Up @@ -1046,7 +1040,7 @@ inline void DotDnsCsrDnsImpl(const OpContext& ctx, const gpu& gpu_dev,
Tensor<gpu, 1, IType> csc_cols(csc_cols_ptr, Shape1(nnz), s);
Tensor<gpu, 1, char> temp_storage(temp_storage_ptr, Shape1(temp_storage_bytes), s);

int num_bits = log2i(num_csr_cols - 1);
int num_bits = common::ilog2ul(num_csr_cols - 1);
SortByKey(csc_cols, original_idx, true, &temp_storage, 0, num_bits);

// Scatter csr indptr to row id
Expand Down
2 changes: 1 addition & 1 deletion src/operator/tensor/indexing_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ void SparseEmbeddingDeterministicKernelLaunch(const OpContext& ctx,
Kernel<range_fwd, gpu>::Launch(s, data_size, 1, static_cast<dim_t>(0),
static_cast<dim_t>(1), kWriteTo, original_idx);
// sort data with its original idx
int num_bits = ilog2(num_rows - 1);
int num_bits = common::ilog2ui(num_rows - 1);
char* temp_storage_ptr = reinterpret_cast<char*>(temp_storage);
Tensor<gpu, 1, char> temp_storage_tensor(temp_storage_ptr,
Shape1(sort_workspace_size), s);
Expand Down
14 changes: 4 additions & 10 deletions src/operator/tensor/indexing_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#include "./sort_op.h"
#include "./init_op.h"
#include "../../engine/openmp.h"
#include "../../common/utils.h"
#ifdef __CUDACC__
#include "./indexing_op-inl.cuh"
#endif
Expand Down Expand Up @@ -528,13 +529,6 @@ void SparseEmbeddingOpForwardEx(const nnvm::NodeAttrs& attrs,
}
}

// Returns integer log2(a) rounded up
inline int ilog2(unsigned int a) {
int k = 1;
while (a >>= 1) k++;
return k;
}

/*! \brief cast to type and clip to range [0, K - 1]
*/
struct tcast_clip {
Expand Down Expand Up @@ -583,7 +577,7 @@ void AddTakeGradLargeBatchCaller(const OpContext& ctx, mshadow::Tensor<xpu, 2, D
static_cast<int>(dst.shape_[0]));
Kernel<range_fwd, xpu>::Launch(s, index.shape_.Size(),
1, 0, 1, kWriteTo, original_index.dptr_);
int num_bits = ilog2((dst.shape_[0] - 1));
int num_bits = common::ilog2ui((dst.shape_[0] - 1));
mxnet::op::SortByKey(sorted_data, original_index, true, &temp_storage, 0, num_bits);
mxnet::op::AddTakeGradLargeBatch(dst, sorted_data, original_index, src, &temp_storage);
}
Expand Down Expand Up @@ -964,7 +958,7 @@ void TakeOpBackwardImpl(mshadow::Stream<cpu>* s,
s, idxshape.Size(), sorted_idx_ptr, sorted_idx_ptr, static_cast<int>(arrshape[axis]));
}
Tensor<cpu, 1, int> original_idx(original_idx_ptr, Shape1(idxshape.Size()), s);
int num_bits = ilog2(static_cast<unsigned int>(idxshape.Size()) - 1);
int num_bits = common::ilog2ui(static_cast<unsigned int>(idxshape.Size()) - 1);
Tensor<cpu, 1, int> sorted_idx(sorted_idx_ptr, Shape1(idxshape.Size()), s);
SortByKey(sorted_idx, original_idx, true, &temp_storage, 0, num_bits);
for (size_t i = 0; i < idxshape.Size(); ++i) {
Expand Down Expand Up @@ -1058,7 +1052,7 @@ void TakeOpBackwardImpl(mshadow::Stream<gpu>* s,
}
Tensor<gpu, 1, int> original_idx(original_idx_ptr, Shape1(idxshape.Size()), s);
Tensor<gpu, 1, char> temp_storage(temp_storage_ptr, Shape1(temp_storage_bytes), s);
int num_bits = ilog2(static_cast<unsigned int>(idxshape.Size()) - 1);
int num_bits = common::ilog2ui(static_cast<unsigned int>(idxshape.Size()) - 1);
Tensor<gpu, 1, int> sorted_idx(sorted_idx_ptr, Shape1(idxshape.Size()), s);
SortByKey(sorted_idx, original_idx, true, &temp_storage, 0, num_bits);
cub::DeviceScan::ExclusiveSum(temp_storage_ptr,
Expand Down

0 comments on commit 437da0e

Please sign in to comment.