Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds ceil_div #1825

Merged
merged 12 commits into from
Jun 12, 2024
1 change: 1 addition & 0 deletions docs/libcudacxx/extended_api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ Extended API
extended_api/functional
extended_api/streams
extended_api/memory_resource
extended_api/math
52 changes: 52 additions & 0 deletions docs/libcudacxx/extended_api/math.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
.. _libcudacxx-extended-api-math:

Math
=====

.. code:: cuda

template <typename T>
[[nodiscard]] __host__ __device__ constexpr T ceil_div(T a, T b) noexcept;

ceil_div
---------

- _Requires_: `is_integral_v<T>` is true.
- _Preconditions_: `a >= 0` is true and `b > 0` is true.
- _Returns_: divides `a` by `b`. If `a` is not a multiple of `b` rounds the result up to the next integer value.

.. note::

The function is only constexpr from C++14 onwards

**Example**: This API is very useful for determining the _number of thread blocks_ required to process a fixed amount of work, given a fixed number of threads per block:

.. code:: cuda

#include <vector>
#include <cuda/cmath>

__global__ void vscale(int n, float s, float *x) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) x[i] *= s;
}

int main() {
const int n = 100000;
const float s = 2.f;
std::vector<float> x(n, 1.f);

// Given a fixed number of threads per block...
constexpr int threads_per_block = 256;

// ...dividing some "n" by "threads_per_block" may lead to a remainder,
// requiring the kernel to be launched with an extra thread block to handle it.
const int thread_blocks = cuda::ceil_div(n, threads_per_block);

vscale<<<thread_blocks, threads_per_block>>>(n, s, x.data());
cudaDeviceSynchronize();

return 0;
}

`See it on Godbolt TODO`
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remark: unadressed TODO. I guess we want to fix this when ceil_div becomes available on godbolt with the next CCCL release? But we should maybe create an issue.

42 changes: 42 additions & 0 deletions libcudacxx/include/cuda/cmath
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_CMATH
#define _CUDA_CMATH

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/__type_traits/enable_if.h>
#include <cuda/std/__type_traits/is_integral.h>
#include <cuda/std/cmath>
#include <cuda/std/detail/libcxx/include/__debug>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

template <class _Tp, _CUDA_VSTD::__enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp), int> = 0>
_CCCL_NODISCARD _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Tp __b) noexcept
{
_LIBCUDACXX_DEBUG_ASSERT(__a >= _Tp(0), "cuda::ceil_div: a must be non negative");
_LIBCUDACXX_DEBUG_ASSERT(__b > _Tp(0), "cuda::ceil_div: b must be positive");
const _Tp __res = static_cast<_Tp>(__a / __b);
return static_cast<_Tp>(__res + (__res * __b != __a));
}

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif // _CUDA_CMATH
88 changes: 88 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/cmath.pass.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
//===----------------------------------------------------------------------===//
//
// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <cuda/cmath>
#include <cuda/std/cassert>
#include <cuda/std/cstddef>
#include <cuda/std/limits>
#include <cuda/std/utility>

#include "test_macros.h"

#if !defined(TEST_COMPILER_NVRTC)
# include <cstdint>
#endif // !TEST_COMPILER_NVRTC

template <class T>
__host__ __device__ TEST_CONSTEXPR_CXX14 void test()
{
constexpr T maxv = cuda::std::numeric_limits<T>::max();

assert(cuda::ceil_div(T(0), T(1)) == T(0));
assert(cuda::ceil_div(T(1), T(1)) == T(1));
assert(cuda::ceil_div(T(126), T(64)) == T(2));

// ensure that we are resilient against overflow
assert(cuda::ceil_div(maxv, T(1)) == maxv);
assert(cuda::ceil_div(maxv, maxv) == T(1));
}

__host__ __device__ TEST_CONSTEXPR_CXX14 bool test()
{
// Builtin integer types:
test<char>();
test<signed char>();
test<unsigned char>();

test<short>();
test<unsigned short>();

test<int>();
test<unsigned int>();

test<long>();
test<unsigned long>();

test<long long>();
test<unsigned long long>();

#if !defined(TEST_COMPILER_NVRTC)
// cstdint types:
test<std::size_t>();
test<std::ptrdiff_t>();
test<std::intptr_t>();
test<std::uintptr_t>();

test<std::int8_t>();
test<std::int16_t>();
test<std::int32_t>();
test<std::int64_t>();

test<std::uint8_t>();
test<std::uint16_t>();
test<std::uint32_t>();
test<std::uint64_t>();
#endif // !TEST_COMPILER_NVRTC

#if !defined(TEST_HAS_NO_INT128_T)
test<__int128_t>();
test<__uint128_t>();
#endif // !TEST_HAS_NO_INT128_T

return true;
}

int main(int arg, char** argv)
{
test();
#if TEST_STD_VER >= 2014
static_assert(test(), "");
#endif // TEST_STD_VER >= 2014
return 0;
}
Loading