NVIDIA · miscco · Jun 12, 2024 · Jun 7, 2024 · Jun 10, 2024 · Jun 10, 2024
@@ -15,3 +15,4 @@ Extended API
    extended_api/functional
    extended_api/streams
    extended_api/memory_resource
+   extended_api/math
@@ -0,0 +1,52 @@
+.. _libcudacxx-extended-api-math:
+
+Math
+=====
+
+.. code:: cuda
+
+   template <typename T>
+   [[nodiscard]] __host__ __device__ constexpr T ceil_div(T a, T b) noexcept;
+
+ceil_div
+---------
+
+- _Requires_: `is_integral_v<T>` is true.
+- _Preconditions_: `a >= 0` is true and `b > 0` is true.
+- _Returns_: divides `a` by `b`. If `a` is not a multiple of `b` rounds the result up to the next integer value.
+
+.. note::
+
+   The function is only constexpr from C++14 onwards
+
+**Example**: This API is very useful for determining the _number of thread blocks_ required to process a fixed amount of work, given a fixed number of threads per block:
+
+.. code:: cuda
+
+   #include <vector>
+   #include <cuda/cmath>
+
+   __global__ void vscale(int n, float s, float *x) {
+     int i = blockIdx.x * blockDim.x + threadIdx.x;
+     if (i < n) x[i] *= s;
+   }
+
+   int main() {
+     const int n = 100000;
+     const float s = 2.f;
+     std::vector<float> x(n, 1.f);
+
+     // Given a fixed number of threads per block...
+     constexpr int threads_per_block = 256;
+
+     // ...dividing some "n" by "threads_per_block" may lead to a remainder,
+     // requiring the kernel to be launched with an extra thread block to handle it.
+     const int thread_blocks = cuda::ceil_div(n, threads_per_block);
+
+     vscale<<<thread_blocks, threads_per_block>>>(n, s, x.data());
+     cudaDeviceSynchronize();
+
+     return 0;
+   }
+
+`See it on Godbolt TODO`
@@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_CMATH
+#define _CUDA_CMATH
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__type_traits/enable_if.h>
+#include <cuda/std/__type_traits/is_integral.h>
+#include <cuda/std/cmath>
+#include <cuda/std/detail/libcxx/include/__debug>
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+template <class _Tp, _CUDA_VSTD::__enable_if_t<_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp), int> = 0>
+_CCCL_NODISCARD _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 _Tp ceil_div(const _Tp __a, const _Tp __b) noexcept
+{
+  _LIBCUDACXX_DEBUG_ASSERT(__a >= _Tp(0), "cuda::ceil_div: a must be non negative");
+  _LIBCUDACXX_DEBUG_ASSERT(__b > _Tp(0), "cuda::ceil_div: b must be positive");
+  const _Tp __res = static_cast<_Tp>(__a / __b);
+  return static_cast<_Tp>(__res + (__res * __b != __a));
+}
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
+
+#endif // _CUDA_CMATH
@@ -0,0 +1,88 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/cmath>
+#include <cuda/std/cassert>
+#include <cuda/std/cstddef>
+#include <cuda/std/limits>
+#include <cuda/std/utility>
+
+#include "test_macros.h"
+
+#if !defined(TEST_COMPILER_NVRTC)
+#  include <cstdint>
+#endif // !TEST_COMPILER_NVRTC
+
+template <class T>
+__host__ __device__ TEST_CONSTEXPR_CXX14 void test()
+{
+  constexpr T maxv = cuda::std::numeric_limits<T>::max();
+
+  assert(cuda::ceil_div(T(0), T(1)) == T(0));
+  assert(cuda::ceil_div(T(1), T(1)) == T(1));
+  assert(cuda::ceil_div(T(126), T(64)) == T(2));
+
+  // ensure that we are resilient against overflow
+  assert(cuda::ceil_div(maxv, T(1)) == maxv);
+  assert(cuda::ceil_div(maxv, maxv) == T(1));
+}
+
+__host__ __device__ TEST_CONSTEXPR_CXX14 bool test()
+{
+  // Builtin integer types:
+  test<char>();
+  test<signed char>();
+  test<unsigned char>();
+
+  test<short>();
+  test<unsigned short>();
+
+  test<int>();
+  test<unsigned int>();
+
+  test<long>();
+  test<unsigned long>();
+
+  test<long long>();
+  test<unsigned long long>();
+
+#if !defined(TEST_COMPILER_NVRTC)
+  // cstdint types:
+  test<std::size_t>();
+  test<std::ptrdiff_t>();
+  test<std::intptr_t>();
+  test<std::uintptr_t>();
+
+  test<std::int8_t>();
+  test<std::int16_t>();
+  test<std::int32_t>();
+  test<std::int64_t>();
+
+  test<std::uint8_t>();
+  test<std::uint16_t>();
+  test<std::uint32_t>();
+  test<std::uint64_t>();
+#endif // !TEST_COMPILER_NVRTC
+
+#if !defined(TEST_HAS_NO_INT128_T)
+  test<__int128_t>();
+  test<__uint128_t>();
+#endif // !TEST_HAS_NO_INT128_T
+
+  return true;
+}
+
+int main(int arg, char** argv)
+{
+  test();
+#if TEST_STD_VER >= 2014
+  static_assert(test(), "");
+#endif // TEST_STD_VER >= 2014
+  return 0;
+}