From f2a377757631e3c02ce507abb1ad3e44e0d566e5 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Jul 2025 13:54:51 +0200 Subject: [PATCH 001/221] Allow CUfunction (driver API) in the cuda_kernel(_chain) API --- .../__stf/internal/cuda_kernel_scope.cuh | 130 ++++++++++++++---- 1 file changed, 100 insertions(+), 30 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index 79beb07dc03..3e5018062cb 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -47,7 +47,7 @@ struct cuda_kernel_desc { template cuda_kernel_desc(Fun func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, Args... args) - : func((const void*) func) + : func_variant(store_func(mv(func))) , gridDim(gridDim_) , blockDim(blockDim_) , sharedMem(sharedMem_) @@ -57,16 +57,19 @@ struct cuda_kernel_desc // We first copy all arguments into a tuple because the kernel // implementation needs pointers to the argument, so we cannot use // directly those passed in the pack of arguments - auto arg_tuple = ::std::make_shared(std::forward(args)...); + auto arg_tuple = ::std::make_shared(mv(args)...); - // Ensure we are packing arguments of the proper types to call func - static_assert(::std::is_invocable_v); + // Ensure we are packing arguments of the proper types to call func (only + // valid with the runtime API) + if constexpr (!::std::is_same_v) { + static_assert(::std::is_invocable_v); + } // Get the address of every tuple entry ::std::apply( [this](auto&... elems) { // Push back the addresses of each tuple element into the args vector - ((args_ptr.push_back(static_cast(&elems))), ...); + ((args_ptr.push_back(&elems)), ...); }, *arg_tuple); @@ -74,17 +77,99 @@ struct cuda_kernel_desc arg_tuple_type_erased = mv(arg_tuple); } - /* __global__ function */ - const void* func; + /* CUfunction (CUDA driver API) or __global__ function (CUDA runtime API) */ + using func_variant_t = ::std::variant; + func_variant_t func_variant; dim3 gridDim; dim3 blockDim; - size_t sharedMem; + size_t sharedMem = 0; // Vector of pointers to the arg_tuple which saves arguments in a typed-erased way - ::std::vector args_ptr; + // Mutable so that launch can be const + mutable ::std::vector args_ptr; + + // Helper to launch the kernel using CUDA stream based API + void launch(cudaStream_t stream) const + { + ::std::visit( + [&](auto&& kernel_func) { + using T = ::std::decay_t; + if constexpr (::std::is_same_v) + { + cuda_safe_call(cudaLaunchKernel(kernel_func, gridDim, blockDim, args_ptr.data(), sharedMem, stream)); + } + else + { + static_assert(::std::is_same_v, "Unsupported function type in func_variant"); + cuda_safe_call(cuLaunchKernel( + kernel_func, + gridDim.x, + gridDim.y, + gridDim.z, + blockDim.x, + blockDim.y, + blockDim.z, + sharedMem, + stream, + args_ptr.data(), + nullptr)); + } + }, + func_variant); + } + + void launch_in_graph(cudaGraphNode_t& node, cudaGraph_t& graph) const + { + ::std::visit( + [&](auto&& kernel_func) { + using T = ::std::decay_t; + + if constexpr (::std::is_same_v) + { + CUDA_KERNEL_NODE_PARAMS params{ + .func = kernel_func, + .gridDimX = gridDim.x, + .gridDimY = gridDim.y, + .gridDimZ = gridDim.z, + .blockDimX = blockDim.x, + .blockDimY = blockDim.y, + .blockDimZ = blockDim.z, + .sharedMemBytes = static_cast(sharedMem), + .kernelParams = const_cast(args_ptr.data()), + .extra = nullptr, + .kern = 
nullptr, + .ctx = nullptr}; + cuda_safe_call(cuGraphAddKernelNode(&node, graph, nullptr, 0, ¶ms)); + } + else + { + static_assert(::std::is_same_v, "Unsupported kernel function type"); + cudaKernelNodeParams params{ + .func = const_cast(kernel_func), + .gridDim = gridDim, + .blockDim = blockDim, + .sharedMemBytes = static_cast(sharedMem), + .kernelParams = args_ptr.data(), + .extra = nullptr}; + cuda_safe_call(cudaGraphAddKernelNode(&node, graph, nullptr, 0, ¶ms)); + } + }, + func_variant); + } private: ::std::shared_ptr arg_tuple_type_erased; + + static func_variant_t store_func(CUfunction f) + { + return f; + } + + template + static func_variant_t store_func(T* f) + { + return reinterpret_cast(f); + } }; namespace reserved @@ -252,7 +337,7 @@ public: // graph, or we rely on a child graph if (res.size() == 1) { - insert_one_kernel(res[0], t.get_node(), g); + res[0].launch_in_graph(t.get_node(), g); } else { @@ -262,7 +347,7 @@ public: // Create a chain of kernels for (size_t i = 0; i < res.size(); i++) { - insert_one_kernel(res[i], chain[i], g); + res[i].launch_in_graph(chain[i], g); if (i > 0) { cuda_safe_call(cudaGraphAddDependencies(g, &chain[i - 1], &chain[i], 1)); @@ -275,8 +360,7 @@ public: // Rely on stream semantic to have a dependency between the kernels for (auto& k : res) { - cuda_safe_call( - cudaLaunchKernel(k.func, k.gridDim, k.blockDim, k.args_ptr.data(), k.sharedMem, t.get_stream())); + k.launch(t.get_stream()); } } } @@ -287,35 +371,21 @@ public: // descriptor, not a vector static_assert(!chained); - cuda_kernel_desc res = ::std::apply(f, deps.instance(t)); + cuda_kernel_desc res = ::cuda::std::apply(f, deps.instance(t)); if constexpr (::std::is_same_v) { auto lock = t.lock_ctx_graph(); - insert_one_kernel(res, t.get_node(), t.get_ctx_graph()); + res.launch_in_graph(t.get_node(), t.get_ctx_graph()); } else { - cuda_safe_call( - cudaLaunchKernel(res.func, res.gridDim, res.blockDim, res.args_ptr.data(), res.sharedMem, t.get_stream())); + res.launch(t.get_stream()); } } } private: - /* Add a kernel to a CUDA graph given its description */ - auto insert_one_kernel(cuda_kernel_desc& k, cudaGraphNode_t& n, cudaGraph_t& g) const - { - cudaKernelNodeParams kconfig; - kconfig.blockDim = k.blockDim; - kconfig.extra = nullptr; - kconfig.func = const_cast(k.func); - kconfig.gridDim = k.gridDim; - kconfig.kernelParams = k.args_ptr.data(); - kconfig.sharedMemBytes = k.sharedMem; - cuda_safe_call(cudaGraphAddKernelNode(&n, g, nullptr, 0, &kconfig)); - } - ::std::string symbol; Ctx& ctx; // Statically defined deps From b3304a1865338f5ceccb0c78b03b078853bfe474 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Jul 2025 13:57:23 +0200 Subject: [PATCH 002/221] clang-format --- .../cuda/experimental/__stf/internal/cuda_kernel_scope.cuh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index 3e5018062cb..9abfeb47e31 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -61,8 +61,9 @@ struct cuda_kernel_desc // Ensure we are packing arguments of the proper types to call func (only // valid with the runtime API) - if constexpr (!::std::is_same_v) { - static_assert(::std::is_invocable_v); + if constexpr (!::std::is_same_v) + { + static_assert(::std::is_invocable_v); } // Get the address of every tuple entry From 
8651e9fa530b4e3a7e8afccde2a4127a7385a73e Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Jul 2025 14:15:42 +0200 Subject: [PATCH 003/221] We have a std::tuple not a cuda::std::tuple (yet) --- .../cuda/experimental/__stf/internal/cuda_kernel_scope.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index 9abfeb47e31..7bbe6e12228 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -372,7 +372,7 @@ public: // descriptor, not a vector static_assert(!chained); - cuda_kernel_desc res = ::cuda::std::apply(f, deps.instance(t)); + cuda_kernel_desc res = ::std::apply(f, deps.instance(t)); if constexpr (::std::is_same_v) { From fd42c70cdfd3ec6019c959dfe7747130a0bf36ad Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Jul 2025 16:07:58 +0200 Subject: [PATCH 004/221] If CUDASTF_CUDA_KERNEL_DEBUG is set, we display the number of registers used by kernels --- .../__stf/internal/cuda_kernel_scope.cuh | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index 7bbe6e12228..9a8b26804e4 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -158,6 +158,27 @@ struct cuda_kernel_desc func_variant); } + // Utility to query the number of registers used by this kernel + int get_num_registers() const + { + return ::std::visit( + [](auto&& kernel_func) { + using T = ::std::decay_t; + if constexpr (::std::is_same_v) + { + return cuda_try(CU_FUNC_ATTRIBUTE_NUM_REGS, kernel_func); + } + else + { + static_assert(::std::is_same_v, "Unsupported kernel function type"); + cudaFuncAttributes func_attr{}; + cuda_safe_call(cudaFuncGetAttributes(&func_attr, kernel_func)); + return func_attr.numRegs; + } + }, + func_variant); + } + private: ::std::shared_ptr arg_tuple_type_erased; @@ -323,12 +344,28 @@ public: dot.template add_vertex(t); } + // If CUDASTF_CUDA_KERNEL_DEBUG is set, we display the number of registers + // used by the kernel(s) + static bool display_register_cnt = [] { + const char* env = ::std::getenv("CUDASTF_CUDA_KERNEL_DEBUG"); + return env && (atoi(env) != 0); + }(); + // When chained is enable, we expect a vector of kernel description which should be executed one after the other if constexpr (chained) { ::std::vector res = ::std::apply(f, deps.instance(t)); assert(!res.empty()); + if (display_register_cnt) + { + fprintf(stderr, "cuda_kernel_chain (%s):\n", symbol.c_str()); + for (size_t i = 0; i < res.size(); i++) + { + fprintf(stderr, "- kernel %ld uses %d register(s)\n", i, res[i].get_num_registers()); + } + } + if constexpr (::std::is_same_v) { auto lock = t.lock_ctx_graph(); @@ -373,6 +410,10 @@ public: static_assert(!chained); cuda_kernel_desc res = ::std::apply(f, deps.instance(t)); + if (display_register_cnt) + { + fprintf(stderr, "cuda_kernel (%s): uses %d register(s)\n", symbol.c_str(), res.get_num_registers()); + } if constexpr (::std::is_same_v) { From f36fcd07fa726c5150ab62528cbe145a3627c824 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 18 Jul 2025 16:27:53 +0200 Subject: [PATCH 005/221] Support CUkernel in addition to CUfunction --- 
.../__stf/internal/cuda_kernel_scope.cuh | 54 +++++++++++++++---- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index 9a8b26804e4..1b3dfcd322a 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -37,6 +37,28 @@ namespace cuda::experimental::stf class graph_ctx; class stream_ctx; +namespace reserved +{ + +template +struct is_function_or_kernel : ::std::false_type +{}; + +template <> +struct is_function_or_kernel : ::std::true_type +{}; + +#if CUDA_VERSION >= 12000 +template <> +struct is_function_or_kernel : ::std::true_type +{}; +#endif + +template +inline constexpr bool is_function_or_kernel_v = is_function_or_kernel::value; + +} // end namespace reserved + /** * @brief Description of a CUDA kernel * @@ -61,7 +83,7 @@ struct cuda_kernel_desc // Ensure we are packing arguments of the proper types to call func (only // valid with the runtime API) - if constexpr (!::std::is_same_v) + if constexpr (!reserved::is_function_or_kernel_v) { static_assert(::std::is_invocable_v); } @@ -78,8 +100,13 @@ struct cuda_kernel_desc arg_tuple_type_erased = mv(arg_tuple); } - /* CUfunction (CUDA driver API) or __global__ function (CUDA runtime API) */ - using func_variant_t = ::std::variant; + /* CUfunction/CUkernel (CUDA driver API) or __global__ function (CUDA runtime API) */ + using func_variant_t = + ::std::variant= 12000 + CUkernel, +#endif + const void*>; func_variant_t func_variant; dim3 gridDim; dim3 blockDim; @@ -101,9 +128,11 @@ struct cuda_kernel_desc } else { - static_assert(::std::is_same_v, "Unsupported function type in func_variant"); + static_assert(reserved::is_function_or_kernel_v, "Unsupported function type in func_variant"); + + // If this is a CUkernel, the cast to a CUfunction is sufficient cuda_safe_call(cuLaunchKernel( - kernel_func, + (CUfunction) kernel_func, gridDim.x, gridDim.y, gridDim.z, @@ -125,10 +154,10 @@ struct cuda_kernel_desc [&](auto&& kernel_func) { using T = ::std::decay_t; - if constexpr (::std::is_same_v) + if constexpr (reserved::is_function_or_kernel_v) { CUDA_KERNEL_NODE_PARAMS params{ - .func = kernel_func, + .func = (CUfunction) kernel_func, .gridDimX = gridDim.x, .gridDimY = gridDim.y, .gridDimZ = gridDim.z, @@ -164,9 +193,9 @@ struct cuda_kernel_desc return ::std::visit( [](auto&& kernel_func) { using T = ::std::decay_t; - if constexpr (::std::is_same_v) + if constexpr (reserved::is_function_or_kernel_v) { - return cuda_try(CU_FUNC_ATTRIBUTE_NUM_REGS, kernel_func); + return cuda_try(CU_FUNC_ATTRIBUTE_NUM_REGS, (CUfunction) kernel_func); } else { @@ -187,6 +216,13 @@ private: return f; } +#if CUDA_VERSION >= 12000 + static func_variant_t store_func(CUkernel k) + { + return k; + } +#endif + template static func_variant_t store_func(T* f) { From b2002d865663a07322ef6954ed9b2e9cfbc9dbe6 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 18 Jul 2025 16:54:52 +0200 Subject: [PATCH 006/221] Add a test with CUfunction and CUkernel --- cudax/test/stf/CMakeLists.txt | 1 + .../test/stf/examples/cuda_kernels_driver.cu | 90 +++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 cudax/test/stf/examples/cuda_kernels_driver.cu diff --git a/cudax/test/stf/CMakeLists.txt b/cudax/test/stf/CMakeLists.txt index 75b271b5865..e594f686e05 100644 --- a/cudax/test/stf/CMakeLists.txt +++ 
b/cudax/test/stf/CMakeLists.txt @@ -20,6 +20,7 @@ set(stf_test_sources error_checks/non_managed_data.cu error_checks/uninitialized_data.cu error_checks/write_frozen.cu + examples/cuda_kernels_driver.cu examples/05-stencil-no-copy.cu examples/05-stencil-places.cu examples/05-stencil.cu diff --git a/cudax/test/stf/examples/cuda_kernels_driver.cu b/cudax/test/stf/examples/cuda_kernels_driver.cu new file mode 100644 index 00000000000..be7ae30f629 --- /dev/null +++ b/cudax/test/stf/examples/cuda_kernels_driver.cu @@ -0,0 +1,90 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Test that the cuda_kernel construct works with global kernels, CUfunction and CUkernel entries. + * + */ + +#include + +using namespace cuda::experimental::stf; + +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < x.size(); i += nthreads) + { + y(i) += a * x(i); + } +} + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +int main() +{ + context ctx; + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + + double alpha = 3.14; + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + CUfunction axpy_fun; + cuda_safe_call(cudaGetFuncBySymbol(&axpy_fun, (void *)axpy)); + + // TODO ifdef + CUkernel axpy_kernel; + cuda_safe_call(cudaGetKernel(&axpy_kernel, (void *)axpy)); + + // runtime global kernel + ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) { + // axpy<<<16, 128, 0, ...>>>(alpha, dX, dY) + return cuda_kernel_desc{axpy, 16, 128, 0, alpha, dX, dY}; + }; + + // CUfunction driver API + ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) { + return cuda_kernel_desc{axpy_fun, 16, 128, 0, alpha, dX, dY}; + }; + + // CUkernel driver API + ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) { + return cuda_kernel_desc{axpy_kernel, 16, 128, 0, alpha, dX, dY}; + }; + + ctx.finalize(); + + for (size_t i = 0; i < N; i++) + { + assert(fabs(Y[i] - (Y0(i) + 3.0*alpha * X0(i))) < 0.0001); + assert(fabs(X[i] - X0(i)) < 0.0001); + } +} From 22c758a4ef0c2561c87553b17a7c236fedca4b03 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 18 Jul 2025 17:00:55 +0200 Subject: [PATCH 007/221] Check whether CUkernel is supported --- cudax/test/stf/examples/cuda_kernels_driver.cu | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/cudax/test/stf/examples/cuda_kernels_driver.cu b/cudax/test/stf/examples/cuda_kernels_driver.cu index be7ae30f629..ec2ff2ee681 100644 --- a/cudax/test/stf/examples/cuda_kernels_driver.cu +++ b/cudax/test/stf/examples/cuda_kernels_driver.cu @@ -52,39 +52,47 @@ int main() Y[i] = Y0(i); } + // Number of times we have applied the axpy kernel + int num_axpy = 0; + double alpha = 3.14; auto lX = ctx.logical_data(X); auto lY = ctx.logical_data(Y); CUfunction axpy_fun; - cuda_safe_call(cudaGetFuncBySymbol(&axpy_fun, (void *)axpy)); + 
cuda_safe_call(cudaGetFuncBySymbol(&axpy_fun, (void*) axpy)); // TODO ifdef CUkernel axpy_kernel; - cuda_safe_call(cudaGetKernel(&axpy_kernel, (void *)axpy)); + cuda_safe_call(cudaGetKernel(&axpy_kernel, (void*) axpy)); // runtime global kernel ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) { // axpy<<<16, 128, 0, ...>>>(alpha, dX, dY) return cuda_kernel_desc{axpy, 16, 128, 0, alpha, dX, dY}; }; + num_axpy++; // CUfunction driver API ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) { return cuda_kernel_desc{axpy_fun, 16, 128, 0, alpha, dX, dY}; }; + num_axpy++; +#if CUDA_VERSION >= 12000 // CUkernel driver API ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) { return cuda_kernel_desc{axpy_kernel, 16, 128, 0, alpha, dX, dY}; }; + num_axpy++; +#endif ctx.finalize(); for (size_t i = 0; i < N; i++) { - assert(fabs(Y[i] - (Y0(i) + 3.0*alpha * X0(i))) < 0.0001); + assert(fabs(Y[i] - (Y0(i) + num_axpy * alpha * X0(i))) < 0.0001); assert(fabs(X[i] - X0(i)) < 0.0001); } } From 02ded8b60d01357da12444a43be7b7e4f5324352 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 18 Jul 2025 18:03:13 +0200 Subject: [PATCH 008/221] use _CCCL_ASSERT instead of assert to avoid an unused variable error --- cudax/test/stf/examples/cuda_kernels_driver.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cudax/test/stf/examples/cuda_kernels_driver.cu b/cudax/test/stf/examples/cuda_kernels_driver.cu index ec2ff2ee681..4100541f482 100644 --- a/cudax/test/stf/examples/cuda_kernels_driver.cu +++ b/cudax/test/stf/examples/cuda_kernels_driver.cu @@ -92,7 +92,7 @@ int main() for (size_t i = 0; i < N; i++) { - assert(fabs(Y[i] - (Y0(i) + num_axpy * alpha * X0(i))) < 0.0001); - assert(fabs(X[i] - X0(i)) < 0.0001); + _CCCL_ASSERT(fabs(Y[i] - (Y0(i) + num_axpy * alpha * X0(i))) < 0.0001, "Invalid result"); + _CCCL_ASSERT(fabs(X[i] - X0(i)) < 0.0001, "Invalid result"); } } From ebeb7031cdba7498a3e1df467accd8e115998dfb Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 18 Jul 2025 18:32:16 +0200 Subject: [PATCH 009/221] cudaGetKernel was added in CUDA 12.1 --- cudax/test/stf/examples/cuda_kernels_driver.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cudax/test/stf/examples/cuda_kernels_driver.cu b/cudax/test/stf/examples/cuda_kernels_driver.cu index 4100541f482..76ae872f71b 100644 --- a/cudax/test/stf/examples/cuda_kernels_driver.cu +++ b/cudax/test/stf/examples/cuda_kernels_driver.cu @@ -60,13 +60,6 @@ int main() auto lX = ctx.logical_data(X); auto lY = ctx.logical_data(Y); - CUfunction axpy_fun; - cuda_safe_call(cudaGetFuncBySymbol(&axpy_fun, (void*) axpy)); - - // TODO ifdef - CUkernel axpy_kernel; - cuda_safe_call(cudaGetKernel(&axpy_kernel, (void*) axpy)); - // runtime global kernel ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) { // axpy<<<16, 128, 0, ...>>>(alpha, dX, dY) @@ -74,14 +67,21 @@ int main() }; num_axpy++; + // CUfunction driver API + CUfunction axpy_fun; + cuda_safe_call(cudaGetFuncBySymbol(&axpy_fun, (void*) axpy)); + ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) { return cuda_kernel_desc{axpy_fun, 16, 128, 0, alpha, dX, dY}; }; num_axpy++; -#if CUDA_VERSION >= 12000 +#if CUDA_VERSION >= 12010 // CUkernel driver API + CUkernel axpy_kernel; + cuda_safe_call(cudaGetKernel(&axpy_kernel, (void*) axpy)); + ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) { return cuda_kernel_desc{axpy_kernel, 16, 128, 0, alpha, dX, dY}; }; From 2f298b7beb6fc1aecb5e0ee9401ae35c13ecfe36 Mon 
Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 18 Jul 2025 18:33:44 +0200 Subject: [PATCH 010/221] clang-format --- cudax/test/stf/examples/cuda_kernels_driver.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/cudax/test/stf/examples/cuda_kernels_driver.cu b/cudax/test/stf/examples/cuda_kernels_driver.cu index 76ae872f71b..6443cbdd867 100644 --- a/cudax/test/stf/examples/cuda_kernels_driver.cu +++ b/cudax/test/stf/examples/cuda_kernels_driver.cu @@ -67,7 +67,6 @@ int main() }; num_axpy++; - // CUfunction driver API CUfunction axpy_fun; cuda_safe_call(cudaGetFuncBySymbol(&axpy_fun, (void*) axpy)); From 3d9b7a5bafecafe0cee41a1c282fc76461798d85 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 20 Jul 2025 08:42:25 +0200 Subject: [PATCH 011/221] Extract the start and end phase of the ->* operator --- .../__stf/internal/cuda_kernel_scope.cuh | 132 +++++++++++------- 1 file changed, 83 insertions(+), 49 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index 1b3dfcd322a..d34955ab50c 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -209,6 +209,8 @@ struct cuda_kernel_desc } private: + // This type-erased smart pointer keeps the argument tuple valid until the + // object is destroyed, so that the pointer to these arguments remain valid ::std::shared_ptr arg_tuple_type_erased; static func_variant_t store_func(CUfunction f) @@ -292,17 +294,13 @@ public: return *this; } - /** - * @brief Takes a lambda function and executes it on the host in a graph callback node. - * - * @tparam Fun type of lambda function - * @param f Lambda function to execute - */ - template - void operator->*(Fun&& f) + auto& start() { // If a place is specified, use it - auto t = e_place ? ctx.task(e_place.value()) : ctx.task(); + support_task = e_place ? ctx.task(e_place.value()) : ctx.task(); + + // Short-hand for more readable code + auto& t = *support_task; // So that we can use get to retrieve dynamic dependencies untyped_t = t; @@ -321,53 +319,18 @@ public: t.set_symbol(symbol); } - auto& dot = *ctx.get_dot(); - auto& statistics = reserved::task_statistics::instance(); - - cudaEvent_t start_event, end_event; - const bool record_time = t.schedule_task() || statistics.is_calibrating_to_file(); + // Do we need to measure the duration of the kernel(s) ? 
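+    // (timings are recorded when the task is actually scheduled, or when we are calibrating statistics to a file)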
+ auto& statistics = reserved::task_statistics::instance(); + record_time = t.schedule_task() || statistics.is_calibrating_to_file(); + record_time_device = -1; t.start(); - int device = -1; - - SCOPE(exit) - { - t.end_uncleared(); - - if constexpr (::std::is_same_v) - { - if (record_time) - { - cuda_safe_call(cudaEventRecord(end_event, t.get_stream())); - cuda_safe_call(cudaEventSynchronize(end_event)); - - float milliseconds = 0; - cuda_safe_call(cudaEventElapsedTime(&milliseconds, start_event, end_event)); - - if (dot.is_tracing()) - { - dot.template add_vertex_timing(t, milliseconds, device); - } - - if (statistics.is_calibrating()) - { - statistics.log_task_time(t, milliseconds); - } - } - } - - t.clear(); - - // Now that we have executed 'f', we do not need to access it anymore - untyped_t.reset(); - }; - if constexpr (::std::is_same_v) { if (record_time) { - cuda_safe_call(cudaGetDevice(&device)); // We will use this to force it during the next run + cuda_safe_call(cudaGetDevice(&record_time_device)); // We will use this to force it during the next run // Events must be created here to avoid issues with multi-gpu cuda_safe_call(cudaEventCreate(&start_event)); cuda_safe_call(cudaEventCreate(&end_event)); @@ -375,11 +338,71 @@ public: } } + auto& dot = *ctx.get_dot(); if (dot.is_tracing()) { dot.template add_vertex(t); } + return *this; + } + + auto& end() + { + auto& t = *support_task; + + // We need to access the task structures (eg. to get the stream) so we do + // not clear all its resources yet. + t.end_uncleared(); + + if constexpr (::std::is_same_v) + { + if (record_time) + { + cuda_safe_call(cudaEventRecord(end_event, t.get_stream())); + cuda_safe_call(cudaEventSynchronize(end_event)); + + float milliseconds = 0; + cuda_safe_call(cudaEventElapsedTime(&milliseconds, start_event, end_event)); + + auto& dot = *ctx.get_dot(); + if (dot.is_tracing()) + { + dot.template add_vertex_timing(t, milliseconds, record_time_device); + } + + auto& statistics = reserved::task_statistics::instance(); + if (statistics.is_calibrating()) + { + statistics.log_task_time(t, milliseconds); + } + } + } + + t.clear(); + + // Now that we have executed 'f', we do not need to access it anymore + untyped_t.reset(); + + return *this; + } + + /** + * @brief Takes a lambda function and executes it on the host in a graph callback node. + * + * @tparam Fun type of lambda function + * @param f Lambda function to execute + */ + template + void operator->*(Fun&& f) + { + start(); + + SCOPE(exit) + { + end(); + }; + // If CUDASTF_CUDA_KERNEL_DEBUG is set, we display the number of registers // used by the kernel(s) static bool display_register_cnt = [] { @@ -387,6 +410,8 @@ public: return env && (atoi(env) != 0); }(); + auto& t = *support_task; + // When chained is enable, we expect a vector of kernel description which should be executed one after the other if constexpr (chained) { @@ -469,12 +494,21 @@ private: // Statically defined deps task_dep_vector deps; + // To store a task that implements cuda_kernel(_chain) + using underlying_task_type = decltype(::std::declval().task()); + ::std::optional support_task; + // Dependencies added with add_deps ::std::vector dynamic_deps; // Used to retrieve deps with t.get<>(...) ::std::optional untyped_t; ::std::optional e_place; + + // Are we making some measurements ? 
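+  // If so, start/end CUDA events are recorded around the kernel submission and the elapsed time is logged in end()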
+ bool record_time; + int record_time_device; + cudaEvent_t start_event, end_event; }; } // end namespace reserved From ac92c82e33e13fa1887317d99a86700338c2a847 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 20 Jul 2025 08:54:51 +0200 Subject: [PATCH 012/221] There is no need to store untyped_t as we now store the task with its type --- .../__stf/internal/cuda_kernel_scope.cuh | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index d34955ab50c..7f9267b0aa3 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -275,8 +275,8 @@ public: template decltype(auto) get(size_t submitted_index) const { - _CCCL_ASSERT(untyped_t.has_value(), "uninitialized task"); - return untyped_t->template get(submitted_index); + _CCCL_ASSERT(support_task.has_value(), "uninitialized task"); + return support_task->template get(submitted_index); } /** @@ -299,12 +299,8 @@ public: // If a place is specified, use it support_task = e_place ? ctx.task(e_place.value()) : ctx.task(); - // Short-hand for more readable code auto& t = *support_task; - // So that we can use get to retrieve dynamic dependencies - untyped_t = t; - t.add_deps(deps); // Append all dynamic deps @@ -381,8 +377,9 @@ public: t.clear(); - // Now that we have executed 'f', we do not need to access it anymore - untyped_t.reset(); + // Do release to the task structure as we don't need to reference it when + // we have called end() + support_task.reset(); return *this; } @@ -494,14 +491,14 @@ private: // Statically defined deps task_dep_vector deps; - // To store a task that implements cuda_kernel(_chain) + // To store a task that implements cuda_kernel(_chain). Note that we do not + // store the task with Deps... but a "dynamic" task where all dependencies + // are added using add_deps. using underlying_task_type = decltype(::std::declval().task()); ::std::optional support_task; // Dependencies added with add_deps ::std::vector dynamic_deps; - // Used to retrieve deps with t.get<>(...) - ::std::optional untyped_t; ::std::optional e_place; From 5017413eff7de4a40a3956c8b97fdb41d4488d95 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 20 Jul 2025 09:38:22 +0200 Subject: [PATCH 013/221] Implement the low level interface for cuda_kernel(_chain) with a way to avoid using the ->* operator --- .../experimental/__stf/internal/context.cuh | 25 ++++ .../__stf/internal/cuda_kernel_scope.cuh | 139 +++++++++++------- cudax/test/stf/CMakeLists.txt | 1 + .../cuda_kernel_chain-add_deps_low_level.cu | 86 +++++++++++ 4 files changed, 195 insertions(+), 56 deletions(-) create mode 100644 cudax/test/stf/interface/cuda_kernel_chain-add_deps_low_level.cu diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index e04b57ade57..00362b67a1d 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -127,6 +127,15 @@ class context return *this; } + template + auto& add_kernel_desc(Args&&... 
args) + { + payload->*[&](auto& self) { + self.add_kernel_desc(::std::forward(args)...); + }; + return *this; + } + template decltype(auto) get(size_t submitted_index) const { @@ -135,6 +144,22 @@ class context }; } + auto& start() + { + payload->*[&](auto& self) { + self.start(); + }; + return *this; + } + + auto& end() + { + payload->*[&](auto& self) { + self.end(); + }; + return *this; + } + private: ::std::variant payload; }; diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index 7f9267b0aa3..a8fdffb2e8a 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -347,6 +347,9 @@ public: { auto& t = *support_task; + // Do submit kernels + launch_kernels(); + // We need to access the task structures (eg. to get the stream) so we do // not clear all its resources yet. t.end_uncleared(); @@ -400,6 +403,49 @@ public: end(); }; + auto& t = *support_task; + + // Get the vector of kernel(s) to perform + // When chained is enable, we expect a vector of kernel description which + // should be executed one after the other. + if constexpr (chained) + { + kernel_descs = ::std::apply(f, deps.instance(t)); + assert(!kernel_descs.empty()); + } + else + { + // We have an unchained cuda_kernel, which means there is a single + // CUDA kernel described, and the function should return a single + // descriptor, not a vector + static_assert(!chained); + + cuda_kernel_desc res = ::std::apply(f, deps.instance(t)); + kernel_descs.push_back(res); + } + } + + // Manually add one kernel + auto& add_kernel_desc(cuda_kernel_desc d) + { + kernel_descs.push_back(mv(d)); + return *this; + } + + // Manually add a vector of kernels + auto& add_kernel_desc(const ::std::vector& descs) + { + for (const auto& d : descs) + { + add_kernel_desc(d); + } + return *this; + } + +private: + // This does submit all kernels and print statistics if needed + void launch_kernels() + { // If CUDASTF_CUDA_KERNEL_DEBUG is set, we display the number of registers // used by the kernel(s) static bool display_register_cnt = [] { @@ -407,85 +453,62 @@ public: return env && (atoi(env) != 0); }(); - auto& t = *support_task; - - // When chained is enable, we expect a vector of kernel description which should be executed one after the other - if constexpr (chained) + // Print some statistics if needed + if (display_register_cnt) { - ::std::vector res = ::std::apply(f, deps.instance(t)); - assert(!res.empty()); - - if (display_register_cnt) + if (kernel_descs.size() > 1) { fprintf(stderr, "cuda_kernel_chain (%s):\n", symbol.c_str()); - for (size_t i = 0; i < res.size(); i++) + for (size_t i = 0; i < kernel_descs.size(); i++) { - fprintf(stderr, "- kernel %ld uses %d register(s)\n", i, res[i].get_num_registers()); + fprintf(stderr, "- kernel %ld uses %d register(s)\n", i, kernel_descs[i].get_num_registers()); } } - - if constexpr (::std::is_same_v) + else { - auto lock = t.lock_ctx_graph(); - auto& g = t.get_ctx_graph(); + fprintf(stderr, "cuda_kernel (%s): uses %d register(s)\n", symbol.c_str(), kernel_descs[0].get_num_registers()); + } + } - // We have two situations : either there is a single kernel and we put the kernel in the context's - // graph, or we rely on a child graph - if (res.size() == 1) - { - res[0].launch_in_graph(t.get_node(), g); - } - else - { - ::std::vector& chain = t.get_node_chain(); - chain.resize(res.size()); + auto& t = 
*support_task; - // Create a chain of kernels - for (size_t i = 0; i < res.size(); i++) - { - res[i].launch_in_graph(chain[i], g); - if (i > 0) - { - cuda_safe_call(cudaGraphAddDependencies(g, &chain[i - 1], &chain[i], 1)); - } - } - } + if constexpr (::std::is_same_v) + { + auto lock = t.lock_ctx_graph(); + auto& g = t.get_ctx_graph(); + + // We have two situations : either there is a single kernel and we put the kernel in the context's + // graph, or we rely on a child graph + if (kernel_descs.size() == 1) + { + kernel_descs[0].launch_in_graph(t.get_node(), g); } else { - // Rely on stream semantic to have a dependency between the kernels - for (auto& k : res) + ::std::vector& chain = t.get_node_chain(); + chain.resize(kernel_descs.size()); + + // Create a chain of kernels + for (size_t i = 0; i < kernel_descs.size(); i++) { - k.launch(t.get_stream()); + kernel_descs[i].launch_in_graph(chain[i], g); + if (i > 0) + { + cuda_safe_call(cudaGraphAddDependencies(g, &chain[i - 1], &chain[i], 1)); + } } } } else { - // We have an unchained cuda_kernel, which means there is a single - // CUDA kernel described, and the function should return a single - // descriptor, not a vector - static_assert(!chained); - - cuda_kernel_desc res = ::std::apply(f, deps.instance(t)); - if (display_register_cnt) + // Rely on stream semantic to have a dependency between the kernels + for (auto& k : kernel_descs) { - fprintf(stderr, "cuda_kernel (%s): uses %d register(s)\n", symbol.c_str(), res.get_num_registers()); - } - - if constexpr (::std::is_same_v) - { - auto lock = t.lock_ctx_graph(); - res.launch_in_graph(t.get_node(), t.get_ctx_graph()); - } - else - { - res.launch(t.get_stream()); + k.launch(t.get_stream()); } } } -private: ::std::string symbol; Ctx& ctx; // Statically defined deps @@ -502,6 +525,10 @@ private: ::std::optional e_place; + // What kernel(s) must be done ? We also store this in a vector if there is a + // single kernel (with the cuda_kernel construct) + ::std::vector kernel_descs; + // Are we making some measurements ? bool record_time; int record_time_device; diff --git a/cudax/test/stf/CMakeLists.txt b/cudax/test/stf/CMakeLists.txt index e594f686e05..dfa935f8862 100644 --- a/cudax/test/stf/CMakeLists.txt +++ b/cudax/test/stf/CMakeLists.txt @@ -34,6 +34,7 @@ set(stf_test_sources graph/static_graph_ctx.cu hashtable/test.cu interface/cuda_kernel_chain-add_deps.cu + interface/cuda_kernel_chain-add_deps_low_level.cu interface/data_from_device_async.cu interface/move_operator.cu local_stf/legacy_to_stf.cu diff --git a/cudax/test/stf/interface/cuda_kernel_chain-add_deps_low_level.cu b/cudax/test/stf/interface/cuda_kernel_chain-add_deps_low_level.cu new file mode 100644 index 00000000000..95a50c151ea --- /dev/null +++ b/cudax/test/stf/interface/cuda_kernel_chain-add_deps_low_level.cu @@ -0,0 +1,86 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Example of task implementing a chain of CUDA kernels with dynamic dependencies (add_deps) + * + */ + +#include + +using namespace cuda::experimental::stf; + +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < x.size(); i += nthreads) + { + y(i) += a * x(i); + } +} + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +int main() +{ + context ctx = graph_ctx(); + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + + double alpha = 3.14; + double beta = 4.5; + double gamma = -4.1; + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + /* Compute Y = Y + alpha X, Y = Y + beta X and then Y = Y + gamma X */ + auto t = ctx.cuda_kernel_chain(); + t.add_deps(lX.read()); + t.add_deps(lY.rw()); + t.start(); + auto dX = t.template get>(0); + auto dY = t.template get>(1); + // clang-format off + auto descs = ::std::vector { + { axpy, 16, 128, 0, alpha, dX, dY }, + { axpy, 16, 128, 0, beta, dX, dY }, + { axpy, 16, 128, 0, gamma, dX, dY } + }; + // clang-format on + t.add_kernel_desc(descs); + t.end(); + + ctx.finalize(); + + for (size_t i = 0; i < N; i++) + { + assert(fabs(Y[i] - (Y0(i) + (alpha + beta + gamma) * X0(i))) < 0.0001); + assert(fabs(X[i] - X0(i)) < 0.0001); + } +} From 1dd1ff50b76d4a1edc300070324cf1343e8df166 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 20 Jul 2025 12:48:26 +0200 Subject: [PATCH 014/221] - Add a test to ensure we can put no arguments in the cuda_kernel_desc constructor - Implement a low level API to describe cuda_kernel_desc with an array of pointers rather than a variadic interface (and use it in a test) --- .../__stf/internal/cuda_kernel_scope.cuh | 46 ++++++++++++++-- cudax/test/stf/CMakeLists.txt | 1 + .../cuda_kernel_chain-add_deps_low_level.cu | 17 +++--- .../stf/interface/cuda_kernel_empty_args.cu | 55 +++++++++++++++++++ 4 files changed, 108 insertions(+), 11 deletions(-) create mode 100644 cudax/test/stf/interface/cuda_kernel_empty_args.cu diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index a8fdffb2e8a..f22ec6a5da0 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -67,15 +67,26 @@ inline constexpr bool is_function_or_kernel_v = is_function_or_kernel::value; */ struct cuda_kernel_desc { + cuda_kernel_desc() = default; + template cuda_kernel_desc(Fun func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, Args... args) - : func_variant(store_func(mv(func))) - , gridDim(gridDim_) - , blockDim(blockDim_) - , sharedMem(sharedMem_) + { + configure(mv(func), gridDim_, blockDim_, sharedMem_, ::std::forward(args)...); + } + + template + void configure(Fun func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, Args... 
args) { using TupleType = ::std::tuple<::std::decay_t...>; + _CCCL_ASSERT(!configured, "cuda_kernel_desc was already configured"); + + func_variant = store_func(mv(func)); + gridDim = gridDim_; + blockDim = blockDim_; + sharedMem = sharedMem_; + // We first copy all arguments into a tuple because the kernel // implementation needs pointers to the argument, so we cannot use // directly those passed in the pack of arguments @@ -98,6 +109,30 @@ struct cuda_kernel_desc // Save the tuple in a typed erased value arg_tuple_type_erased = mv(arg_tuple); + + configured = true; + } + + // It is the responsibility of the caller to unsure arguments are valid until + // the CUDA kernel construct ends + template + void configure_raw(Fun func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, int arg_cnt, const void** args) + { + _CCCL_ASSERT(!configured, "cuda_kernel_desc was already configured"); + + func_variant = store_func(mv(func)); + gridDim = gridDim_; + blockDim = blockDim_; + sharedMem = sharedMem_; + + for (int i = 0; i < arg_cnt; i++) + { + // We can safely forget the const here because CUDA will not modify the + // argument + args_ptr.push_back(const_cast(args[i])); + } + + configured = true; } /* CUfunction/CUkernel (CUDA driver API) or __global__ function (CUDA runtime API) */ @@ -230,6 +265,9 @@ private: { return reinterpret_cast(f); } + + // We can only configure the kernel descriptor once + bool configured = false; }; namespace reserved diff --git a/cudax/test/stf/CMakeLists.txt b/cudax/test/stf/CMakeLists.txt index dfa935f8862..eab95ea5e82 100644 --- a/cudax/test/stf/CMakeLists.txt +++ b/cudax/test/stf/CMakeLists.txt @@ -35,6 +35,7 @@ set(stf_test_sources hashtable/test.cu interface/cuda_kernel_chain-add_deps.cu interface/cuda_kernel_chain-add_deps_low_level.cu + interface/cuda_kernel_empty_args.cu interface/data_from_device_async.cu interface/move_operator.cu local_stf/legacy_to_stf.cu diff --git a/cudax/test/stf/interface/cuda_kernel_chain-add_deps_low_level.cu b/cudax/test/stf/interface/cuda_kernel_chain-add_deps_low_level.cu index 95a50c151ea..e95820b9e60 100644 --- a/cudax/test/stf/interface/cuda_kernel_chain-add_deps_low_level.cu +++ b/cudax/test/stf/interface/cuda_kernel_chain-add_deps_low_level.cu @@ -66,13 +66,16 @@ int main() t.start(); auto dX = t.template get>(0); auto dY = t.template get>(1); - // clang-format off - auto descs = ::std::vector { - { axpy, 16, 128, 0, alpha, dX, dY }, - { axpy, 16, 128, 0, beta, dX, dY }, - { axpy, 16, 128, 0, gamma, dX, dY } - }; - // clang-format on + ::std::vector descs; + descs.resize(3); + // Configure with types + descs[0].configure(axpy, 16, 128, 0, alpha, dX, dY); + descs[1].configure(axpy, 16, 128, 0, beta, dX, dY); + + // Configure with low level API + const void* args[3] = {&gamma, &dX, &dY}; + descs[2].configure_raw(axpy, 16, 128, 0, 3, args); + t.add_kernel_desc(descs); t.end(); diff --git a/cudax/test/stf/interface/cuda_kernel_empty_args.cu b/cudax/test/stf/interface/cuda_kernel_empty_args.cu new file mode 100644 index 00000000000..c9aeb9740cb --- /dev/null +++ b/cudax/test/stf/interface/cuda_kernel_empty_args.cu @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Make sure we can compile with no arguments in CUDA kernel + * + */ + +#include + +using namespace cuda::experimental::stf; + +__global__ void dummy() {} + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +int main() +{ + context ctx = graph_ctx(); + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + // Ensure this works without arguments in the kernel + ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto, auto) { + return cuda_kernel_desc{dummy, 16, 128, 0}; + }; + + ctx.finalize(); +} From acb2b371813f8c9b1bf5b4e40fbb8afe065f769f Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 18 Jul 2025 23:21:43 +0200 Subject: [PATCH 015/221] Start to implement C bindings for CUDASTF (this will later be used in conjunction with cython to generate python support) --- c/CMakeLists.txt | 1 + c/experimental/stf/CMakeLists.txt | 60 +++++++++++++++++++ .../stf/include/cccl/c/experimental/stf/stf.h | 22 +++++++ c/experimental/stf/src/stf.cu | 26 ++++++++ 4 files changed, 109 insertions(+) create mode 100644 c/experimental/stf/CMakeLists.txt create mode 100644 c/experimental/stf/include/cccl/c/experimental/stf/stf.h create mode 100644 c/experimental/stf/src/stf.cu diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index 7f1dbf4507b..364494da7a0 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -1 +1,2 @@ add_subdirectory(parallel) +add_subdirectory(experimental/stf/) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt new file mode 100644 index 00000000000..da4985491c7 --- /dev/null +++ b/c/experimental/stf/CMakeLists.txt @@ -0,0 +1,60 @@ +cmake_minimum_required(VERSION 3.21) + +project(CCCL_C_EXPERIMENTAL_STF LANGUAGES CUDA CXX C) + +option(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING "Build cccl.experimental.c.stf tests." OFF) + +# FIXME Ideally this would be handled by presets and install rules, but for now +# consumers may override this to control the target location of cccl.c.experimental.stf. 
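+# When this variable is left empty (the default), the usual CMake output directories are used.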
+set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY "" CACHE PATH "Override output directory for the cccl.c.experimental.stf library") +mark_as_advanced(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY) + +file(GLOB_RECURSE srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + "src/*.cu" "src/*.cuh" +) + +add_library(cccl.c.experimental.stf SHARED ${srcs}) +set_property(TARGET cccl.c.experimental.stf PROPERTY POSITION_INDEPENDENT_CODE ON) +cccl_configure_target(cccl.c.experimental.stf DIALECT 17) + +# Override the properties set by cccl_configure_target: +if (CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY) + set_target_properties(cccl.c.parallel PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY}" + ARCHIVE_OUTPUT_DIRECTORY "${CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY}" + ) +endif() + +find_package(CUDAToolkit REQUIRED) +set_target_properties(cccl.c.experimental.stf PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) +target_link_libraries(cccl.c.experimental.stf PRIVATE + CUDA::cudart_static + CUDA::nvrtc + CUDA::nvJitLink + CUDA::cuda_driver + cccl.compiler_interface_cpp20 + cccl.c.parallel.jit_template + CUB::CUB + Thrust::Thrust + nlohmann_json::nlohmann_json +) +# target_compile_definitions(cccl.c.experimental.stf PUBLIC CCCL_C_EXPERIMENTAL=1) +# target_compile_definitions(cccl.c.experimental.stf PRIVATE +# NVRTC_GET_TYPE_NAME=1 +# CUB_DISABLE_CDP=1 +# CUB_DEFINE_RUNTIME_POLICIES +# ) +target_compile_options(cccl.c.experimental.stf PRIVATE $<$:--extended-lambda>) + +target_include_directories(cccl.c.experimental.stf PUBLIC "include") +target_include_directories(cccl.c.experimental.stf PRIVATE "src") + +if (CCCL_C_Parallel_ENABLE_TESTING) + add_subdirectory(test) +endif() + +if (CCCL_C_Parallel_ENABLE_HEADER_TESTING) + include(cmake/CParallelHeaderTesting.cmake) +endif() diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h new file mode 100644 index 00000000000..126cc9424c7 --- /dev/null +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -0,0 +1,22 @@ +// TODO use CCCL_C_EXTERN_C_BEGIN/CCCL_C_EXTERN_C_END +#ifdef __cplusplus +extern "C" +{ +#endif + +typedef struct stf_ctx_handle stf_ctx_handle; + +void stf_ctx_create(stf_ctx_handle *handle); +void stf_ctx_finalize(stf_ctx_handle *handle); + +struct stf_task_handle { + void *handle; +}; + +struct stf_logical_data_handle { + void *handle; +}; + +#ifdef __cplusplus +} +#endif diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu new file mode 100644 index 00000000000..44a609b25fb --- /dev/null +++ b/c/experimental/stf/src/stf.cu @@ -0,0 +1,26 @@ +#include +//#include +#include + +using namespace cuda::experimental::stf; + +extern "C" +{ + +struct stf_ctx_handle { + context *ctx; +}; + +void stf_ctx_create(stf_ctx_handle *handle) +{ + return new context{}; +} + +void stf_ctx_finalize(stf_ctx_handle *handle) +{ + if (handle) { + handle->finalize(); + } +} + +} From acdbd2c3254312a70b9ec3070be5598c7bb85020 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 18 Jul 2025 23:38:54 +0200 Subject: [PATCH 016/221] clang-format --- .../stf/include/cccl/c/experimental/stf/stf.h | 17 ++++++------- c/experimental/stf/src/stf.cu | 24 +++++++++---------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 126cc9424c7..c4e2b3321a2 100644 --- 
a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -1,20 +1,21 @@ // TODO use CCCL_C_EXTERN_C_BEGIN/CCCL_C_EXTERN_C_END #ifdef __cplusplus -extern "C" -{ +extern "C" { #endif typedef struct stf_ctx_handle stf_ctx_handle; -void stf_ctx_create(stf_ctx_handle *handle); -void stf_ctx_finalize(stf_ctx_handle *handle); +void stf_ctx_create(stf_ctx_handle* handle); +void stf_ctx_finalize(stf_ctx_handle* handle); -struct stf_task_handle { - void *handle; +struct stf_task_handle +{ + void* handle; }; -struct stf_logical_data_handle { - void *handle; +struct stf_logical_data_handle +{ + void* handle; }; #ifdef __cplusplus diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 44a609b25fb..ea07fe9e173 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -1,26 +1,26 @@ #include -//#include +// #include #include using namespace cuda::experimental::stf; -extern "C" -{ +extern "C" { -struct stf_ctx_handle { - context *ctx; +struct stf_ctx_handle +{ + context* ctx; }; -void stf_ctx_create(stf_ctx_handle *handle) +void stf_ctx_create(stf_ctx_handle* handle) { - return new context{}; + return new context{}; } -void stf_ctx_finalize(stf_ctx_handle *handle) +void stf_ctx_finalize(stf_ctx_handle* handle) { - if (handle) { - handle->finalize(); - } + if (handle) + { + handle->finalize(); + } } - } From 6a0bec1ed3c168264a4a0ddec23ebe2e6c4ee8f2 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 18 Jul 2025 23:45:34 +0200 Subject: [PATCH 017/221] we do not have these tests yet --- c/experimental/stf/CMakeLists.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index da4985491c7..c692f4da2a1 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -51,10 +51,10 @@ target_compile_options(cccl.c.experimental.stf PRIVATE $<$ Date: Sat, 19 Jul 2025 00:01:01 +0200 Subject: [PATCH 018/221] Misc fixes for cccl.c.experimental.stf --- c/experimental/stf/CMakeLists.txt | 8 +++----- c/experimental/stf/src/stf.cu | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index c692f4da2a1..4d9d5dafcb0 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -34,11 +34,7 @@ target_link_libraries(cccl.c.experimental.stf PRIVATE CUDA::nvrtc CUDA::nvJitLink CUDA::cuda_driver - cccl.compiler_interface_cpp20 - cccl.c.parallel.jit_template - CUB::CUB - Thrust::Thrust - nlohmann_json::nlohmann_json + CCCL::cudax ) # target_compile_definitions(cccl.c.experimental.stf PUBLIC CCCL_C_EXPERIMENTAL=1) # target_compile_definitions(cccl.c.experimental.stf PRIVATE @@ -46,6 +42,8 @@ target_link_libraries(cccl.c.experimental.stf PRIVATE # CUB_DISABLE_CDP=1 # CUB_DEFINE_RUNTIME_POLICIES # ) + +target_compile_options(cccl.c.experimental.stf PRIVATE $<$:--expt-relaxed-constexpr>) target_compile_options(cccl.c.experimental.stf PRIVATE $<$:--extended-lambda>) target_include_directories(cccl.c.experimental.stf PUBLIC "include") diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index ea07fe9e173..f827481f4e5 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -13,14 +13,14 @@ struct stf_ctx_handle void stf_ctx_create(stf_ctx_handle* handle) { - return new context{}; + handle->ctx = new context{}; } void 
stf_ctx_finalize(stf_ctx_handle* handle) { if (handle) { - handle->finalize(); + handle->ctx->finalize(); } } } From e2a0d6164fdf7cfd17e0b664600701122dd5a1de Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sat, 19 Jul 2025 00:35:35 +0200 Subject: [PATCH 019/221] Add one test and redefine the ctx handle --- c/experimental/stf/CMakeLists.txt | 8 ++-- .../stf/include/cccl/c/experimental/stf/stf.h | 4 +- c/experimental/stf/src/stf.cu | 15 ++++--- c/experimental/stf/test/CMakeLists.txt | 39 +++++++++++++++++++ c/experimental/stf/test/test_ctx.cpp | 21 ++++++++++ 5 files changed, 73 insertions(+), 14 deletions(-) create mode 100644 c/experimental/stf/test/CMakeLists.txt create mode 100644 c/experimental/stf/test/test_ctx.cpp diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 4d9d5dafcb0..5e8aa8e9f50 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -49,10 +49,10 @@ target_compile_options(cccl.c.experimental.stf PRIVATE $<$ctx = new context{}; + if (handle) { + *handle = new stf_ctx_handle_t{context{}}; + } } -void stf_ctx_finalize(stf_ctx_handle* handle) +void stf_ctx_finalize(stf_ctx_handle handle) { - if (handle) - { - handle->ctx->finalize(); - } + delete handle; } } diff --git a/c/experimental/stf/test/CMakeLists.txt b/c/experimental/stf/test/CMakeLists.txt new file mode 100644 index 00000000000..f5613253a81 --- /dev/null +++ b/c/experimental/stf/test/CMakeLists.txt @@ -0,0 +1,39 @@ +cccl_get_c2h() + +function(cccl_c_experimental_stf_add_test target_name_var source) + string(REGEX REPLACE "test_([^.]*)" "cccl.c.experimental.stf.test.\\1" target_name "${source}") + set(target_name_var ${target_name} PARENT_SCOPE) + + add_executable(${target_name} "${source}") + cccl_configure_target(${target_name} DIALECT 20) + + set_target_properties(${target_name} PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) + target_link_libraries(${target_name} PRIVATE + cccl.c.experimental.stf + CUDA::cudart_static + CUDA::nvrtc + cccl.c2h.main + cccl.compiler_interface_cpp20 + CUDA::cuda_driver + CCCL::cudax + ) + + target_compile_definitions(${target_name} PRIVATE + TEST_CUB_PATH="-I${CCCL_SOURCE_DIR}/cub" + TEST_THRUST_PATH="-I${CCCL_SOURCE_DIR}/thrust" + TEST_LIBCUDACXX_PATH="-I${CCCL_SOURCE_DIR}/libcudacxx/include" + TEST_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}" + ) + + add_test(NAME ${target_name} COMMAND ${target_name}) +endfunction() + +file(GLOB test_srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + *.cu *.cpp +) + +foreach(test_src IN LISTS test_srcs) + cccl_c_experimental_stf_add_test(test_target "${test_src}") +endforeach() diff --git a/c/experimental/stf/test/test_ctx.cpp b/c/experimental/stf/test/test_ctx.cpp new file mode 100644 index 00000000000..6bbfc3d1e46 --- /dev/null +++ b/c/experimental/stf/test/test_ctx.cpp @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include + +#include + +C2H_TEST("basic stf context", "[context]") +{ + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + stf_ctx_finalize(ctx); +} From 7a6ea62e49f2367c21993ee921b7e46cffd52160 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sat, 19 Jul 2025 08:50:43 +0200 Subject: [PATCH 020/221] logical_data wrapper --- .../stf/include/cccl/c/experimental/stf/stf.h | 17 ++++---- c/experimental/stf/src/stf.cu | 34 +++++++++++++--- c/experimental/stf/test/test_logical_data.cpp | 39 +++++++++++++++++++ 3 files changed, 75 insertions(+), 15 deletions(-) create mode 100644 c/experimental/stf/test/test_logical_data.cpp diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index f19a9e75b91..d21cea1563a 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -5,18 +5,15 @@ extern "C" { typedef struct stf_ctx_handle_t* stf_ctx_handle; -void stf_ctx_create(stf_ctx_handle* handle); -void stf_ctx_finalize(stf_ctx_handle handle); +void stf_ctx_create(stf_ctx_handle* ctx); +void stf_ctx_finalize(stf_ctx_handle ctx); -struct stf_task_handle -{ - void* handle; -}; +typedef struct stf_logical_data_handle_t* stf_logical_data_handle; -struct stf_logical_data_handle -{ - void* handle; -}; +void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle *ld, void *addr, size_t sz); +void stf_logical_data_destroy(stf_ctx_handle ctx, stf_logical_data_handle ld); + +typedef struct stf_task_handle_t* stf_task_handle; #ifdef __cplusplus } diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 578757fc13e..f18ad9f22a0 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -11,15 +11,39 @@ struct stf_ctx_handle_t context ctx; }; -void stf_ctx_create(stf_ctx_handle* handle) +void stf_ctx_create(stf_ctx_handle* ctx) { - if (handle) { - *handle = new stf_ctx_handle_t{context{}}; + if (ctx) { + *ctx = new stf_ctx_handle_t{context{}}; } } -void stf_ctx_finalize(stf_ctx_handle handle) +void stf_ctx_finalize(stf_ctx_handle ctx) { - delete handle; + delete ctx; } + +struct stf_logical_data_handle_t +{ + // XXX should we always store a logical_data> instead ? + logical_data_untyped ld; +}; + +void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle *ld, void *addr, size_t sz) +{ + assert(ld); + assert(ctx); + + // Create a slice logical data + auto ld_typed = ctx->ctx.logical_data(make_slice((char *)addr, sz)); + + // Stored in its untyped version + *ld = new stf_logical_data_handle_t{ld_typed}; +} + +void stf_logical_data_destroy(stf_ctx_handle /* ctx */, stf_logical_data_handle ld) +{ + delete ld; +} + } diff --git a/c/experimental/stf/test/test_logical_data.cpp b/c/experimental/stf/test/test_logical_data.cpp new file mode 100644 index 00000000000..e91509708fb --- /dev/null +++ b/c/experimental/stf/test/test_logical_data.cpp @@ -0,0 +1,39 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include + +#include + +C2H_TEST("basic stf logical_data", "[logical_data]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lA, lB; + + float *A, *B; + A = (float *)malloc(N*sizeof(float)); + B = (float *)malloc(N*sizeof(float)); + + stf_logical_data(ctx, &lA, A, N*sizeof(float)); + stf_logical_data(ctx, &lB, B, N*sizeof(float)); + + stf_logical_data_destroy(ctx, lA); + stf_logical_data_destroy(ctx, lB); + + stf_ctx_finalize(ctx); + + free(A); + free(B); +} From 5fa13d37e0fc82352919fb88e76a7c0abd6e648c Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sat, 19 Jul 2025 09:41:32 +0200 Subject: [PATCH 021/221] More bindings --- .../stf/include/cccl/c/experimental/stf/stf.h | 25 +++++- c/experimental/stf/src/stf.cu | 83 +++++++++++++++---- c/experimental/stf/test/test_ctx.cpp | 2 +- c/experimental/stf/test/test_logical_data.cpp | 14 ++-- c/experimental/stf/test/test_task.cpp | 78 +++++++++++++++++ 5 files changed, 178 insertions(+), 24 deletions(-) create mode 100644 c/experimental/stf/test/test_task.cpp diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index d21cea1563a..50aca91b29d 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -1,8 +1,18 @@ +#include + // TODO use CCCL_C_EXTERN_C_BEGIN/CCCL_C_EXTERN_C_END #ifdef __cplusplus extern "C" { #endif +typedef enum stf_access_mode +{ + STF_NONE = 0, + STF_READ = 1 << 0, + STF_WRITE = 1 << 1, + STF_RW = STF_READ | STF_WRITE +} stf_access_mode; + typedef struct stf_ctx_handle_t* stf_ctx_handle; void stf_ctx_create(stf_ctx_handle* ctx); @@ -10,11 +20,22 @@ void stf_ctx_finalize(stf_ctx_handle ctx); typedef struct stf_logical_data_handle_t* stf_logical_data_handle; -void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle *ld, void *addr, size_t sz); -void stf_logical_data_destroy(stf_ctx_handle ctx, stf_logical_data_handle ld); +void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); +void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); +void stf_logical_data_destroy(stf_logical_data_handle ld); + +// TODO token typedef struct stf_task_handle_t* stf_task_handle; +void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t); +void stf_task_set_symbol(stf_task_handle t, const char* symbol); +void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m); +void stf_task_start(stf_task_handle t); +void stf_task_end(stf_task_handle t); +cudaStream_t stf_task_get_stream(stf_task_handle t); +void stf_task_destroy(stf_task_handle t); + #ifdef __cplusplus } #endif diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index f18ad9f22a0..8b10af298a6 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -11,39 +11,94 @@ struct stf_ctx_handle_t context ctx; }; +struct stf_logical_data_handle_t +{ + // XXX should we always store a logical_data> instead ? 
+ logical_data_untyped ld; +}; + +struct stf_task_handle_t +{ + context::unified_task<> t; +}; + void stf_ctx_create(stf_ctx_handle* ctx) { - if (ctx) { + if (ctx) + { *ctx = new stf_ctx_handle_t{context{}}; } } void stf_ctx_finalize(stf_ctx_handle ctx) { + assert(ctx); delete ctx; } -struct stf_logical_data_handle_t +void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) { - // XXX should we always store a logical_data> instead ? - logical_data_untyped ld; -}; + assert(ld); + assert(ctx); + + // Create a slice logical data + auto ld_typed = ctx->ctx.logical_data(make_slice((char*) addr, sz)); + + // Stored in its untyped version + *ld = new stf_logical_data_handle_t{ld_typed}; +} + +void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) +{ + assert(ld); + ld->ld.set_symbol(symbol); +} -void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle *ld, void *addr, size_t sz) +void stf_logical_data_destroy(stf_logical_data_handle ld) { - assert(ld); - assert(ctx); + assert(ld); + delete ld; +} - // Create a slice logical data - auto ld_typed = ctx->ctx.logical_data(make_slice((char *)addr, sz)); +void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) +{ + assert(t); + assert(ctx); - // Stored in its untyped version - *ld = new stf_logical_data_handle_t{ld_typed}; + *t = new stf_task_handle_t{ctx->ctx.task()}; } -void stf_logical_data_destroy(stf_ctx_handle /* ctx */, stf_logical_data_handle ld) +void stf_task_set_symbol(stf_task_handle t, const char* symbol) { - delete ld; + assert(t); + assert(symbol); + + t->t.set_symbol(symbol); +} + +void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m) +{ + assert(t); + assert(ld); + + t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m))); } +void stf_task_start(stf_task_handle t) +{ + assert(t); + t->t.start(); +} + +void stf_task_end(stf_task_handle t) +{ + assert(t); + t->t.end(); +} + +void stf_task_destroy(stf_task_handle t) +{ + assert(t); + delete t; +} } diff --git a/c/experimental/stf/test/test_ctx.cpp b/c/experimental/stf/test/test_ctx.cpp index 6bbfc3d1e46..86225ad91c7 100644 --- a/c/experimental/stf/test/test_ctx.cpp +++ b/c/experimental/stf/test/test_ctx.cpp @@ -9,8 +9,8 @@ //===----------------------------------------------------------------------===// #include -#include +#include #include C2H_TEST("basic stf context", "[context]") diff --git a/c/experimental/stf/test/test_logical_data.cpp b/c/experimental/stf/test/test_logical_data.cpp index e91509708fb..168ca8dabbc 100644 --- a/c/experimental/stf/test/test_logical_data.cpp +++ b/c/experimental/stf/test/test_logical_data.cpp @@ -9,8 +9,8 @@ //===----------------------------------------------------------------------===// #include -#include +#include #include C2H_TEST("basic stf logical_data", "[logical_data]") @@ -23,14 +23,14 @@ C2H_TEST("basic stf logical_data", "[logical_data]") stf_logical_data_handle lA, lB; float *A, *B; - A = (float *)malloc(N*sizeof(float)); - B = (float *)malloc(N*sizeof(float)); + A = (float*) malloc(N * sizeof(float)); + B = (float*) malloc(N * sizeof(float)); - stf_logical_data(ctx, &lA, A, N*sizeof(float)); - stf_logical_data(ctx, &lB, B, N*sizeof(float)); + stf_logical_data(ctx, &lA, A, N * sizeof(float)); + stf_logical_data(ctx, &lB, B, N * sizeof(float)); - stf_logical_data_destroy(ctx, lA); - stf_logical_data_destroy(ctx, lB); + stf_logical_data_destroy(lA); + stf_logical_data_destroy(lB); stf_ctx_finalize(ctx); diff --git 
a/c/experimental/stf/test/test_task.cpp b/c/experimental/stf/test/test_task.cpp new file mode 100644 index 00000000000..693773e42d3 --- /dev/null +++ b/c/experimental/stf/test/test_task.cpp @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include + +C2H_TEST("basic stf task", "[task]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lX, lY, lZ; + + float *X, *Y, *Z; + X = (float *)malloc(N*sizeof(float)); + Y = (float *)malloc(N*sizeof(float)); + Z = (float *)malloc(N*sizeof(float)); + + stf_logical_data(ctx, &lX, X, N*sizeof(float)); + stf_logical_data(ctx, &lY, Y, N*sizeof(float)); + stf_logical_data(ctx, &lZ, Z, N*sizeof(float)); + + stf_logical_data_set_symbol(lX, "X"); + stf_logical_data_set_symbol(lY, "Y"); + stf_logical_data_set_symbol(lZ, "Z"); + + stf_task_handle t1; + stf_task_create(ctx, &t1); + stf_task_set_symbol(t1, "T1"); + stf_task_add_dep(t1, lX, STF_RW); + stf_task_start(t1); + stf_task_end(t1); + + stf_task_handle t2; + stf_task_create(ctx, &t2); + stf_task_set_symbol(t2, "T2"); + stf_task_add_dep(t2, lX, STF_READ); + stf_task_add_dep(t2, lY, STF_RW); + stf_task_start(t2); + stf_task_end(t2); + + stf_task_handle t3; + stf_task_create(ctx, &t3); + stf_task_set_symbol(t3, "T3"); + stf_task_add_dep(t3, lX, STF_READ); + stf_task_add_dep(t3, lZ, STF_RW); + stf_task_start(t3); + stf_task_end(t3); + + stf_task_handle t4; + stf_task_create(ctx, &t4); + stf_task_set_symbol(t4, "T4"); + stf_task_add_dep(t4, lY, STF_READ); + stf_task_add_dep(t4, lZ, STF_RW); + stf_task_start(t4); + stf_task_end(t4); + + stf_logical_data_destroy(lX); + stf_logical_data_destroy(lY); + stf_logical_data_destroy(lZ); + + stf_ctx_finalize(ctx); + + free(X); + free(Y); + free(Z); +} From 968995bba0f0a1afd8faafd9c12c3d1f74f32fb5 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sat, 19 Jul 2025 09:41:52 +0200 Subject: [PATCH 022/221] expose start/end and task type in context --- .../experimental/__stf/internal/context.cuh | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index 00362b67a1d..eb003679794 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -164,6 +164,7 @@ class context ::std::variant payload; }; +public: /* * A task that can be either a stream task or a graph task. */ @@ -194,6 +195,22 @@ class context return mv(*this); } + auto& start() + { + payload->*[&](auto& self) { + self.start(); + }; + return *this; + } + + auto& end() + { + payload->*[&](auto& self) { + self.end(); + }; + return *this; + } + /** * @brief Add dependencies to this task. * @@ -238,7 +255,6 @@ class context ::std::variant, graph_task> payload; }; -public: /** * @brief Default constructor for the context class. 
*/ From 8cc6a3c6897f778521a9cbe15c24f2a7820b7602 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 20 Jul 2025 00:02:45 +0200 Subject: [PATCH 023/221] Save some WIP --- c/experimental/stf/CMakeLists.txt | 2 +- .../stf/include/cccl/c/experimental/stf/stf.h | 22 ++++++++++++++++++- c/experimental/stf/src/stf.cu | 6 +++++ c/experimental/stf/test/test_task.cpp | 2 +- 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 5e8aa8e9f50..9d8c7130c68 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -29,12 +29,12 @@ endif() find_package(CUDAToolkit REQUIRED) set_target_properties(cccl.c.experimental.stf PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) +target_link_libraries(cccl.c.experimental.stf PUBLIC CCCL::cudax) target_link_libraries(cccl.c.experimental.stf PRIVATE CUDA::cudart_static CUDA::nvrtc CUDA::nvJitLink CUDA::cuda_driver - CCCL::cudax ) # target_compile_definitions(cccl.c.experimental.stf PUBLIC CCCL_C_EXPERIMENTAL=1) # target_compile_definitions(cccl.c.experimental.stf PRIVATE diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 50aca91b29d..ab0568d4bf4 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -18,13 +18,20 @@ typedef struct stf_ctx_handle_t* stf_ctx_handle; void stf_ctx_create(stf_ctx_handle* ctx); void stf_ctx_finalize(stf_ctx_handle ctx); +// TODO stf_ctx_set_mode() + define enum with GRAPH, STREAM, ... +// TODO stf_ctx_is_graph() + +cudaStream_t stf_fence(stf_ctx_handle ctx); + typedef struct stf_logical_data_handle_t* stf_logical_data_handle; void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); void stf_logical_data_destroy(stf_logical_data_handle ld); -// TODO token +// TODO +// void stf_logical_data_wait(stf_logical_data_handle ld); +// void stf_token(stf_ctx_handle ctx); typedef struct stf_task_handle_t* stf_task_handle; @@ -34,8 +41,21 @@ void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_ void stf_task_start(stf_task_handle t); void stf_task_end(stf_task_handle t); cudaStream_t stf_task_get_stream(stf_task_handle t); +void *stf_task_get(stf_task_handle t, size_t submitted_index); void stf_task_destroy(stf_task_handle t); +typedef struct stf_kernel_desc_handle_t *stf_kernel_desc_handle; + +void stf_kernel_create(stf_kernel_desc_handle *d); +void stf_kernel_destroy(stf_kernel_desc_handle d); +// TODO stf_cuda_kernel_desc : symbol, deps, args... ? 
+// void stf_kernel_set_symbol((stf_kernel_handle k, const char* symbol) +// void stf_kernel_add_dep(stf_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); +// void stf_kernel_start(stf_kernel_handle k); +// void stf_kernel_set_args(stf_kernel_handle k, size_t cnt, void **args); +// void stf_kernel_end(stf_kernel_handle k); +// void stf_kernel_destroy(stf_kernel_handle k); + #ifdef __cplusplus } #endif diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 8b10af298a6..9636d2b1e4a 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -36,6 +36,12 @@ void stf_ctx_finalize(stf_ctx_handle ctx) delete ctx; } +cudaStream_t stf_fence(stf_ctx_handle ctx) +{ + assert(ctx); + return ctx->ctx.fence(); +} + void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) { assert(ld); diff --git a/c/experimental/stf/test/test_task.cpp b/c/experimental/stf/test/test_task.cpp index 693773e42d3..4cf11e31846 100644 --- a/c/experimental/stf/test/test_task.cpp +++ b/c/experimental/stf/test/test_task.cpp @@ -13,7 +13,7 @@ #include -C2H_TEST("basic stf task", "[task]") +C2H_TEST("empty stf tasks", "[task]") { size_t N = 1000000; From 49064eea864596cdc091815c8ff3ed74450dd2de Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 20 Jul 2025 08:03:43 +0200 Subject: [PATCH 024/221] Save some WIP --- .../stf/include/cccl/c/experimental/stf/stf.h | 6 +- c/experimental/stf/src/stf.cu | 70 ++++++++++++++++++- c/experimental/stf/test/test_task.cpp | 14 ++-- 3 files changed, 78 insertions(+), 12 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index ab0568d4bf4..f83bf1e5f40 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -41,12 +41,12 @@ void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_ void stf_task_start(stf_task_handle t); void stf_task_end(stf_task_handle t); cudaStream_t stf_task_get_stream(stf_task_handle t); -void *stf_task_get(stf_task_handle t, size_t submitted_index); +void* stf_task_get(stf_task_handle t, size_t submitted_index); void stf_task_destroy(stf_task_handle t); -typedef struct stf_kernel_desc_handle_t *stf_kernel_desc_handle; +typedef struct stf_kernel_desc_handle_t* stf_kernel_desc_handle; -void stf_kernel_create(stf_kernel_desc_handle *d); +void stf_kernel_create(stf_kernel_desc_handle* d); void stf_kernel_destroy(stf_kernel_desc_handle d); // TODO stf_cuda_kernel_desc : symbol, deps, args... ? 
// void stf_kernel_set_symbol((stf_kernel_handle k, const char* symbol) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 9636d2b1e4a..9f40efb86dc 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -38,8 +38,8 @@ void stf_ctx_finalize(stf_ctx_handle ctx) cudaStream_t stf_fence(stf_ctx_handle ctx) { - assert(ctx); - return ctx->ctx.fence(); + assert(ctx); + return ctx->ctx.fence(); } void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) @@ -107,4 +107,70 @@ void stf_task_destroy(stf_task_handle t) assert(t); delete t; } + +/** + * Low level example of cuda_kernel(_chain) + * auto t = ctx.cuda_kernel_chain(); + t.add_deps(lX.read()); + t.add_deps(lY.rw()); + t->*[&]() { + auto dX = t.template get>(0); + auto dY = t.template get>(1); + return std::vector { + { axpy, 16, 128, 0, alpha, dX, dY }, + { axpy, 16, 128, 0, beta, dX, dY }, + { axpy, 16, 128, 0, gamma, dX, dY } + }; + }; + + * + */ +struct stf_cuda_kernel_handle_t +{ + // return type of ctx.cuda_kernel() + using kernel_type = decltype(::std::declval().cuda_kernel()); + kernel_type k; +}; + +void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k) +{ + assert(k); + assert(ctx); + + *k = new stf_cuda_kernel_handle_t{ctx->ctx.cuda_kernel()}; +} + +void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol) +{ + assert(k); + assert(symbol); + + k->k.set_symbol(symbol); +} + +void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m) +{ + assert(k); + assert(ld); + + k->k.add_deps(cuda_kernel_dep_untyped(ld->ld, access_mode(m))); +} + +// void stf_cuda_kernel_start(stf_cuda_kernel_handle k) +// { +// assert(k); +// k->k.start(); +// } +// +// void stf_cuda_kernel_end(stf_cuda_kernel_handle k) +// { +// assert(k); +// k->k.end(); +// } + +void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t) +{ + assert(t); + delete t; +} } diff --git a/c/experimental/stf/test/test_task.cpp b/c/experimental/stf/test/test_task.cpp index 4cf11e31846..80266f6b381 100644 --- a/c/experimental/stf/test/test_task.cpp +++ b/c/experimental/stf/test/test_task.cpp @@ -9,8 +9,8 @@ //===----------------------------------------------------------------------===// #include -#include +#include #include C2H_TEST("empty stf tasks", "[task]") @@ -23,13 +23,13 @@ C2H_TEST("empty stf tasks", "[task]") stf_logical_data_handle lX, lY, lZ; float *X, *Y, *Z; - X = (float *)malloc(N*sizeof(float)); - Y = (float *)malloc(N*sizeof(float)); - Z = (float *)malloc(N*sizeof(float)); + X = (float*) malloc(N * sizeof(float)); + Y = (float*) malloc(N * sizeof(float)); + Z = (float*) malloc(N * sizeof(float)); - stf_logical_data(ctx, &lX, X, N*sizeof(float)); - stf_logical_data(ctx, &lY, Y, N*sizeof(float)); - stf_logical_data(ctx, &lZ, Z, N*sizeof(float)); + stf_logical_data(ctx, &lX, X, N * sizeof(float)); + stf_logical_data(ctx, &lY, Y, N * sizeof(float)); + stf_logical_data(ctx, &lZ, Z, N * sizeof(float)); stf_logical_data_set_symbol(lX, "X"); stf_logical_data_set_symbol(lY, "Y"); From f06f72e6a70e9814a85732090851abfe99d495f3 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 20 Jul 2025 10:31:35 +0200 Subject: [PATCH 025/221] Start to experiment with bindings for cuda_kernel --- .../stf/include/cccl/c/experimental/stf/stf.h | 19 +++--- c/experimental/stf/src/stf.cu | 28 ++++---- c/experimental/stf/test/test_cuda_kernel.cpp | 65 +++++++++++++++++++ 3 files changed, 87 insertions(+), 25 deletions(-) 
create mode 100644 c/experimental/stf/test/test_cuda_kernel.cpp diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index f83bf1e5f40..5e4b9ac05ff 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -44,17 +44,14 @@ cudaStream_t stf_task_get_stream(stf_task_handle t); void* stf_task_get(stf_task_handle t, size_t submitted_index); void stf_task_destroy(stf_task_handle t); -typedef struct stf_kernel_desc_handle_t* stf_kernel_desc_handle; - -void stf_kernel_create(stf_kernel_desc_handle* d); -void stf_kernel_destroy(stf_kernel_desc_handle d); -// TODO stf_cuda_kernel_desc : symbol, deps, args... ? -// void stf_kernel_set_symbol((stf_kernel_handle k, const char* symbol) -// void stf_kernel_add_dep(stf_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); -// void stf_kernel_start(stf_kernel_handle k); -// void stf_kernel_set_args(stf_kernel_handle k, size_t cnt, void **args); -// void stf_kernel_end(stf_kernel_handle k); -// void stf_kernel_destroy(stf_kernel_handle k); +typedef struct stf_cuda_kernel_handle_t* stf_cuda_kernel_handle; + +void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k); +void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol); +void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); +void stf_cuda_kernel_start(stf_cuda_kernel_handle k); +void stf_cuda_kernel_end(stf_cuda_kernel_handle k); +void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t); #ifdef __cplusplus } diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 9f40efb86dc..c897afaa66a 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -153,20 +153,20 @@ void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle l assert(k); assert(ld); - k->k.add_deps(cuda_kernel_dep_untyped(ld->ld, access_mode(m))); -} - -// void stf_cuda_kernel_start(stf_cuda_kernel_handle k) -// { -// assert(k); -// k->k.start(); -// } -// -// void stf_cuda_kernel_end(stf_cuda_kernel_handle k) -// { -// assert(k); -// k->k.end(); -// } + k->k.add_deps(task_dep_untyped(ld->ld, access_mode(m))); +} + +void stf_cuda_kernel_start(stf_cuda_kernel_handle k) +{ + assert(k); + k->k.start(); +} + +void stf_cuda_kernel_end(stf_cuda_kernel_handle k) +{ + assert(k); + k->k.end(); +} void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t) { diff --git a/c/experimental/stf/test/test_cuda_kernel.cpp b/c/experimental/stf/test/test_cuda_kernel.cpp new file mode 100644 index 00000000000..3cb3606fd3a --- /dev/null +++ b/c/experimental/stf/test/test_cuda_kernel.cpp @@ -0,0 +1,65 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +using namespace cuda::experimental::stf; + +__global__ void axpy(int cnt, double a, const double *x, double *y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < cnt; i += nthreads) + { + y[i] += a * x[i]; + } +} + +C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lX, lY; + + float *X, *Y; + X = (float*) malloc(N * sizeof(float)); + Y = (float*) malloc(N * sizeof(float)); + + stf_logical_data(ctx, &lX, X, N * sizeof(float)); + stf_logical_data(ctx, &lY, Y, N * sizeof(float)); + + stf_logical_data_set_symbol(lX, "X"); + stf_logical_data_set_symbol(lY, "Y"); + + stf_cuda_kernel_handle k; + stf_cuda_kernel_create(ctx, &k); + stf_cuda_kernel_set_symbol(k, "axpy"); + stf_cuda_kernel_add_dep(k, lX, STF_READ); + stf_cuda_kernel_add_dep(k, lY, STF_RW); + stf_cuda_kernel_start(k); + // TODO add descs + stf_cuda_kernel_end(k); + stf_cuda_kernel_destroy(k); + + stf_logical_data_destroy(lX); + stf_logical_data_destroy(lY); + + stf_ctx_finalize(ctx); + + free(X); + free(Y); +} From 08a22c9d08803ac4f1eef13de13f9d0374d87979 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Mon, 21 Jul 2025 11:48:40 +0200 Subject: [PATCH 026/221] Save WIP: we cannot directly use the pointer to a global function and pass it to a shared library, so we convert it to a CUfunction prior to calling a function in the shared library (so we do it in the header) --- c/experimental/stf/CMakeLists.txt | 2 +- .../stf/include/cccl/c/experimental/stf/stf.h | 31 ++++++++++++++ c/experimental/stf/src/stf.cu | 42 +++++++++++++++++++ ...st_cuda_kernel.cpp => test_cuda_kernel.cu} | 21 ++++++---- 4 files changed, 88 insertions(+), 8 deletions(-) rename c/experimental/stf/test/{test_cuda_kernel.cpp => test_cuda_kernel.cu} (82%) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 9d8c7130c68..5e8aa8e9f50 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -29,12 +29,12 @@ endif() find_package(CUDAToolkit REQUIRED) set_target_properties(cccl.c.experimental.stf PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) -target_link_libraries(cccl.c.experimental.stf PUBLIC CCCL::cudax) target_link_libraries(cccl.c.experimental.stf PRIVATE CUDA::cudart_static CUDA::nvrtc CUDA::nvJitLink CUDA::cuda_driver + CCCL::cudax ) # target_compile_definitions(cccl.c.experimental.stf PUBLIC CCCL_C_EXPERIMENTAL=1) # target_compile_definitions(cccl.c.experimental.stf PRIVATE diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 5e4b9ac05ff..3f118af4685 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -1,3 +1,5 @@ +#include +#include #include // TODO use CCCL_C_EXTERN_C_BEGIN/CCCL_C_EXTERN_C_END @@ -50,6 +52,35 @@ void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k); void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol); void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); void stf_cuda_kernel_start(stf_cuda_kernel_handle k); + +void stf_cuda_kernel_add_desc_cufunc( + stf_cuda_kernel_handle k, + CUfunction cufunc, + dim3 gridDim_, + dim3 blockDim_, + 
size_t sharedMem_, + int arg_cnt, + const void** args); + +/* Convert CUDA kernel address to CUfunction because we may use them from a + * shared library where this would be invalid in the runtime API. */ +static inline void stf_cuda_kernel_add_desc( + stf_cuda_kernel_handle k, + const void* func, + dim3 gridDim_, + dim3 blockDim_, + size_t sharedMem_, + int arg_cnt, + const void** args) +{ + CUfunction cufunc; + cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); + assert(res == cudaSuccess); + + stf_cuda_kernel_add_desc_cufunc(k, cufunc, gridDim_, blockDim_, sharedMem_, arg_cnt, args); +} + +void* stf_cuda_kernel_get_arg(stf_cuda_kernel_handle k, int index); void stf_cuda_kernel_end(stf_cuda_kernel_handle k); void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index c897afaa66a..63eb470b1ef 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -162,6 +162,48 @@ void stf_cuda_kernel_start(stf_cuda_kernel_handle k) k->k.start(); } +#if 0 +// +// template +// void configure_raw(Fun func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, int arg_cnt, const void** args) +void stf_cuda_kernel_add_desc(stf_cuda_kernel_handle k, const void *func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, int arg_cnt, const void** args) +{ + /* We convert the function to a CUfunction because this code is a shared + * library which cannot launch kernels using cudaLaunchKernel directly, or we + * will get invalid device function. */ + //CUfunction cufunc; + //cudaGetFuncBySymbol(&cufunc, (void *)func); + CUkernel cukernel; + cudaGetKernel(&cukernel, (void *)func); + + cuda_kernel_desc desc; + desc.configure_raw(cukernel, gridDim_, blockDim_, sharedMem_, arg_cnt, args); + + k->k.add_kernel_desc(mv(desc)); +} +#endif + +void stf_cuda_kernel_add_desc_cufunc( + stf_cuda_kernel_handle k, + CUfunction cufunc, + dim3 gridDim_, + dim3 blockDim_, + size_t sharedMem_, + int arg_cnt, + const void** args) +{ + cuda_kernel_desc desc; + desc.configure_raw(cufunc, gridDim_, blockDim_, sharedMem_, arg_cnt, args); + + k->k.add_kernel_desc(mv(desc)); +} + +void* stf_cuda_kernel_get_arg(stf_cuda_kernel_handle k, int index) +{ + auto s = k->k.template get>(index); + return (void*) s.data_handle(); +} + void stf_cuda_kernel_end(stf_cuda_kernel_handle k) { assert(k); diff --git a/c/experimental/stf/test/test_cuda_kernel.cpp b/c/experimental/stf/test/test_cuda_kernel.cu similarity index 82% rename from c/experimental/stf/test/test_cuda_kernel.cpp rename to c/experimental/stf/test/test_cuda_kernel.cu index 3cb3606fd3a..8ba6c0e90da 100644 --- a/c/experimental/stf/test/test_cuda_kernel.cpp +++ b/c/experimental/stf/test/test_cuda_kernel.cu @@ -13,17 +13,22 @@ #include #include -using namespace cuda::experimental::stf; - +#if 0 __global__ void axpy(int cnt, double a, const double *x, double *y) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int nthreads = gridDim.x * blockDim.x; - for (int i = tid; i < cnt; i += nthreads) - { - y[i] += a * x[i]; - } +// for (int i = tid; i < cnt; i += nthreads) +// { +// y[i] += a * x[i]; +// } +} +#endif + +extern "C" __global__ void axpy(int, double, const double*, double*) +{ + printf("hello.\n"); } C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") @@ -51,7 +56,9 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") stf_cuda_kernel_add_dep(k, lX, STF_READ); stf_cuda_kernel_add_dep(k, lY, STF_RW); stf_cuda_kernel_start(k); - // TODO add descs + void* dummy = nullptr; + const 
void* args[4] = {&N, &alpha, &dummy, &dummy}; + stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); stf_cuda_kernel_end(k); stf_cuda_kernel_destroy(k); From 1482bbc3a9d9311dc3212336acc8c6ff618b8fcf Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Mon, 21 Jul 2025 11:55:16 +0200 Subject: [PATCH 027/221] add missing variable --- c/experimental/stf/test/test_cuda_kernel.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/c/experimental/stf/test/test_cuda_kernel.cu b/c/experimental/stf/test/test_cuda_kernel.cu index 8ba6c0e90da..90eb7e668c5 100644 --- a/c/experimental/stf/test/test_cuda_kernel.cu +++ b/c/experimental/stf/test/test_cuda_kernel.cu @@ -44,6 +44,8 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") X = (float*) malloc(N * sizeof(float)); Y = (float*) malloc(N * sizeof(float)); + const double alpha = 3.14; + stf_logical_data(ctx, &lX, X, N * sizeof(float)); stf_logical_data(ctx, &lY, Y, N * sizeof(float)); From f45d8dabcb422668807971edf68715809e291f66 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Mon, 21 Jul 2025 11:58:07 +0200 Subject: [PATCH 028/221] Add missing finalize() call --- c/experimental/stf/src/stf.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 63eb470b1ef..70e9a972b2f 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -32,6 +32,7 @@ void stf_ctx_create(stf_ctx_handle* ctx) void stf_ctx_finalize(stf_ctx_handle ctx) { + ctx->ctx.finalize(); assert(ctx); delete ctx; } From 8b03c29b9e272aedcea89df93c700c7e9aa6042c Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Mon, 21 Jul 2025 14:46:20 +0200 Subject: [PATCH 029/221] axpy example works with cuda_kernel in C --- c/experimental/stf/test/test_cuda_kernel.cu | 32 +++++++++------------ 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/c/experimental/stf/test/test_cuda_kernel.cu b/c/experimental/stf/test/test_cuda_kernel.cu index 90eb7e668c5..e8494a1d95b 100644 --- a/c/experimental/stf/test/test_cuda_kernel.cu +++ b/c/experimental/stf/test/test_cuda_kernel.cu @@ -13,22 +13,15 @@ #include #include -#if 0 -__global__ void axpy(int cnt, double a, const double *x, double *y) +__global__ void axpy(int cnt, double a, const double* x, double* y) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int nthreads = gridDim.x * blockDim.x; -// for (int i = tid; i < cnt; i += nthreads) -// { -// y[i] += a * x[i]; -// } -} -#endif - -extern "C" __global__ void axpy(int, double, const double*, double*) -{ - printf("hello.\n"); + for (int i = tid; i < cnt; i += nthreads) + { + y[i] += a * x[i]; + } } C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") @@ -40,14 +33,14 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") stf_logical_data_handle lX, lY; - float *X, *Y; - X = (float*) malloc(N * sizeof(float)); - Y = (float*) malloc(N * sizeof(float)); + double *X, *Y; + X = (double*) malloc(N * sizeof(double)); + Y = (double*) malloc(N * sizeof(double)); const double alpha = 3.14; - stf_logical_data(ctx, &lX, X, N * sizeof(float)); - stf_logical_data(ctx, &lY, Y, N * sizeof(float)); + stf_logical_data(ctx, &lX, X, N * sizeof(double)); + stf_logical_data(ctx, &lY, Y, N * sizeof(double)); stf_logical_data_set_symbol(lX, "X"); stf_logical_data_set_symbol(lY, "Y"); @@ -58,8 +51,9 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") stf_cuda_kernel_add_dep(k, lX, STF_READ); stf_cuda_kernel_add_dep(k, lY, STF_RW); stf_cuda_kernel_start(k); - void* dummy = nullptr; - const void* args[4] = 
{&N, &alpha, &dummy, &dummy}; + double* dX = (double*) stf_cuda_kernel_get_arg(k, 0); + double* dY = (double*) stf_cuda_kernel_get_arg(k, 1); + const void* args[4] = {&N, &alpha, &dX, &dY}; stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); stf_cuda_kernel_end(k); stf_cuda_kernel_destroy(k); From 749ca3b42f1db6d411ed641faa12e9451ca8c0f8 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Mon, 21 Jul 2025 18:21:47 +0200 Subject: [PATCH 030/221] check result --- c/experimental/stf/test/test_cuda_kernel.cu | 22 +++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/c/experimental/stf/test/test_cuda_kernel.cu b/c/experimental/stf/test/test_cuda_kernel.cu index e8494a1d95b..b5ba66b0f3a 100644 --- a/c/experimental/stf/test/test_cuda_kernel.cu +++ b/c/experimental/stf/test/test_cuda_kernel.cu @@ -24,6 +24,16 @@ __global__ void axpy(int cnt, double a, const double* x, double* y) } } +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") { size_t N = 1000000; @@ -37,6 +47,12 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") X = (double*) malloc(N * sizeof(double)); Y = (double*) malloc(N * sizeof(double)); + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + const double alpha = 3.14; stf_logical_data(ctx, &lX, X, N * sizeof(double)); @@ -63,6 +79,12 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") stf_ctx_finalize(ctx); + for (size_t i = 0; i < N; i++) + { + assert(fabs(Y[i] - (Y0(i) + alpha * X0(i))) < 0.0001); + assert(fabs(X[i] - X0(i)) < 0.0001); + } + free(X); free(Y); } From 4fb70006073558ae9eedb695efa2a505cb438b7a Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Mon, 21 Jul 2025 18:42:39 +0200 Subject: [PATCH 031/221] Add ctx_token --- .../stf/include/cccl/c/experimental/stf/stf.h | 3 +- c/experimental/stf/src/stf.cu | 8 ++ c/experimental/stf/test/test_token.cpp | 78 +++++++++++++++++++ 3 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 c/experimental/stf/test/test_token.cpp diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 3f118af4685..359425aadfb 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -33,7 +33,8 @@ void stf_logical_data_destroy(stf_logical_data_handle ld); // TODO // void stf_logical_data_wait(stf_logical_data_handle ld); -// void stf_token(stf_ctx_handle ctx); + +void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld); typedef struct stf_task_handle_t* stf_task_handle; diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 70e9a972b2f..df474582d21 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -67,6 +67,14 @@ void stf_logical_data_destroy(stf_logical_data_handle ld) delete ld; } +void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) +{ + assert(ctx); + assert(ld); + + *ld = new stf_logical_data_handle_t{ctx->ctx.token()}; +} + void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) { assert(t); diff --git a/c/experimental/stf/test/test_token.cpp b/c/experimental/stf/test/test_token.cpp new file mode 100644 index 00000000000..ccd7f0a9e2c --- /dev/null +++ b/c/experimental/stf/test/test_token.cpp @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core 
Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +C2H_TEST("stf token", "[token]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lX, lY, lZ; + + float *X, *Y, *Z; + X = (float*) malloc(N * sizeof(float)); + Y = (float*) malloc(N * sizeof(float)); + Z = (float*) malloc(N * sizeof(float)); + + stf_token(ctx, &lX); + stf_token(ctx, &lY); + stf_token(ctx, &lZ); + + stf_logical_data_set_symbol(lX, "X"); + stf_logical_data_set_symbol(lY, "Y"); + stf_logical_data_set_symbol(lZ, "Z"); + + stf_task_handle t1; + stf_task_create(ctx, &t1); + stf_task_set_symbol(t1, "T1"); + stf_task_add_dep(t1, lX, STF_RW); + stf_task_start(t1); + stf_task_end(t1); + + stf_task_handle t2; + stf_task_create(ctx, &t2); + stf_task_set_symbol(t2, "T2"); + stf_task_add_dep(t2, lX, STF_READ); + stf_task_add_dep(t2, lY, STF_RW); + stf_task_start(t2); + stf_task_end(t2); + + stf_task_handle t3; + stf_task_create(ctx, &t3); + stf_task_set_symbol(t3, "T3"); + stf_task_add_dep(t3, lX, STF_READ); + stf_task_add_dep(t3, lZ, STF_RW); + stf_task_start(t3); + stf_task_end(t3); + + stf_task_handle t4; + stf_task_create(ctx, &t4); + stf_task_set_symbol(t4, "T4"); + stf_task_add_dep(t4, lY, STF_READ); + stf_task_add_dep(t4, lZ, STF_RW); + stf_task_start(t4); + stf_task_end(t4); + + stf_logical_data_destroy(lX); + stf_logical_data_destroy(lY); + stf_logical_data_destroy(lZ); + + stf_ctx_finalize(ctx); + + free(X); + free(Y); + free(Z); +} From 94757f84107aedbf2d8c0f43248e7e2f7ace8286 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 22 Jul 2025 16:52:37 +0200 Subject: [PATCH 032/221] make cudax usable in the python dir --- python/cuda_cccl/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 5fe0ba6d692..7736023692d 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -22,6 +22,7 @@ cccl_build_compiler_targets() # Build and install C++ library first set(CCCL_ENABLE_C ON) +set(CCCL_ENABLE_UNSTABLE ON) set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) add_subdirectory(${_cccl_root} _parent_cccl) @@ -29,6 +30,7 @@ add_subdirectory(${_cccl_root} _parent_cccl) find_package(CUB REQUIRED) find_package(Thrust REQUIRED) find_package(libcudacxx REQUIRED) +find_package(cudax REQUIRED) # Install headers set(_dest_incl_dir cuda/cccl/headers/include) From 189e0832acdf8a0939c8041f0e7b52b876dcf500 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Wed, 23 Jul 2025 11:30:24 +0200 Subject: [PATCH 033/221] Save WIP with python --- python/cuda_cccl/CMakeLists.txt | 23 +++++++++++ .../experimental/stf/_stf_bindings_impl.pyx | 38 +++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 7736023692d..b1f693078b1 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -114,3 +114,26 @@ target_link_libraries(_bindings_impl PRIVATE cccl.c.parallel CUDA::cuda_driver) set_target_properties(_bindings_impl PROPERTIES 
INSTALL_RPATH "$ORIGIN/cccl") install(TARGETS _bindings_impl DESTINATION cuda/cccl/parallel/experimental) + +message(STATUS "STF Using Cython ${CYTHON_VERSION}") +set(stf_pyx_source_file "${cuda_cccl_SOURCE_DIR}/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx") +set(_stf_generated_extension_src "${cuda_cccl_BINARY_DIR}/_stf_bindings_impl.c") +set(_stf_depfile "${cuda_cccl_BINARY_DIR}/_stf_bindings_impl.c.dep") +add_custom_command( + OUTPUT "${_stf_generated_extension_src}" + COMMAND "${Python3_EXECUTABLE}" -m cython + ARGS ${CYTHON_FLAGS_LIST} "${stf_pyx_source_file}" --output-file ${_stf_generated_extension_src} + DEPENDS "${stf_pyx_source_file}" + DEPFILE "${_stf_depfile}" +) +set_source_files_properties("${_stf_generated_extension_src}" PROPERTIES GENERATED TRUE) +add_custom_target(cythonize_stf_bindings_impl ALL + DEPENDS "${_stf_generated_extension_src}" +) + +Python3_add_library(_stf_bindings_impl MODULE WITH_SOABI "${_stf_generated_extension_src}") +add_dependencies(_stf_bindings_impl cythonize_stf_bindings_impl) +target_link_libraries(_stf_bindings_impl PRIVATE cccl.c.experimental.stf CUDA::cuda_driver) +set_target_properties(_stf_bindings_impl PROPERTIES INSTALL_RPATH "$ORIGIN/cccl") + +install(TARGETS _stf_bindings_impl DESTINATION cuda/cccl/experimental/stf) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx new file mode 100644 index 00000000000..913016c2b35 --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -0,0 +1,38 @@ +# distutils: language = c++ +# cython: language_level=3 +# cython: linetrace=True + +# Python signatures are declared in the companion Python stub file _bindings.pyi +# Make sure to update PYI with change to Python API to ensure that Python +# static type checker tools like mypy green-lights cuda.cccl.parallel + +from libc.string cimport memset, memcpy +from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t +from cpython.bytes cimport PyBytes_FromStringAndSize + +from cpython.buffer cimport ( + Py_buffer, PyBUF_SIMPLE, PyBUF_ANY_CONTIGUOUS, + PyBuffer_Release, PyObject_CheckBuffer, PyObject_GetBuffer +) +from cpython.pycapsule cimport ( + PyCapsule_CheckExact, PyCapsule_IsValid, PyCapsule_GetPointer +) + +import ctypes + +cdef extern from "": + cdef struct OpaqueCUstream_st + cdef struct OpaqueCUkernel_st + cdef struct OpaqueCUlibrary_st + + ctypedef int CUresult + ctypedef OpaqueCUstream_st *CUstream + ctypedef OpaqueCUkernel_st *CUkernel + ctypedef OpaqueCUlibrary_st *CUlibrary + +cdef extern from "cccl/c/experimental/stf/stf.h": + ctypedef struct stf_ctx_handle_t + ctypedef stf_ctx_handle_t* stf_ctx_handle + + void stf_ctx_create(stf_ctx_handle* ctx) + void stf_ctx_finalize(stf_ctx_handle ctx) From 3ed26acdccca4c9caa4f4660fa41b2b562b440d2 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Wed, 23 Jul 2025 17:27:52 +0200 Subject: [PATCH 034/221] fix a typo --- c/experimental/stf/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 5e8aa8e9f50..13abde52a2d 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -21,7 +21,7 @@ cccl_configure_target(cccl.c.experimental.stf DIALECT 17) # Override the properties set by cccl_configure_target: if (CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY) - set_target_properties(cccl.c.parallel PROPERTIES + 
set_target_properties(cccl.c.experimental.stf PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY}" ARCHIVE_OUTPUT_DIRECTORY "${CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY}" ) From 1a5038ad2cd650acbc1bd52881885dfdb9061493 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Wed, 23 Jul 2025 17:33:59 +0200 Subject: [PATCH 035/221] fixed in python for stf --- python/cuda_cccl/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index b1f693078b1..ff7c2c8f8ca 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -24,6 +24,7 @@ cccl_build_compiler_targets() set(CCCL_ENABLE_C ON) set(CCCL_ENABLE_UNSTABLE ON) set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) +set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) add_subdirectory(${_cccl_root} _parent_cccl) # Now we can find CUB and other components From 0c1a2ae5a306cec47a1f46117ea1bdeabc1531da Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Wed, 23 Jul 2025 18:14:20 +0200 Subject: [PATCH 036/221] Add a minimalistic Ctx class --- .../cuda/cccl/experimental/stf/_stf_bindings_impl.pyx | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 913016c2b35..53775e61df3 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -36,3 +36,14 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_ctx_create(stf_ctx_handle* ctx) void stf_ctx_finalize(stf_ctx_handle ctx) + +cdef class Ctx: + cdef stf_ctx_handle _ctx + + def __cinit__(self): + stf_ctx_create(&self._ctx) + + def __dealloc__(self): + if self._ctx != NULL: + stf_ctx_finalize(self._ctx) + self._ctx = NULL From 2f519e8dce30a0388cb7dd9e3d14c74b5e05964d Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Wed, 23 Jul 2025 19:06:12 +0200 Subject: [PATCH 037/221] Fix installation paths --- c/experimental/stf/CMakeLists.txt | 2 +- python/cuda_cccl/CMakeLists.txt | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 13abde52a2d..f151e8bf766 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -7,7 +7,7 @@ option(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING "Build cccl.experimental.c.stf tes # FIXME Ideally this would be handled by presets and install rules, but for now # consumers may override this to control the target location of cccl.c.experimental.stf. 
set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY "" CACHE PATH "Override output directory for the cccl.c.experimental.stf library") -mark_as_advanced(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY) +mark_as_advanced(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY) file(GLOB_RECURSE srcs RELATIVE "${CMAKE_CURRENT_LIST_DIR}" diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index ff7c2c8f8ca..f3fc06163e2 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -58,6 +58,11 @@ install( DESTINATION cuda/cccl/parallel/experimental/cccl ) +install( + TARGETS cccl.c.experimental.stf + DESTINATION cuda/cccl/experimental/stf/cccl +) + # Build and install Cython extension find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED) From 469ff3352482f9cf2259b5067fb0a3f6b1a7f920 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 00:50:45 +0200 Subject: [PATCH 038/221] Add a dummy STF test --- python/cuda_cccl/tests/stf/test_context.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 python/cuda_cccl/tests/stf/test_context.py diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py new file mode 100644 index 00000000000..b97922488fd --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -0,0 +1,9 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from cuda.cccl.experimental.stf import _stf_bindings_impl + +def test_ctx(): + ctx = _stf_bindings_impl.Ctx() + del ctx From c2a8fde9513b021c67d056901bd1f1843b8fdba2 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 08:12:40 +0200 Subject: [PATCH 039/221] logical_data bindings --- .../experimental/stf/_stf_bindings_impl.pyx | 49 +++++++++++++++++-- python/cuda_cccl/tests/stf/test_context.py | 15 +++++- 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 53775e61df3..2bad6f936b8 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -6,9 +6,11 @@ # Make sure to update PYI with change to Python API to ensure that Python # static type checker tools like mypy green-lights cuda.cccl.parallel -from libc.string cimport memset, memcpy -from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t +from cpython.buffer cimport Py_buffer, PyObject_GetBuffer, PyBuffer_Release from cpython.bytes cimport PyBytes_FromStringAndSize +from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t +from libc.stdint cimport uintptr_t +from libc.string cimport memset, memcpy from cpython.buffer cimport ( Py_buffer, PyBUF_SIMPLE, PyBUF_ANY_CONTIGUOUS, @@ -33,11 +35,39 @@ cdef extern from "": cdef extern from "cccl/c/experimental/stf/stf.h": ctypedef struct stf_ctx_handle_t ctypedef stf_ctx_handle_t* stf_ctx_handle - void stf_ctx_create(stf_ctx_handle* ctx) void stf_ctx_finalize(stf_ctx_handle ctx) -cdef class Ctx: + ctypedef struct stf_logical_data_handle_t + ctypedef stf_logical_data_handle_t* stf_logical_data_handle + void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) + void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) + void 
stf_logical_data_destroy(stf_logical_data_handle ld) + +# Python-visible class +cdef class logical_data: + cdef stf_logical_data_handle _ld + + def __cinit__(self, context ctx, object buf): + cdef Py_buffer view + if PyObject_GetBuffer(buf, &view, PyBUF_SIMPLE) != 0: + raise ValueError("object doesn’t support the buffer protocol") + + try: + stf_logical_data(ctx._ctx, &self._ld, view.buf, view.len) + + finally: + PyBuffer_Release(&view) + + def set_symbol(self, str name): + stf_logical_data_set_symbol(self._ld, name.encode()) + + def __dealloc__(self): + if self._ld != NULL: + stf_logical_data_destroy(self._ld) + self._ld = NULL + +cdef class context: cdef stf_ctx_handle _ctx def __cinit__(self): @@ -47,3 +77,14 @@ cdef class Ctx: if self._ctx != NULL: stf_ctx_finalize(self._ctx) self._ctx = NULL + + def logical_data(self, object buf): + """ + Create and return a `logical_data` object bound to this context. + + Parameters + ---------- + buf : any buffer‑supporting Python object + (NumPy array, bytes, bytearray, memoryview, …) + """ + return logical_data(self, buf) diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py index b97922488fd..df0c75d58fb 100644 --- a/python/cuda_cccl/tests/stf/test_context.py +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -2,8 +2,19 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -from cuda.cccl.experimental.stf import _stf_bindings_impl +from cuda.cccl.experimental.stf._stf_bindings_impl import logical_data, context +import ctypes +import numpy as np def test_ctx(): - ctx = _stf_bindings_impl.Ctx() + ctx = _stf_bindings_impl.context() + del ctx + +def test_ctx2(): + X = np.ones(16, dtype=np.float32) + Y = np.ones(16, dtype=np.float32) + + ctx = _stf_bindings_impl.context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) del ctx From 3a39aa535a074e3f4a88fc7d2183fac41dd03b19 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 09:51:13 +0200 Subject: [PATCH 040/221] deps --- .../experimental/stf/_stf_bindings_impl.pyx | 47 ++++++++++++++++++- python/cuda_cccl/tests/stf/test_context.py | 15 +++++- 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 2bad6f936b8..3c0606f6d6d 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -21,6 +21,7 @@ from cpython.pycapsule cimport ( ) import ctypes +from enum import IntFlag cdef extern from "": cdef struct OpaqueCUstream_st @@ -44,7 +45,29 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) void stf_logical_data_destroy(stf_logical_data_handle ld) -# Python-visible class + ctypedef struct stf_task_handle_t + ctypedef stf_task_handle_t* stf_task_handle + void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) + void stf_task_set_symbol(stf_task_handle t, const char* symbol) + void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m) + void stf_task_start(stf_task_handle t) + void stf_task_end(stf_task_handle t) + # cudaStream_t stf_task_get_stream(stf_task_handle t) + void* stf_task_get(stf_task_handle t, size_t submitted_index) + void stf_task_destroy(stf_task_handle t) + + cdef enum stf_access_mode: + STF_NONE + STF_READ + STF_WRITE + STF_RW + +class AccessMode(IntFlag): 
+ NONE = STF_NONE + READ = STF_READ + WRITE = STF_WRITE + RW = STF_RW + cdef class logical_data: cdef stf_logical_data_handle _ld @@ -67,6 +90,25 @@ cdef class logical_data: stf_logical_data_destroy(self._ld) self._ld = NULL +cdef class task: + cdef stf_task_handle _t + + def __cinit__(self, context ctx): + stf_task_create(ctx._ctx, &self._t) + + def __dealloc__(self): + if self._t != NULL: + stf_task_destroy(self._t) + + def start(self): + stf_task_start(self._t) + + def end(self): + stf_task_end(self._t) + + def add_dep(self, logical_data ld, int mode): + stf_task_add_dep(self._t, ld._ld, mode) + cdef class context: cdef stf_ctx_handle _ctx @@ -88,3 +130,6 @@ cdef class context: (NumPy array, bytes, bytearray, memoryview, …) """ return logical_data(self, buf) + + def task(self): + return task(self) diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py index df0c75d58fb..6c274298922 100644 --- a/python/cuda_cccl/tests/stf/test_context.py +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -from cuda.cccl.experimental.stf._stf_bindings_impl import logical_data, context +from cuda.cccl.experimental.stf._stf_bindings_impl import logical_data, context, AccessMode import ctypes import numpy as np @@ -14,7 +14,18 @@ def test_ctx2(): X = np.ones(16, dtype=np.float32) Y = np.ones(16, dtype=np.float32) - ctx = _stf_bindings_impl.context() + ctx = context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) + + t = ctx.task() + t.add_dep(lX, AccessMode.READ.value) + t.add_dep(lY, AccessMode.RW.value) + t.start() + t.end() + del ctx + +if __name__ == "__main__": + print("Running CUDASTF examples...") + test_ctx2() From 4491ff37232178afc63a92c3c36b489ec16e179b Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 10:49:41 +0200 Subject: [PATCH 041/221] better task api --- .../experimental/stf/_stf_bindings_impl.pyx | 46 +++++++++++++++++-- python/cuda_cccl/tests/stf/test_context.py | 11 +++-- 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 3c0606f6d6d..7f600ff50d0 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -90,6 +90,22 @@ cdef class logical_data: stf_logical_data_destroy(self._ld) self._ld = NULL +class dep: + __slots__ = ("ld", "mode") + def __init__(self, logical_data ld, int mode): + self.ld = ld + self.mode = mode + def __iter__(self): # nice unpacking support + yield self.ld + yield self.mode + def __repr__(self): + return f"dep({self.ld!r}, {self.mode})" + +# optional sugar +def read(ld): return dep(ld, AccessMode.READ.value) +def write(ld): return dep(ld, AccessMode.WRITE.value) +def rw(ld): return dep(ld, AccessMode.RW.value) + cdef class task: cdef stf_task_handle _t @@ -106,8 +122,18 @@ cdef class task: def end(self): stf_task_end(self._t) - def add_dep(self, logical_data ld, int mode): - stf_task_add_dep(self._t, ld._ld, mode) + def add_dep(self, object d): + """ + Accept a `dep` instance created with read(ld), write(ld), or rw(ld). 
+ """ + if not isinstance(d, dep): + raise TypeError("add_dep expects read(ld), write(ld) or rw(ld)") + + cdef logical_data ldata = d.ld + cdef int mode_int = int(d.mode) + cdef stf_access_mode mode_ce = mode_int + + stf_task_add_dep(self._t, ldata._ld, mode_ce) cdef class context: cdef stf_ctx_handle _ctx @@ -131,5 +157,17 @@ cdef class context: """ return logical_data(self, buf) - def task(self): - return task(self) + def task(self, *deps): + """ + Create a `task` + + Example + ------- + >>> t = ctx.task(read(lX), rw(lY)) + >>> t.start() + >>> t.end() + """ + t = task(self) # construct with this context + for d in deps: + t.add_dep(d) # your existing add_dep logic + return t diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py index 6c274298922..fe5d5d753bc 100644 --- a/python/cuda_cccl/tests/stf/test_context.py +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -from cuda.cccl.experimental.stf._stf_bindings_impl import logical_data, context, AccessMode +from cuda.cccl.experimental.stf._stf_bindings_impl import logical_data, context, AccessMode, read, rw, write import ctypes import numpy as np @@ -18,12 +18,15 @@ def test_ctx2(): lX = ctx.logical_data(X) lY = ctx.logical_data(Y) - t = ctx.task() - t.add_dep(lX, AccessMode.READ.value) - t.add_dep(lY, AccessMode.RW.value) + t = ctx.task(read(lX), rw(lY)) t.start() t.end() + t2 = ctx.task() + t2.add_dep(rw(lX)) + t2.start() + t2.end() + del ctx if __name__ == "__main__": From 74b430ce4b1e2011a6edcb2cadbe3d5cc04cd1e5 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 11:21:24 +0200 Subject: [PATCH 042/221] test with context managers --- .../experimental/stf/_stf_bindings_impl.pyx | 12 ++++++ python/cuda_cccl/tests/stf/test_context.py | 41 +++++++++++++++++-- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 7f600ff50d0..a1f94b955cf 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -135,6 +135,18 @@ cdef class task: stf_task_add_dep(self._t, ldata._ld, mode_ce) + # ---- context‑manager helpers ------------------------------- + def __enter__(self): + self.start() + return self + + def __exit__(self, object exc_type, object exc, object tb): + """ + Always called, even if an exception occurred inside the block. 
+ """ + self.end() + return False + cdef class context: cdef stf_ctx_handle _ctx diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py index fe5d5d753bc..8fbda4e4105 100644 --- a/python/cuda_cccl/tests/stf/test_context.py +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -13,22 +13,55 @@ def test_ctx(): def test_ctx2(): X = np.ones(16, dtype=np.float32) Y = np.ones(16, dtype=np.float32) + Z = np.ones(16, dtype=np.float32) ctx = context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Y) - t = ctx.task(read(lX), rw(lY)) + t = ctx.task(rw(lX)) t.start() t.end() - t2 = ctx.task() - t2.add_dep(rw(lX)) + t2 = ctx.task(read(lX), rw(lY)) t2.start() t2.end() + t3 = ctx.task(read(lX), rw(lZ)) + t3.start() + t3.end() + + t4 = ctx.task(read(lY), rw(lZ)) + t4.start() + t4.end() + + del ctx + +def test_ctx3(): + X = np.ones(16, dtype=np.float32) + Y = np.ones(16, dtype=np.float32) + Z = np.ones(16, dtype=np.float32) + + ctx = context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Y) + + with ctx.task(rw(lX)): + pass + + with ctx.task(read(lX), rw(lY)): + pass + + with ctx.task(read(lX), rw(lZ)): + pass + + with ctx.task(read(lY), rw(lZ)): + pass + del ctx if __name__ == "__main__": print("Running CUDASTF examples...") - test_ctx2() + test_ctx3() From 479c24b9180354fa97e60e7cfde203313c384412 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 12:49:19 +0200 Subject: [PATCH 043/221] context task get_stream --- .../experimental/__stf/internal/context.cuh | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index eb003679794..ff6b69859b4 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -251,6 +251,16 @@ public: }; } + cudaStream_t get_stream() const + { + if (auto p = ::std::get_if>(&payload)) + { + return p->get_stream(); + } + + return nullptr; + } + private: ::std::variant, graph_task> payload; }; @@ -1501,6 +1511,32 @@ UNITTEST("token vector") ctx.finalize(); }; +UNITTEST("get_stream") +{ + context ctx; + + auto token = ctx.token(); + auto t = ctx.task(token.write()); + t.start(); + cudaStream_t s = t.get_stream(); + EXPECT(s != nullptr); + t.end(); + ctx.finalize(); +}; + +UNITTEST("get_stream graph") +{ + context ctx = graph_ctx(); + + auto token = ctx.token(); + auto t = ctx.task(token.write()); + t.start(); + cudaStream_t s = t.get_stream(); + EXPECT(s == nullptr); + t.end(); + ctx.finalize(); +}; + #endif // UNITTESTED_FILE } // end namespace cuda::experimental::stf From d4286511e771b2a9d131f4ef18b307560b44d321 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 12:49:34 +0200 Subject: [PATCH 044/221] Fix python examples --- python/cuda_cccl/tests/stf/test_context.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py index 8fbda4e4105..79f443ac2cd 100644 --- a/python/cuda_cccl/tests/stf/test_context.py +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -2,14 +2,16 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -from cuda.cccl.experimental.stf._stf_bindings_impl import logical_data, context, AccessMode, read, rw, write -import ctypes import numpy as np +from 
cuda.cccl.experimental.stf._stf_bindings_impl import context, read, rw + + def test_ctx(): - ctx = _stf_bindings_impl.context() + ctx = context() del ctx + def test_ctx2(): X = np.ones(16, dtype=np.float32) Y = np.ones(16, dtype=np.float32) @@ -18,7 +20,7 @@ def test_ctx2(): ctx = context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) - lZ = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) t = ctx.task(rw(lX)) t.start() @@ -38,6 +40,7 @@ def test_ctx2(): del ctx + def test_ctx3(): X = np.ones(16, dtype=np.float32) Y = np.ones(16, dtype=np.float32) @@ -46,7 +49,7 @@ def test_ctx3(): ctx = context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) - lZ = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) with ctx.task(rw(lX)): pass @@ -62,6 +65,7 @@ def test_ctx3(): del ctx + if __name__ == "__main__": print("Running CUDASTF examples...") test_ctx3() From f7c74628e0d45e915e54b269786c77bbf746ddc0 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 12:59:05 +0200 Subject: [PATCH 045/221] fix unused var --- c/experimental/stf/include/cccl/c/experimental/stf/stf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 359425aadfb..29a882d75e0 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -75,7 +75,7 @@ static inline void stf_cuda_kernel_add_desc( const void** args) { CUfunction cufunc; - cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); + [[maybe_unused]] cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); assert(res == cudaSuccess); stf_cuda_kernel_add_desc_cufunc(k, cufunc, gridDim_, blockDim_, sharedMem_, arg_cnt, args); From 17a31b9b1d5c82c4ca65b81f9c39cbd3488c5517 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 15:26:40 +0200 Subject: [PATCH 046/221] Add const qualifiers --- cudax/include/cuda/experimental/__stf/stream/stream_task.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh index b8143ac57d7..e5552721813 100644 --- a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh @@ -73,7 +73,7 @@ public: // Returns the stream associated to that task : any asynchronous operation // in the task body should be performed asynchronously with respect to that // CUDA stream - cudaStream_t get_stream() + cudaStream_t get_stream() const { const auto& e_place = get_exec_place(); if (e_place.is_grid()) @@ -89,7 +89,7 @@ public: } // TODO use a pos4 and check that we have a grid, of the proper dimension - cudaStream_t get_stream(size_t pos) + cudaStream_t get_stream(size_t pos) const { const auto& e_place = get_exec_place(); From aaf503f0b7929a4aa51e9b1f23c608dd0770e108 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 23:33:51 +0200 Subject: [PATCH 047/221] NUMBA interop --- .../stf/include/cccl/c/experimental/stf/stf.h | 4 +- c/experimental/stf/src/stf.cu | 13 +++ .../experimental/stf/_stf_bindings_impl.pyx | 79 ++++++++++++++++++- python/cuda_cccl/tests/stf/test_numba.py | 66 ++++++++++++++++ 4 files changed, 157 insertions(+), 5 deletions(-) create mode 100644 python/cuda_cccl/tests/stf/test_numba.py diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h 
b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 29a882d75e0..639b02b503f 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -43,8 +43,8 @@ void stf_task_set_symbol(stf_task_handle t, const char* symbol); void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m); void stf_task_start(stf_task_handle t); void stf_task_end(stf_task_handle t); -cudaStream_t stf_task_get_stream(stf_task_handle t); -void* stf_task_get(stf_task_handle t, size_t submitted_index); +CUstream stf_task_get_custream(stf_task_handle t); +void* stf_task_get(stf_task_handle t, int submitted_index); void stf_task_destroy(stf_task_handle t); typedef struct stf_cuda_kernel_handle_t* stf_cuda_kernel_handle; diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index df474582d21..2d31ea6907a 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -99,6 +99,13 @@ void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_ t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m))); } +void* stf_task_get(stf_task_handle t, int index) +{ + assert(t); + auto s = t->t.template get>(index); + return (void*) s.data_handle(); +} + void stf_task_start(stf_task_handle t) { assert(t); @@ -111,6 +118,12 @@ void stf_task_end(stf_task_handle t) t->t.end(); } +CUstream stf_task_get_custream(stf_task_handle t) +{ + assert(t); + return (CUstream)t->t.get_stream(); +} + void stf_task_destroy(stf_task_handle t) { assert(t); diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index a1f94b955cf..5825dcf4365 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -7,11 +7,16 @@ # static type checker tools like mypy green-lights cuda.cccl.parallel from cpython.buffer cimport Py_buffer, PyObject_GetBuffer, PyBuffer_Release +from cpython.buffer cimport Py_buffer, PyBUF_FORMAT, PyBUF_ND, PyObject_GetBuffer, PyBuffer_Release from cpython.bytes cimport PyBytes_FromStringAndSize from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t from libc.stdint cimport uintptr_t from libc.string cimport memset, memcpy +import numpy as np +from numba import cuda + + from cpython.buffer cimport ( Py_buffer, PyBUF_SIMPLE, PyBUF_ANY_CONTIGUOUS, PyBuffer_Release, PyObject_CheckBuffer, PyObject_GetBuffer @@ -33,6 +38,9 @@ cdef extern from "": ctypedef OpaqueCUkernel_st *CUkernel ctypedef OpaqueCUlibrary_st *CUlibrary +#typedef struct CUstream_st* cudaStream_t; + + cdef extern from "cccl/c/experimental/stf/stf.h": ctypedef struct stf_ctx_handle_t ctypedef stf_ctx_handle_t* stf_ctx_handle @@ -52,8 +60,9 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m) void stf_task_start(stf_task_handle t) void stf_task_end(stf_task_handle t) + CUstream stf_task_get_custream(stf_task_handle t) # cudaStream_t stf_task_get_stream(stf_task_handle t) - void* stf_task_get(stf_task_handle t, size_t submitted_index) + void* stf_task_get(stf_task_handle t, int submitted_index) void stf_task_destroy(stf_task_handle t) cdef enum stf_access_mode: @@ -68,15 +77,39 @@ class AccessMode(IntFlag): WRITE = STF_WRITE RW = STF_RW +class stf_arg_cai: + def __init__(self, ptr, tuple shape, dtype, stream=0): + self.ptr = 
ptr # integer device pointer + self.shape = shape + self.dtype = np.dtype(dtype) + self.stream = stream # CUDA stream handle (int or 0) + self.__cuda_array_interface__ = { + 'version': 2, + 'shape': self.shape, + 'typestr': self.dtype.str, # e.g., 'view.shape[i] for i in range(view.ndim)) + self._dtype = np.dtype(view.format) stf_logical_data(ctx._ctx, &self._ld, view.buf, view.len) finally: @@ -90,6 +123,18 @@ cdef class logical_data: stf_logical_data_destroy(self._ld) self._ld = NULL + @property + def dtype(self): + """Return the dtype of the logical data.""" + return self._dtype + + @property + def shape(self): + """Return the shape of the logical data.""" + return self._shape + + + class dep: __slots__ = ("ld", "mode") def __init__(self, logical_data ld, int mode): @@ -109,12 +154,18 @@ def rw(ld): return dep(ld, AccessMode.RW.value) cdef class task: cdef stf_task_handle _t + # list of logical data in deps: we need this because we can't exchange + # dtype/shape easily through the C API of STF + cdef list _lds_args + def __cinit__(self, context ctx): stf_task_create(ctx._ctx, &self._t) + self._lds_args = [] def __dealloc__(self): if self._t != NULL: stf_task_destroy(self._t) +# self._lds_args.clear() def start(self): stf_task_start(self._t) @@ -135,6 +186,28 @@ cdef class task: stf_task_add_dep(self._t, ldata._ld, mode_ce) + self._lds_args.append(ldata) + + def stream_ptr(self) -> int: + """ + Return the raw CUstream pointer as a Python int + (memory address). Suitable for ctypes or PyCUDA. + """ + cdef CUstream s = stf_task_get_custream(self._t) + return s # cast pointer -> Py int + + def get_arg(self, index) -> int: + cdef void *ptr = stf_task_get(self._t, index) + return ptr + + def get_arg_cai(self, index): + ptr = self.get_arg(index) + return stf_arg_cai(ptr, self._lds_args[index].shape, self._lds_args[index].dtype, stream=0).__cuda_array_interface__ + + def get_arg_numba(self, index): + cai = self.get_arg_cai(index) + return cuda.from_cuda_array_interface(cai, owner=None, sync=False) + # ---- context‑manager helpers ------------------------------- def __enter__(self): self.start() diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py new file mode 100644 index 00000000000..07c724bb020 --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -0,0 +1,66 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from cuda.cccl.experimental.stf._stf_bindings_impl import logical_data, context, AccessMode, read, rw, write +import ctypes +import numpy as np +from numba import cuda +from numba.cuda.cudadrv import driver, devicearray + +@cuda.jit +def axpy(a, x, y): + i = cuda.grid(1) + if i < x.size: + y[i] = a * x[i] + y[i] + +@cuda.jit +def scale(a, x): + i = cuda.grid(1) + if i < x.size: + x[i] = a * x[i] + +def test_numba(): + X = np.ones(16, dtype=np.float32) + Y = np.ones(16, dtype=np.float32) + Z = np.ones(16, dtype=np.float32) + + ctx = context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Y) + + with ctx.task(rw(lX)) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + # dX = t.get_arg_numba(0) + dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) + scale[32, 64, nb_stream](2.0, dX) + pass + + with ctx.task(read(lX), rw(lY)) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + print(nb_stream) + dX = t.get_arg_numba(0) + dY = t.get_arg_numba(1) + axpy[32, 64, nb_stream](2.0, dX, dY) + pass + + with ctx.task(read(lX), rw(lZ)) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = t.get_arg_numba(0) + dZ = t.get_arg_numba(1) + axpy[32, 64, nb_stream](2.0, dX, dZ) + pass + + with ctx.task(read(lY), rw(lZ)) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dY = t.get_arg_numba(0) + dZ = t.get_arg_numba(1) + axpy[32, 64, nb_stream](2.0, dY, dZ) + pass + + del ctx + +if __name__ == "__main__": + print("Running CUDASTF examples...") + test_numba() From f2f7dfb93e4b1e70f0a2ad4b820fecc7c1f47e95 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 23:36:35 +0200 Subject: [PATCH 048/221] pre-commit --- c/experimental/stf/src/stf.cu | 2 +- python/cuda_cccl/tests/stf/test_numba.py | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 2d31ea6907a..d040f471195 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -121,7 +121,7 @@ void stf_task_end(stf_task_handle t) CUstream stf_task_get_custream(stf_task_handle t) { assert(t); - return (CUstream)t->t.get_stream(); + return (CUstream) t->t.get_stream(); } void stf_task_destroy(stf_task_handle t) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 07c724bb020..d40b1a45c30 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -2,11 +2,11 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -from cuda.cccl.experimental.stf._stf_bindings_impl import logical_data, context, AccessMode, read, rw, write -import ctypes import numpy as np from numba import cuda -from numba.cuda.cudadrv import driver, devicearray + +from cuda.cccl.experimental.stf._stf_bindings_impl import context, read, rw + @cuda.jit def axpy(a, x, y): @@ -14,12 +14,14 @@ def axpy(a, x, y): if i < x.size: y[i] = a * x[i] + y[i] + @cuda.jit def scale(a, x): i = cuda.grid(1) if i < x.size: x[i] = a * x[i] + def test_numba(): X = np.ones(16, dtype=np.float32) Y = np.ones(16, dtype=np.float32) @@ -28,7 +30,7 @@ def test_numba(): ctx = context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) - lZ = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) with ctx.task(rw(lX)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -61,6 +63,7 @@ def test_numba(): del ctx + if __name__ == "__main__": print("Running 
CUDASTF examples...") test_numba() From 97c5f3a1aaa63b6cd79e663358e921ff452c8e30 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 25 Jul 2025 00:30:27 +0200 Subject: [PATCH 049/221] pre-commit --- .../experimental/stf/_stf_bindings_impl.pyx | 3 + python/cuda_cccl/tests/stf/test_numba.py | 78 ++++++++++++++++++- 2 files changed, 78 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 5825dcf4365..811019231f5 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -227,6 +227,9 @@ cdef class context: stf_ctx_create(&self._ctx) def __dealloc__(self): + self.finalize() + + def finalize(self): if self._ctx != NULL: stf_ctx_finalize(self._ctx) self._ctx = NULL diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index d40b1a45c30..ce4d13e079f 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -5,7 +5,7 @@ import numpy as np from numba import cuda -from cuda.cccl.experimental.stf._stf_bindings_impl import context, read, rw +from cuda.cccl.experimental.stf._stf_bindings_impl import context, read, rw, write @cuda.jit @@ -61,9 +61,81 @@ def test_numba(): axpy[32, 64, nb_stream](2.0, dY, dZ) pass - del ctx +@cuda.jit +def laplacian_5pt_kernel(u_in, u_out, dx, dy): + """ + Compute a 5‑point Laplacian on u_in and write the result to u_out. + + Grid‑stride 2‑D kernel. Assumes C‑contiguous (row‑major) inputs. + Boundary cells are copied unchanged. + """ + coef_x = 1.0 / (dx * dx) + coef_y = 1.0 / (dy * dy) + + i, j = cuda.grid(2) # i ↔ row (x‑index), j ↔ col (y‑index) + nx, ny = u_in.shape + + if i >= nx or j >= ny: + return # out‑of‑bounds threads do nothing + + if 0 < i < nx - 1 and 0 < j < ny - 1: + u_out[i, j] = ( + (u_in[i - 1, j] - 2.0 * u_in[i, j] + u_in[i + 1, j]) * coef_x + + (u_in[i, j - 1] - 2.0 * u_in[i, j] + u_in[i, j + 1]) * coef_y + ) + else: + # simple Dirichlet/Neumann placeholder: copy input to output + u_out[i, j] = u_in[i, j] + +def test_numba2d(): + nx, ny = 1024, 1024 + dx = 2.0 * np.pi / (nx - 1) + dy = 2.0 * np.pi / (ny - 1) + + # a smooth test field: f(x,y) = sin(x) * cos(y) + x = np.linspace(0, 2*np.pi, nx, dtype=np.float64) + y = np.linspace(0, 2*np.pi, ny, dtype=np.float64) + + u = np.sin(x)[:, None] * np.cos(y)[None, :] # shape = (nx, ny) + u_out = np.zeros_like(u) + + ctx = context() + lu = ctx.logical_data(u) + lu_out = ctx.logical_data(u_out) + with ctx.task(read(lu), write(lu_out)) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + du = t.get_arg_numba(0) + du_out = t.get_arg_numba(1) + threads_per_block = (16, 16) # 256 threads per block is a solid starting point + blocks_per_grid = ( + (nx + threads_per_block[0] - 1) // threads_per_block[0], + (ny + threads_per_block[1] - 1) // threads_per_block[1], + ) + laplacian_5pt_kernel[blocks_per_grid, threads_per_block,nb_stream](du, du_out, dx, dy) + pass + + ctx.finalize() + + u_out_ref = np.zeros_like(u) + + for i in range(1, nx - 1): # skip boundaries + for j in range(1, ny - 1): + u_out_ref[i, j] = ( + (u[i - 1, j] - 2.0 * u[i, j] + u[i + 1, j]) / dx**2 + + (u[i, j - 1] - 2.0 * u[i, j] + u[i, j + 1]) / dy**2 + ) + + # copy boundaries + u_out_ref[0, :] = u[0, :] + u_out_ref[-1, :] = u[-1, :] + u_out_ref[:, 0] = u[:, 0] + u_out_ref[:, -1] = u[:, -1] + + # compare with the GPU result + 
max_abs_diff = np.abs(u_out - u_out_ref).max() + print(f"max(|gpu - ref|) = {max_abs_diff:.3e}") if __name__ == "__main__": print("Running CUDASTF examples...") - test_numba() + test_numba2d() From a5d669d39a3b148b598d5e2d62b322400c8bb844 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 25 Jul 2025 00:30:52 +0200 Subject: [PATCH 050/221] pre-commit and stencil test --- python/cuda_cccl/tests/stf/test_numba.py | 43 +++++++++++++----------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index ce4d13e079f..f6096c61b68 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -61,6 +61,7 @@ def test_numba(): axpy[32, 64, nb_stream](2.0, dY, dZ) pass + @cuda.jit def laplacian_5pt_kernel(u_in, u_out, dx, dy): """ @@ -72,31 +73,31 @@ def laplacian_5pt_kernel(u_in, u_out, dx, dy): coef_x = 1.0 / (dx * dx) coef_y = 1.0 / (dy * dy) - i, j = cuda.grid(2) # i ↔ row (x‑index), j ↔ col (y‑index) + i, j = cuda.grid(2) # i ↔ row (x‑index), j ↔ col (y‑index) nx, ny = u_in.shape if i >= nx or j >= ny: - return # out‑of‑bounds threads do nothing + return # out‑of‑bounds threads do nothing if 0 < i < nx - 1 and 0 < j < ny - 1: - u_out[i, j] = ( - (u_in[i - 1, j] - 2.0 * u_in[i, j] + u_in[i + 1, j]) * coef_x + - (u_in[i, j - 1] - 2.0 * u_in[i, j] + u_in[i, j + 1]) * coef_y - ) + u_out[i, j] = (u_in[i - 1, j] - 2.0 * u_in[i, j] + u_in[i + 1, j]) * coef_x + ( + u_in[i, j - 1] - 2.0 * u_in[i, j] + u_in[i, j + 1] + ) * coef_y else: # simple Dirichlet/Neumann placeholder: copy input to output u_out[i, j] = u_in[i, j] + def test_numba2d(): nx, ny = 1024, 1024 dx = 2.0 * np.pi / (nx - 1) dy = 2.0 * np.pi / (ny - 1) # a smooth test field: f(x,y) = sin(x) * cos(y) - x = np.linspace(0, 2*np.pi, nx, dtype=np.float64) - y = np.linspace(0, 2*np.pi, ny, dtype=np.float64) + x = np.linspace(0, 2 * np.pi, nx, dtype=np.float64) + y = np.linspace(0, 2 * np.pi, ny, dtype=np.float64) - u = np.sin(x)[:, None] * np.cos(y)[None, :] # shape = (nx, ny) + u = np.sin(x)[:, None] * np.cos(y)[None, :] # shape = (nx, ny) u_out = np.zeros_like(u) ctx = context() @@ -107,35 +108,37 @@ def test_numba2d(): nb_stream = cuda.external_stream(t.stream_ptr()) du = t.get_arg_numba(0) du_out = t.get_arg_numba(1) - threads_per_block = (16, 16) # 256 threads per block is a solid starting point + threads_per_block = (16, 16) # 256 threads per block is a solid starting point blocks_per_grid = ( (nx + threads_per_block[0] - 1) // threads_per_block[0], (ny + threads_per_block[1] - 1) // threads_per_block[1], ) - laplacian_5pt_kernel[blocks_per_grid, threads_per_block,nb_stream](du, du_out, dx, dy) - pass + laplacian_5pt_kernel[blocks_per_grid, threads_per_block, nb_stream]( + du, du_out, dx, dy + ) + pass ctx.finalize() - u_out_ref = np.zeros_like(u) + u_out_ref = np.zeros_like(u) - for i in range(1, nx - 1): # skip boundaries + for i in range(1, nx - 1): # skip boundaries for j in range(1, ny - 1): - u_out_ref[i, j] = ( - (u[i - 1, j] - 2.0 * u[i, j] + u[i + 1, j]) / dx**2 + - (u[i, j - 1] - 2.0 * u[i, j] + u[i, j + 1]) / dy**2 - ) + u_out_ref[i, j] = (u[i - 1, j] - 2.0 * u[i, j] + u[i + 1, j]) / dx**2 + ( + u[i, j - 1] - 2.0 * u[i, j] + u[i, j + 1] + ) / dy**2 # copy boundaries - u_out_ref[0, :] = u[0, :] + u_out_ref[0, :] = u[0, :] u_out_ref[-1, :] = u[-1, :] - u_out_ref[:, 0] = u[:, 0] + u_out_ref[:, 0] = u[:, 0] u_out_ref[:, -1] = u[:, -1] # compare with the GPU result max_abs_diff = np.abs(u_out - 
u_out_ref).max() print(f"max(|gpu - ref|) = {max_abs_diff:.3e}") + if __name__ == "__main__": print("Running CUDASTF examples...") test_numba2d() From 686b9880eeadec8f83f443e0ebe5b86e656d20a0 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 25 Jul 2025 08:29:20 +0200 Subject: [PATCH 051/221] make it possible to create a graph_ctx --- c/experimental/stf/include/cccl/c/experimental/stf/stf.h | 2 ++ c/experimental/stf/src/stf.cu | 8 ++++++++ .../cuda/cccl/experimental/stf/_stf_bindings_impl.pyx | 8 ++++++-- python/cuda_cccl/tests/stf/test_context.py | 5 +++++ 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 639b02b503f..0e515b80eb7 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -18,6 +18,8 @@ typedef enum stf_access_mode typedef struct stf_ctx_handle_t* stf_ctx_handle; void stf_ctx_create(stf_ctx_handle* ctx); +// TODO stf_ctx_create_with_flags and an enum instead ? +void stf_ctx_create_graph(stf_ctx_handle* ctx); void stf_ctx_finalize(stf_ctx_handle ctx); // TODO stf_ctx_set_mode() + define enum with GRAPH, STREAM, ... diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index d040f471195..3c305d04dd2 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -30,6 +30,14 @@ void stf_ctx_create(stf_ctx_handle* ctx) } } +void stf_ctx_create_graph(stf_ctx_handle* ctx) +{ + if (ctx) + { + *ctx = new stf_ctx_handle_t{context{graph_ctx()}}; + } +} + void stf_ctx_finalize(stf_ctx_handle ctx) { ctx->ctx.finalize(); diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 811019231f5..9597d199d33 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -45,6 +45,7 @@ cdef extern from "cccl/c/experimental/stf/stf.h": ctypedef struct stf_ctx_handle_t ctypedef stf_ctx_handle_t* stf_ctx_handle void stf_ctx_create(stf_ctx_handle* ctx) + void stf_ctx_create_graph(stf_ctx_handle* ctx) void stf_ctx_finalize(stf_ctx_handle ctx) ctypedef struct stf_logical_data_handle_t @@ -223,8 +224,11 @@ cdef class task: cdef class context: cdef stf_ctx_handle _ctx - def __cinit__(self): - stf_ctx_create(&self._ctx) + def __cinit__(self, bint use_graph=False): + if use_graph: + stf_ctx_create_graph(&self._ctx) + else: + stf_ctx_create(&self._ctx) def __dealloc__(self): self.finalize() diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py index 79f443ac2cd..5a29199dc67 100644 --- a/python/cuda_cccl/tests/stf/test_context.py +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -12,6 +12,11 @@ def test_ctx(): del ctx +def test_graph_ctx(): + ctx = context(use_graph=True) + ctx.finalize() + + def test_ctx2(): X = np.ones(16, dtype=np.float32) Y = np.ones(16, dtype=np.float32) From b4688fdd76a7f69adecc6458f56dbd55ec0178e9 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sat, 2 Aug 2025 12:41:40 +0200 Subject: [PATCH 052/221] implement set_exec_place for cuda_kernel and unified tasks --- .../experimental/__stf/internal/context.cuh | 32 +++++++++++++++++++ .../__stf/internal/cuda_kernel_scope.cuh | 6 ++++ 2 files changed, 38 insertions(+) diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh 
b/cudax/include/cuda/experimental/__stf/internal/context.cuh index ff6b69859b4..6487356f61d 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -94,6 +94,22 @@ class context }; } + auto&& set_exec_place(exec_place e_place) & + { + payload->*[&](auto& self) { + self.set_exec_place(mv(e_place)); + }; + return *this; + } + + auto&& set_exec_place(exec_place e_place) && + { + payload->*[&](auto& self) { + self.set_exec_place(mv(e_place)); + }; + return mv(*this); + } + auto& set_symbol(::std::string s) & { payload->*[&](auto& self) { @@ -195,6 +211,22 @@ public: return mv(*this); } + auto&& set_exec_place(exec_place e_place) & + { + payload->*[&](auto& self) { + self.set_exec_place(mv(e_place)); + }; + return *this; + } + + auto&& set_exec_place(exec_place e_place) && + { + payload->*[&](auto& self) { + self.set_exec_place(mv(e_place)); + }; + return mv(*this); + } + auto& start() { payload->*[&](auto& self) { diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index f22ec6a5da0..66b54d4becc 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -299,6 +299,12 @@ public: // move-constructible cuda_kernel_scope(cuda_kernel_scope&&) = default; + auto& set_exec_place(exec_place e_place_) + { + e_place = mv(e_place_); + return *this; + } + /// Add a set of dependencies template void add_deps(task_dep_untyped first, Pack&&... pack) From bd474d6ca8025121637f401f639c81f7584ea7f9 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sat, 2 Aug 2025 12:42:10 +0200 Subject: [PATCH 053/221] Define some execution places in the C API --- .../stf/include/cccl/c/experimental/stf/stf.h | 44 +++++++++++++++++++ c/experimental/stf/src/stf.cu | 24 ++++++++++ 2 files changed, 68 insertions(+) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 0e515b80eb7..592c3bb8ac1 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -15,6 +15,48 @@ typedef enum stf_access_mode STF_RW = STF_READ | STF_WRITE } stf_access_mode; +struct stf_exec_place_device +{ + int dev_id; +}; + +struct stf_exec_place_host +{ + char dummy; /* dummy to keep it standard C which does not allow empty structs */ +}; + +typedef enum stf_exec_place_kind +{ + STF_EXEC_PLACE_DEVICE, + STF_EXEC_PLACE_HOST +} stf_exec_place_kind; + +struct stf_exec_place +{ + enum stf_exec_place_kind kind; + union + { + struct stf_exec_place_device device; + struct stf_exec_place_host host; + } u; +}; + +static inline struct stf_exec_place make_device_place(int dev_id) +{ + struct stf_exec_place p; + p.kind = STF_EXEC_PLACE_DEVICE; + p.u.device.dev_id = dev_id; + return p; +} + +static inline struct stf_exec_place make_host_place() +{ + struct stf_exec_place p; + p.kind = STF_EXEC_PLACE_HOST; + p.u.host.dummy = 0; /* to avoid uninitialized memory warnings */ + return p; +} + typedef struct stf_ctx_handle_t* stf_ctx_handle; void stf_ctx_create(stf_ctx_handle* ctx); @@ -41,6 +83,7 @@ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld); typedef struct stf_task_handle_t* stf_task_handle; void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t); +void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* 
exec_p); void stf_task_set_symbol(stf_task_handle t, const char* symbol); void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m); void stf_task_start(stf_task_handle t); @@ -52,6 +95,7 @@ void stf_task_destroy(stf_task_handle t); typedef struct stf_cuda_kernel_handle_t* stf_cuda_kernel_handle; void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k); +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p); void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol); void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); void stf_cuda_kernel_start(stf_cuda_kernel_handle k); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 3c305d04dd2..a9e3e019734 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -83,6 +83,18 @@ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) *ld = new stf_logical_data_handle_t{ctx->ctx.token()}; } +/* Convert the C-API stf_exec_place to a C++ exec_place object */ +exec_place to_exec_place(stf_exec_place* exec_p) +{ + if (exec_p->kind == STF_EXEC_PLACE_HOST) + { + return exec_place::host(); + } + + assert(exec_p->kind == STF_EXEC_PLACE_DEVICE); + return exec_place::device(exec_p->u.device.dev_id); +} + void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) { assert(t); @@ -91,6 +103,12 @@ void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) *t = new stf_task_handle_t{ctx->ctx.task()}; } +void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p) +{ + assert(t); + t->t.set_exec_place(to_exec_place(exec_p)); +} + void stf_task_set_symbol(stf_task_handle t, const char* symbol) { assert(t); @@ -170,6 +188,12 @@ void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k) *k = new stf_cuda_kernel_handle_t{ctx->ctx.cuda_kernel()}; } +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p) +{ + assert(k); + k->k.set_exec_place(to_exec_place(exec_p)); +} + void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol) { assert(k); From 0b6e93a5dd04088eca1787466d75016a7ae884dc Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sat, 2 Aug 2025 19:43:59 +0200 Subject: [PATCH 054/221] WIP: start to support execution places --- .../stf/include/cccl/c/experimental/stf/stf.h | 9 +- c/experimental/stf/src/stf.cu | 6 +- .../experimental/stf/_stf_bindings_impl.pyx | 82 ++++++++++++++++++- python/cuda_cccl/tests/stf/test_numba.py | 50 ++++++++++- 4 files changed, 137 insertions(+), 10 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 592c3bb8ac1..97cf89e5261 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -57,6 +57,11 @@ static inline struct stf_exec_place make_host_place() return p; } +typedef struct stf_exec_place_device stf_exec_place_device; +typedef struct stf_exec_place_host stf_exec_place_host; +typedef union stf_exec_place_u stf_exec_place_u; +typedef struct stf_exec_place stf_exec_place; + typedef struct stf_ctx_handle_t* stf_ctx_handle; void stf_ctx_create(stf_ctx_handle* ctx); @@ -83,7 +88,7 @@ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld); typedef struct stf_task_handle_t* stf_task_handle; void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t); -void 
stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p); +void stf_task_set_exec_place(stf_task_handle t, struct stf_exec_place* exec_p); void stf_task_set_symbol(stf_task_handle t, const char* symbol); void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m); void stf_task_start(stf_task_handle t); @@ -95,7 +100,7 @@ void stf_task_destroy(stf_task_handle t); typedef struct stf_cuda_kernel_handle_t* stf_cuda_kernel_handle; void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k); -void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p); +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, struct stf_exec_place* exec_p); void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol); void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); void stf_cuda_kernel_start(stf_cuda_kernel_handle k); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index a9e3e019734..a879b8f8859 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -84,7 +84,7 @@ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) } /* Convert the C-API stf_exec_place to a C++ exec_place object */ -exec_place to_exec_place(stf_exec_place* exec_p) +exec_place to_exec_place(struct stf_exec_place* exec_p) { if (exec_p->kind == STF_EXEC_PLACE_HOST) { @@ -103,7 +103,7 @@ void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) *t = new stf_task_handle_t{ctx->ctx.task()}; } -void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p) +void stf_task_set_exec_place(stf_task_handle t, struct stf_exec_place* exec_p) { assert(t); t->t.set_exec_place(to_exec_place(exec_p)); @@ -188,7 +188,7 @@ void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k) *k = new stf_cuda_kernel_handle_t{ctx->ctx.cuda_kernel()}; } -void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p) +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, struct stf_exec_place* exec_p) { assert(k); k->k.set_exec_place(to_exec_place(exec_p)); diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 9597d199d33..eb5e6b48252 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -42,12 +42,39 @@ cdef extern from "": cdef extern from "cccl/c/experimental/stf/stf.h": + # + # Contexts + # ctypedef struct stf_ctx_handle_t ctypedef stf_ctx_handle_t* stf_ctx_handle void stf_ctx_create(stf_ctx_handle* ctx) void stf_ctx_create_graph(stf_ctx_handle* ctx) void stf_ctx_finalize(stf_ctx_handle ctx) + # + # Exec places + # + ctypedef enum stf_exec_place_kind: + STF_EXEC_PLACE_DEVICE + STF_EXEC_PLACE_HOST + + ctypedef struct stf_exec_place_device: + int dev_id + + ctypedef struct stf_exec_place_host: + int dummy + + ctypedef union stf_exec_place_u: + stf_exec_place_device device + stf_exec_place_host host + + ctypedef struct stf_exec_place: + stf_exec_place_kind kind + stf_exec_place_u u + + stf_exec_place make_device_place(int dev_id) + stf_exec_place make_host_place() + ctypedef struct stf_logical_data_handle_t ctypedef stf_logical_data_handle_t* stf_logical_data_handle void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) @@ -57,6 +84,7 @@ cdef extern from 
"cccl/c/experimental/stf/stf.h": ctypedef struct stf_task_handle_t ctypedef stf_task_handle_t* stf_task_handle void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) + void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p) void stf_task_set_symbol(stf_task_handle t, const char* symbol) void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m) void stf_task_start(stf_task_handle t) @@ -152,6 +180,36 @@ def read(ld): return dep(ld, AccessMode.READ.value) def write(ld): return dep(ld, AccessMode.WRITE.value) def rw(ld): return dep(ld, AccessMode.RW.value) +cdef class ExecPlace: + cdef stf_exec_place _c_place + + def __cinit__(self): + # empty default constructor; never directly used + pass + + @staticmethod + def device(int dev_id): + cdef ExecPlace p = ExecPlace.__new__(ExecPlace) + p._c_place = make_device_place(dev_id) + return p + + @staticmethod + def host(): + cdef ExecPlace p = ExecPlace.__new__(ExecPlace) + p._c_place = make_host_place() + return p + + @property + def kind(self) -> str: + return ("device" if self._c_place.kind == STF_EXEC_PLACE_DEVICE + else "host") + + @property + def device_id(self) -> int: + if self._c_place.kind != STF_EXEC_PLACE_DEVICE: + raise AttributeError("not a device execution place") + return self._c_place.u.device.dev_id + cdef class task: cdef stf_task_handle _t @@ -189,6 +247,13 @@ cdef class task: self._lds_args.append(ldata) + def set_exec_place(self, object exec_p): + if not isinstance(exec_p, ExecPlace): + raise TypeError("set_exec_place expects and ExecPlace argument") + + cdef ExecPlace ep = exec_p + stf_task_set_exec_place(self._t, &ep._c_place) + def stream_ptr(self) -> int: """ Return the raw CUstream pointer as a Python int @@ -249,7 +314,7 @@ cdef class context: """ return logical_data(self, buf) - def task(self, *deps): + def task(self, *args): """ Create a `task` @@ -259,7 +324,18 @@ cdef class context: >>> t.start() >>> t.end() """ + exec_place_set = False t = task(self) # construct with this context - for d in deps: - t.add_dep(d) # your existing add_dep logic + for d in args: + if isinstance(d, dep): + t.add_dep(d) + elif isinstance(d, ExecPlace): + if exec_place_set: + raise ValueError("Only one ExecPlace can be given") + t.set_exec_place(d) + exec_place_set = True + else: + raise TypeError( + "Arguments must be dependency objects or an ExecPlace" + ) return t diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index f6096c61b68..3420036642b 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -5,7 +5,13 @@ import numpy as np from numba import cuda -from cuda.cccl.experimental.stf._stf_bindings_impl import context, read, rw, write +from cuda.cccl.experimental.stf._stf_bindings_impl import ( + ExecPlace, + context, + read, + rw, + write, +) @cuda.jit @@ -139,6 +145,46 @@ def test_numba2d(): print(f"max(|gpu - ref|) = {max_abs_diff:.3e}") +def test_numba_exec_place(): + X = np.ones(16, dtype=np.float32) + Y = np.ones(16, dtype=np.float32) + Z = np.ones(16, dtype=np.float32) + + ctx = context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) + + with ctx.task(ExecPlace.device(0), rw(lX)) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + # dX = t.get_arg_numba(0) + dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) + scale[32, 64, nb_stream](2.0, dX) + pass + + with ctx.task(ExecPlace.device(0), read(lX), rw(lY)) as t: + 
nb_stream = cuda.external_stream(t.stream_ptr()) + print(nb_stream) + dX = t.get_arg_numba(0) + dY = t.get_arg_numba(1) + axpy[32, 64, nb_stream](2.0, dX, dY) + pass + + with ctx.task(ExecPlace.device(0), read(lX), rw(lZ)) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = t.get_arg_numba(0) + dZ = t.get_arg_numba(1) + axpy[32, 64, nb_stream](2.0, dX, dZ) + pass + + with ctx.task(ExecPlace.device(0), read(lY), rw(lZ)) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dY = t.get_arg_numba(0) + dZ = t.get_arg_numba(1) + axpy[32, 64, nb_stream](2.0, dY, dZ) + pass + + if __name__ == "__main__": print("Running CUDASTF examples...") - test_numba2d() + test_numba_exec_place() From ff9d70af26e8a904e65df8ae58e4905eed67ee82 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 3 Aug 2025 00:26:50 +0200 Subject: [PATCH 055/221] set_exec_place should also set the data place --- cudax/include/cuda/experimental/__stf/internal/task.cuh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/task.cuh b/cudax/include/cuda/experimental/__stf/internal/task.cuh index d5ac78a1b8b..93d2e330c3d 100644 --- a/cudax/include/cuda/experimental/__stf/internal/task.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/task.cuh @@ -271,13 +271,16 @@ public: { return pimpl->e_place; } + exec_place& get_exec_place() { return pimpl->e_place; } + void set_exec_place(const exec_place& place) { - pimpl->e_place = place; + // This will both update the execution place and the affine data place + on(place); } /// Get and Set the affine data place of the task From c610c42c8a70b1832aaf50617921a077c57578c5 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 3 Aug 2025 01:09:52 +0200 Subject: [PATCH 056/221] rename ExecPlace to exec_place --- .../experimental/stf/_stf_bindings_impl.pyx | 18 +++++++++--------- python/cuda_cccl/tests/stf/test_numba.py | 10 +++++----- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index eb5e6b48252..71767f84253 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -180,7 +180,7 @@ def read(ld): return dep(ld, AccessMode.READ.value) def write(ld): return dep(ld, AccessMode.WRITE.value) def rw(ld): return dep(ld, AccessMode.RW.value) -cdef class ExecPlace: +cdef class exec_place: cdef stf_exec_place _c_place def __cinit__(self): @@ -189,13 +189,13 @@ cdef class ExecPlace: @staticmethod def device(int dev_id): - cdef ExecPlace p = ExecPlace.__new__(ExecPlace) + cdef exec_place p = exec_place.__new__(exec_place) p._c_place = make_device_place(dev_id) return p @staticmethod def host(): - cdef ExecPlace p = ExecPlace.__new__(ExecPlace) + cdef exec_place p = exec_place.__new__(exec_place) p._c_place = make_host_place() return p @@ -248,10 +248,10 @@ cdef class task: self._lds_args.append(ldata) def set_exec_place(self, object exec_p): - if not isinstance(exec_p, ExecPlace): - raise TypeError("set_exec_place expects and ExecPlace argument") + if not isinstance(exec_p, exec_place): + raise TypeError("set_exec_place expects and exec_place argument") - cdef ExecPlace ep = exec_p + cdef exec_place ep = exec_p stf_task_set_exec_place(self._t, &ep._c_place) def stream_ptr(self) -> int: @@ -329,13 +329,13 @@ cdef class context: for d in args: if isinstance(d, dep): 
t.add_dep(d) - elif isinstance(d, ExecPlace): + elif isinstance(d, exec_place): if exec_place_set: - raise ValueError("Only one ExecPlace can be given") + raise ValueError("Only one exec_place can be given") t.set_exec_place(d) exec_place_set = True else: raise TypeError( - "Arguments must be dependency objects or an ExecPlace" + "Arguments must be dependency objects or an exec_place" ) return t diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 3420036642b..cf18b447c39 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -6,8 +6,8 @@ from numba import cuda from cuda.cccl.experimental.stf._stf_bindings_impl import ( - ExecPlace, context, + exec_place, read, rw, write, @@ -155,14 +155,14 @@ def test_numba_exec_place(): lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) - with ctx.task(ExecPlace.device(0), rw(lX)) as t: + with ctx.task(exec_place.device(0), rw(lX)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) # dX = t.get_arg_numba(0) dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) scale[32, 64, nb_stream](2.0, dX) pass - with ctx.task(ExecPlace.device(0), read(lX), rw(lY)) as t: + with ctx.task(exec_place.device(0), read(lX), rw(lY)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) print(nb_stream) dX = t.get_arg_numba(0) @@ -170,14 +170,14 @@ def test_numba_exec_place(): axpy[32, 64, nb_stream](2.0, dX, dY) pass - with ctx.task(ExecPlace.device(0), read(lX), rw(lZ)) as t: + with ctx.task(exec_place.device(0), read(lX), rw(lZ)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dZ) pass - with ctx.task(ExecPlace.device(0), read(lY), rw(lZ)) as t: + with ctx.task(exec_place.device(0), read(lY), rw(lZ)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dY = t.get_arg_numba(0) dZ = t.get_arg_numba(1) From f65702b7270b03ece2f76170e56a5ea3469e168d Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 3 Aug 2025 02:02:28 +0200 Subject: [PATCH 057/221] Save WIP: start to implement data places (not compiling yet) --- .../stf/include/cccl/c/experimental/stf/stf.h | 81 +++++++++++++++++++ c/experimental/stf/src/stf.cu | 34 ++++++++ .../experimental/stf/_stf_bindings_impl.pyx | 57 ++++++++++++- python/cuda_cccl/tests/stf/test_numba.py | 8 +- 4 files changed, 172 insertions(+), 8 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 97cf89e5261..caa8769a8b6 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -62,6 +62,85 @@ typedef struct stf_exec_place_host stf_exec_place_host; typedef union stf_exec_place_u stf_exec_place_u; typedef struct stf_exec_place stf_exec_place; +struct stf_data_place_device +{ + int dev_id; +}; + +struct stf_data_place_host +{ + char dummy; /* dummy to keep it standard C which does not allow empty structs */ +}; + +struct stf_data_place_managed +{ + char dummy; /* dummy to keep it standard C which does not allow empty structs */ +}; + +struct stf_data_place_affine +{ + char dummy; /* dummy to keep it standard C which does not allow empty structs */ +}; + +typedef enum stf_data_place_kind +{ + STF_DATA_PLACE_DEVICE, + STF_DATA_PLACE_HOST, + STF_DATA_PLACE_MANAGED, + STF_DATA_PLACE_AFFINE +} stf_data_place_kind; + +struct stf_data_place +{ + enum 
stf_data_place_kind kind; + union + { + struct stf_data_place_device device; + struct stf_data_place_host host; + struct stf_data_place_managed managed; + struct stf_data_place_affine affine; + } u; +}; + +static inline struct stf_data_place make_device_data_place(int dev_id) +{ + struct stf_data_place p; + p.kind = STF_DATA_PLACE_DEVICE; + p.u.device.dev_id = dev_id; + return p; +} + +static inline struct stf_data_place make_host_data_place() +{ + struct stf_data_place p; + p.kind = STF_DATA_PLACE_HOST; + p.u.host.dummy = 0; /* to avoid uninitialized memory warnings */ + return p; +} + +static inline struct stf_data_place make_managed_data_place() +{ + struct stf_data_place p; + p.kind = STF_DATA_PLACE_MANAGED; + p.u.managed.dummy = 0; /* to avoid uninitialized memory warnings */ + return p; +} + +static inline struct stf_data_place make_affine_data_place() +{ + struct stf_data_place p; + p.kind = STF_DATA_PLACE_AFFINE; + p.u.affine.dummy = 0; /* to avoid uninitialized memory warnings */ + return p; +} + +typedef struct stf_data_place_device stf_data_place_device; +typedef struct stf_data_place_host stf_data_place_host; +typedef struct stf_data_place_managed stf_data_place_managed; +typedef struct stf_data_place_affine stf_data_place_affine; +typedef union stf_data_place_u stf_data_place_u; +typedef struct stf_data_place stf_data_place; + typedef struct stf_ctx_handle_t* stf_ctx_handle; void stf_ctx_create(stf_ctx_handle* ctx); @@ -91,6 +170,8 @@ void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t); void stf_task_set_exec_place(stf_task_handle t, struct stf_exec_place* exec_p); void stf_task_set_symbol(stf_task_handle t, const char* symbol); void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m); +void stf_task_add_dep_with_dplace( + stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, struct stf_data_place* data_p); void stf_task_start(stf_task_handle t); void stf_task_end(stf_task_handle t); CUstream stf_task_get_custream(stf_task_handle t); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index a879b8f8859..33b4cf92489 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -95,6 +95,30 @@ exec_place to_exec_place(struct stf_exec_place* exec_p) return exec_place::device(exec_p->u.device.dev_id); } +/* Convert the C-API stf_data_place to a C++ data_place object */ +data_place to_data_place(struct stf_data_place* data_p) +{ + assert(data_p); + + if (data_p->kind == STF_DATA_PLACE_HOST) + { + return data_place::host(); + } + + if (data_p->kind == STF_DATA_PLACE_MANAGED) + { + return data_place::managed(); + } + + if (data_p->kind == STF_DATA_PLACE_AFFINE) + { + return data_place::affine(); + } + + assert(data_p->kind == STF_DATA_PLACE_DEVICE); + return data_place::device(data_p->u.device.dev_id); +} + void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) { assert(t); @@ -125,6 +149,16 @@ void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_ t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m))); } +void stf_task_add_dep_with_dplace( + stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, struct stf_data_place* data_p) +{ + assert(t); + assert(ld); + assert(data_p); + + t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m), to_data_place(data_p))); +} + void* stf_task_get(stf_task_handle t, int index) { assert(t); diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx 
b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 71767f84253..85ffb87aae6 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -75,6 +75,42 @@ cdef extern from "cccl/c/experimental/stf/stf.h": stf_exec_place make_device_place(int dev_id) stf_exec_place make_host_place() + # + # Data places + # + ctypedef enum stf_data_place_kind: + STF_DATA_PLACE_DEVICE + STF_DATA_PLACE_HOST + STF_DATA_PLACE_MANAGED + STF_DATA_PLACE_AFFINE + + ctypedef struct stf_data_place_device: + int dev_id + + ctypedef struct stf_data_place_host: + int dummy + + ctypedef struct stf_data_place_managed: + int dummy + + ctypedef struct stf_data_place_affine: + int dummy + + ctypedef union stf_data_place_u: + stf_data_place_device device + stf_data_place_host host + stf_data_place_managed managed + stf_data_place_affine affine + + ctypedef struct stf_data_place: + stf_data_place_kind kind + stf_data_place_u u + + stf_data_place make_device_data_place(int dev_id) + stf_data_place make_host_data_place() + stf_data_place make_managed_data_place() + stf_data_place make_affine_data_place() + ctypedef struct stf_logical_data_handle_t ctypedef stf_logical_data_handle_t* stf_logical_data_handle void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) @@ -87,6 +123,7 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p) void stf_task_set_symbol(stf_task_handle t, const char* symbol) void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m) + void stf_task_add_dep_with_dplace(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, stf_data_place* data_p) void stf_task_start(stf_task_handle t) void stf_task_end(stf_task_handle t) CUstream stf_task_get_custream(stf_task_handle t) @@ -162,20 +199,28 @@ cdef class logical_data: """Return the shape of the logical data.""" return self._shape + def read(self): + return dep(self, AccessMode.READ.value) + def write(self): + return dep(self, AccessMode.WRITE.value) + + def rw(self): + return dep(self, AccessMode.RW.value) class dep: __slots__ = ("ld", "mode") - def __init__(self, logical_data ld, int mode): + def __init__(self, logical_data ld, int mode, dplace=None): self.ld = ld self.mode = mode + self.dplace = dplace # can be None or a data place def __iter__(self): # nice unpacking support yield self.ld yield self.mode + yield self.dplace def __repr__(self): - return f"dep({self.ld!r}, {self.mode})" + return f"dep({self.ld!r}, {self.mode}, {self.place!r})" -# optional sugar def read(ld): return dep(ld, AccessMode.READ.value) def write(ld): return dep(ld, AccessMode.WRITE.value) def rw(ld): return dep(ld, AccessMode.RW.value) @@ -243,7 +288,11 @@ cdef class task: cdef int mode_int = int(d.mode) cdef stf_access_mode mode_ce = mode_int - stf_task_add_dep(self._t, ldata._ld, mode_ce) + if d.dplace is None: + stf_task_add_dep(self._t, ldata._ld, mode_ce) + else: + cdef stf_data_place dplace = d.dplace + stf_task_add_dep_with_dplace(self._t, ldata._ld, mode_ce, &dplace) self._lds_args.append(ldata) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index cf18b447c39..16fac79c3b0 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -155,14 +155,14 @@ def test_numba_exec_place(): lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) 
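+    # (illustrative) lX.rw() below is the method-style spelling of rw(lX);
+    # the read()/write()/rw() helpers also accept an optional data place to
+    # choose where the data is backed for that access.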
- with ctx.task(exec_place.device(0), rw(lX)) as t: + with ctx.task(exec_place.device(0), lX.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) # dX = t.get_arg_numba(0) dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) scale[32, 64, nb_stream](2.0, dX) pass - with ctx.task(exec_place.device(0), read(lX), rw(lY)) as t: + with ctx.task(exec_place.device(0), lX.read(), lY.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) print(nb_stream) dX = t.get_arg_numba(0) @@ -170,14 +170,14 @@ def test_numba_exec_place(): axpy[32, 64, nb_stream](2.0, dX, dY) pass - with ctx.task(exec_place.device(0), read(lX), rw(lZ)) as t: + with ctx.task(exec_place.device(0), lX.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dZ) pass - with ctx.task(exec_place.device(0), read(lY), rw(lZ)) as t: + with ctx.task(exec_place.device(0), lY.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dY = t.get_arg_numba(0) dZ = t.get_arg_numba(1) From 21c94a6100915b37a46a1bda7764d3bd02365377 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 3 Aug 2025 08:53:11 +0200 Subject: [PATCH 058/221] fix data places --- .../experimental/stf/_stf_bindings_impl.pyx | 58 ++++++++++++++++++- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 85ffb87aae6..46a8fcb37d3 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -255,6 +255,59 @@ cdef class exec_place: raise AttributeError("not a device execution place") return self._c_place.u.device.dev_id +cdef class data_place: + cdef stf_data_place _c_place + + def __cinit__(self): + # empty default constructor; never directly used + pass + + @staticmethod + def device(int dev_id): + cdef data_place p = data_place.__new__(exec_place) + p._c_place = make_device_data_place(dev_id) + return p + + @staticmethod + def host(): + cdef data_place p = data_place.__new__(exec_place) + p._c_place = make_host_data_place() + return p + + @staticmethod + def managed(): + cdef data_place p = data_place.__new__(exec_place) + p._c_place = make_managed_data_place() + return p + + @staticmethod + def affine(): + cdef data_place p = data_place.__new__(exec_place) + p._c_place = make_affine_data_place() + return p + + @property + def kind(self) -> str: + cdef stf_data_place_kind k = self._c_place.kind + if k == STF_DATA_PLACE_DEVICE: + return "device" + elif k == STF_DATA_PLACE_HOST: + return "host" + elif k == STF_DATA_PLACE_MANAGED: + return "managed" + elif k == STF_DATA_PLACE_AFFINE: + return "affine" + else: + raise ValueError(f"Unknown data place kind: {k}") + + @property + def device_id(self) -> int: + if self._c_place.kind != STF_DATA_PLACE_DEVICE: + raise AttributeError("not a device data place") + return self._c_place.u.device.dev_id + + + cdef class task: cdef stf_task_handle _t @@ -287,12 +340,13 @@ cdef class task: cdef logical_data ldata = d.ld cdef int mode_int = int(d.mode) cdef stf_access_mode mode_ce = mode_int + cdef data_place dp if d.dplace is None: stf_task_add_dep(self._t, ldata._ld, mode_ce) else: - cdef stf_data_place dplace = d.dplace - stf_task_add_dep_with_dplace(self._t, ldata._ld, mode_ce, &dplace) + dp = d.dplace + stf_task_add_dep_with_dplace(self._t, 
ldata._ld, mode_ce, &dp._c_place) self._lds_args.append(ldata) From f863ecd97e3713287718365ff4301d9d6939fea4 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 3 Aug 2025 09:02:30 +0200 Subject: [PATCH 059/221] Add data places in deps --- .../experimental/stf/_stf_bindings_impl.pyx | 28 +++++++++---------- python/cuda_cccl/tests/stf/test_numba.py | 5 +++- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 46a8fcb37d3..85585b507ee 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -199,17 +199,17 @@ cdef class logical_data: """Return the shape of the logical data.""" return self._shape - def read(self): - return dep(self, AccessMode.READ.value) + def read(self, dplace=None): + return dep(self, AccessMode.READ.value, dplace) - def write(self): - return dep(self, AccessMode.WRITE.value) + def write(self, dplace=None): + return dep(self, AccessMode.WRITE.value, dplace) - def rw(self): - return dep(self, AccessMode.RW.value) + def rw(self, dplace=None): + return dep(self, AccessMode.RW.value, dplace) class dep: - __slots__ = ("ld", "mode") + __slots__ = ("ld", "mode", "dplace") def __init__(self, logical_data ld, int mode, dplace=None): self.ld = ld self.mode = mode @@ -221,9 +221,9 @@ class dep: def __repr__(self): return f"dep({self.ld!r}, {self.mode}, {self.place!r})" -def read(ld): return dep(ld, AccessMode.READ.value) -def write(ld): return dep(ld, AccessMode.WRITE.value) -def rw(ld): return dep(ld, AccessMode.RW.value) +def read(ld, dplace=None): return dep(ld, AccessMode.READ.value, dplace) +def write(ld, dplace=None): return dep(ld, AccessMode.WRITE.value, dplace) +def rw(ld, dplace=None): return dep(ld, AccessMode.RW.value, dplace) cdef class exec_place: cdef stf_exec_place _c_place @@ -264,25 +264,25 @@ cdef class data_place: @staticmethod def device(int dev_id): - cdef data_place p = data_place.__new__(exec_place) + cdef data_place p = data_place.__new__(data_place) p._c_place = make_device_data_place(dev_id) return p @staticmethod def host(): - cdef data_place p = data_place.__new__(exec_place) + cdef data_place p = data_place.__new__(data_place) p._c_place = make_host_data_place() return p @staticmethod def managed(): - cdef data_place p = data_place.__new__(exec_place) + cdef data_place p = data_place.__new__(data_place) p._c_place = make_managed_data_place() return p @staticmethod def affine(): - cdef data_place p = data_place.__new__(exec_place) + cdef data_place p = data_place.__new__(data_place) p._c_place = make_affine_data_place() return p diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 16fac79c3b0..d3292e2fe16 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -7,6 +7,7 @@ from cuda.cccl.experimental.stf._stf_bindings_impl import ( context, + data_place, exec_place, read, rw, @@ -170,7 +171,9 @@ def test_numba_exec_place(): axpy[32, 64, nb_stream](2.0, dX, dY) pass - with ctx.task(exec_place.device(0), lX.read(), lZ.rw()) as t: + with ctx.task( + exec_place.device(0), lX.read(data_place.managed()), lZ.rw(data_place.managed()) + ) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) dZ = t.get_arg_numba(1) From 11b66735b57f6f03d7f25ed56c6267f40777f8a0 Mon Sep 17 00:00:00 2001 From: 
Cedric Augonnet Date: Sun, 3 Aug 2025 09:29:01 +0200 Subject: [PATCH 060/221] test with places --- python/cuda_cccl/tests/stf/test_numba.py | 39 ++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index d3292e2fe16..3a565b0a8d7 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -188,6 +188,45 @@ def test_numba_exec_place(): pass +def test_numba_places(): + X = np.ones(16, dtype=np.float32) + Y = np.ones(16, dtype=np.float32) + Z = np.ones(16, dtype=np.float32) + + ctx = context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) + + with ctx.task(lX.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = t.get_arg_numba(0) + scale[32, 64, nb_stream](2.0, dX) + pass + + with ctx.task(lX.read(), lY.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + print(nb_stream) + dX = t.get_arg_numba(0) + dY = t.get_arg_numba(1) + axpy[32, 64, nb_stream](2.0, dX, dY) + pass + + with ctx.task(exec_place.device(1), lX.read(), lZ.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = t.get_arg_numba(0) + dZ = t.get_arg_numba(1) + axpy[32, 64, nb_stream](2.0, dX, dZ) + pass + + with ctx.task(lY.read(), lZ.rw(data_place.device(1))) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dY = t.get_arg_numba(0) + dZ = t.get_arg_numba(1) + axpy[32, 64, nb_stream](2.0, dY, dZ) + pass + + if __name__ == "__main__": print("Running CUDASTF examples...") test_numba_exec_place() From e422712ccdb012e04fc8585eb14957664a42566a Mon Sep 17 00:00:00 2001 From: root Date: Mon, 4 Aug 2025 21:14:58 +0000 Subject: [PATCH 061/221] fix previous merge --- .../experimental/__stf/internal/cuda_kernel_scope.cuh | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index b6c5864aec6..d06bfec0fdd 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -58,13 +58,7 @@ struct cuda_kernel_desc template cuda_kernel_desc(Fun func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, Args... args) { - configure(mv(func), gridDim_, blockDim_, sharedMem_, ::std::forward(args)...); - } - - template - void configure(Fun func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, Args... args) - { - configure(mv(func), gridDim_, blockDim_, sharedMem_, mv(args)...); + configure(mv(func), gridDim_, blockDim_, sharedMem_, mv(args)); } template From 1bb8b4378770b984cee5f8ba51af24eee13d73a8 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 4 Aug 2025 21:19:16 +0000 Subject: [PATCH 062/221] typo fix --- .../cuda/experimental/__stf/internal/cuda_kernel_scope.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index d06bfec0fdd..809b217237d 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -58,7 +58,7 @@ struct cuda_kernel_desc template cuda_kernel_desc(Fun func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, Args... 
args) { - configure(mv(func), gridDim_, blockDim_, sharedMem_, mv(args)); + configure(mv(func), gridDim_, blockDim_, sharedMem_, mv(args)...); } template From fc8d5eb4ea870b3aa13b5b05667b551e84e01167 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 4 Aug 2025 22:19:27 +0000 Subject: [PATCH 063/221] Save WIP: try to implement a new decorator for STF --- .../cuda/cccl/experimental/stf/__init__.py | 17 +++++ .../cuda/cccl/experimental/stf/decorator.py | 63 +++++++++++++++++++ python/cuda_cccl/tests/stf/test_decorator.py | 29 +++++++++ 3 files changed, 109 insertions(+) create mode 100644 python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py create mode 100644 python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py create mode 100644 python/cuda_cccl/tests/stf/test_decorator.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py new file mode 100644 index 00000000000..bef32849b2a --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py @@ -0,0 +1,17 @@ +from ._stf_bindings_impl import ( + context, + dep, + exec_place, + data_place, +) + +from .decorator import jit # Python-side kernel launcher + +__all__ = [ + "context", + "dep", + "exec_place", + "data_place", + "jit", +] + diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py new file mode 100644 index 00000000000..cd31c3d746d --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -0,0 +1,63 @@ +from numba import cuda +from cuda.cccl.experimental.stf import context, dep, exec_place + + +class _CudaSTFKernel: + def __init__(self, numba_kernel): + self._nkern = numba_kernel + self._launch_cfg = None # (gridDim, blockDim, context, exec_place?) 
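    # Launch-configuration flow of this wrapper, as implemented below:
    # indexing the kernel with `kernel[gridDim, blockDim, ctx]` (plus an
    # optional exec_place as a 4th item) goes through __getitem__, which
    # validates the items, stores them in self._launch_cfg and returns self.
    # The call that follows opens an STF task whose dependencies are the
    # `dep` arguments (in order), replaces each dep with its device view
    # via t.get_arg_numba(i), and launches the numba kernel on the task's
    # stream; non-dep arguments such as scalars pass through unchanged.
    # Intended usage, e.g.:
    #
    #     scale[32, 64, ctx](2.0, lX.rw())
    #     axpy[32, 64, ctx, exec_place.device(0)](2.0, lX.read(), lZ.rw())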
+ + def __getitem__(self, cfg): + if not (len(cfg) == 3 or len(cfg) == 4): + raise TypeError("use kernel[gridDim, blockDim, ctx (, exec_place)]") + + gridDim, blockDim, ctx, *rest = cfg + if not isinstance(ctx, context): + raise TypeError("3rd item must be an STF context") + + exec_pl = rest[0] if rest else None + if exec_pl and not isinstance(exec_pl, exec_place): + raise TypeError("4th item must be an exec_place") + + self._launch_cfg = (int(gridDim), int(blockDim), ctx, exec_pl) + return self + + def __call__(self, *args, **kwargs): + if self._launch_cfg is None: + raise RuntimeError("launch configuration missing – use kernel[grid, block, ctx](…)") + + gridDim, blockDim, ctx, exec_pl = self._launch_cfg + + dep_items = [(i, a) for i, a in enumerate(args) if isinstance(a, dep)] + if not dep_items: + raise TypeError("at least one argument must be an STF dep") + + task_args = [exec_pl] if exec_pl else [] + task_args.extend(a for _, a in dep_items) + + with ctx.task(*task_args) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dev_args = list(args) + for dep_index, (pos, _) in enumerate(dep_items): + dev_args[pos] = t.get_arg_numba(dep_index) + + self._nkern[gridDim, blockDim, nb_stream](*dev_args, **kwargs) + + return None + + +def jit(*jit_args, **jit_kwargs): + if jit_args and callable(jit_args[0]): + pyfunc = jit_args[0] + return _build_kernel(pyfunc, (), **jit_kwargs) + + def _decorator(fn): + return _build_kernel(fn, jit_args, **jit_kwargs) + + return _decorator + + +def _build_kernel(pyfunc, jit_args, **jit_kwargs): + numba_kernel = cuda.jit(*jit_args, **jit_kwargs)(pyfunc) + return _CudaSTFKernel(numba_kernel) + diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py new file mode 100644 index 00000000000..269a7ebb70c --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -0,0 +1,29 @@ +import numpy as np +from numba import cuda +import cuda.cccl.experimental.stf as cudastf + + +@cudastf.jit +def axpy(a, x, y): + i = cuda.grid(1) + if i < x.size: + y[i] = a * x[i] + y[i] + +@cudastf.jit +def scale(a, x): + i = cuda.grid(1) + if i < x.size: + x[i] = a * x[i] + +X, Y, Z = (np.ones(16, np.float32) for _ in range(3)) + +ctx = cudastf.context() +lX = ctx.logical_data(X) +lY = ctx.logical_data(Y) +lZ = ctx.logical_data(Z) + +scale[32, 64, ctx](2.0, lX.rw()) +axpy[32, 64, ctx](2.0, lX.read(), lY.rw()) # default device +axpy[32, 64, ctx, exec_place.device(1)](2.0, lX.read(), lZ.rw()) # explicit exec place +axpy[32, 64, ctx](2.0, lY.read(), lZ.rw(data_place.device(1))) # per-dep placement override + From 167f6c57344e971b25e5ff486c63f61cd30807c3 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 4 Aug 2025 22:39:57 +0000 Subject: [PATCH 064/221] fix typo --- .../cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 85585b507ee..01e3e2f0132 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -219,7 +219,7 @@ class dep: yield self.mode yield self.dplace def __repr__(self): - return f"dep({self.ld!r}, {self.mode}, {self.place!r})" + return f"dep({self.ld!r}, {self.mode}, {self.dplace!r})" def read(ld, dplace=None): return dep(ld, AccessMode.READ.value, dplace) def write(ld, dplace=None): return dep(ld, 
AccessMode.WRITE.value, dplace) From 95104efa1f1fade13f651fdf9e9c4dc9d669b426 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 5 Aug 2025 08:08:29 +0000 Subject: [PATCH 065/221] Defer compilation until we know types --- .../cuda/cccl/experimental/stf/decorator.py | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index cd31c3d746d..10b13b8ca4b 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -3,8 +3,11 @@ class _CudaSTFKernel: - def __init__(self, numba_kernel): - self._nkern = numba_kernel + def __init__(self, pyfunc, jit_args, jit_kwargs): + self._pyfunc = pyfunc + self._jit_args = jit_args + self._jit_kwargs = jit_kwargs + self._compiled_kernel = None self._launch_cfg = None # (gridDim, blockDim, context, exec_place?) def __getitem__(self, cfg): @@ -28,20 +31,28 @@ def __call__(self, *args, **kwargs): gridDim, blockDim, ctx, exec_pl = self._launch_cfg - dep_items = [(i, a) for i, a in enumerate(args) if isinstance(a, dep)] - if not dep_items: - raise TypeError("at least one argument must be an STF dep") + dep_items = [] + for i, a in enumerate(args): + print(f'got one arg {a} is dep ? {isinstance(a, dep)}') + if isinstance(a, dep): + dep_items.append((i, a)) task_args = [exec_pl] if exec_pl else [] task_args.extend(a for _, a in dep_items) with ctx.task(*task_args) as t: - nb_stream = cuda.external_stream(t.stream_ptr()) dev_args = list(args) + print(dev_args) for dep_index, (pos, _) in enumerate(dep_items): + print(f'set arg {dep_index} at position {pos}') dev_args[pos] = t.get_arg_numba(dep_index) - self._nkern[gridDim, blockDim, nb_stream](*dev_args, **kwargs) + if self._compiled_kernel is None: + print("compile kernel") + self._compiled_kernel = cuda.jit(*self._jit_args, **self._jit_kwargs)(self._pyfunc) + + nb_stream = cuda.external_stream(t.stream_ptr()) + self._compiled_kernel[grid, block, stream](*dev_args, **kwargs) return None @@ -58,6 +69,5 @@ def _decorator(fn): def _build_kernel(pyfunc, jit_args, **jit_kwargs): - numba_kernel = cuda.jit(*jit_args, **jit_kwargs)(pyfunc) - return _CudaSTFKernel(numba_kernel) + return _CudaSTFKernel(pyfunc, jit_args, jit_kwargs) From 920f335e07838a78e0f3e2fb52fa150e531616c7 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 6 Aug 2025 10:17:50 +0000 Subject: [PATCH 066/221] Add numba-cuda as a dependency --- python/cuda_cccl/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index a181dee4bfb..cce490aa9e3 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ "nvidia-cuda-nvrtc-cu12", "nvidia-nvjitlink-cu12", "pynvjitlink-cu12>=0.2.4", + "numba-cuda", ] dynamic = ["version"] readme = { file = "README.md", content-type = "text/markdown" } From 587f33bd56eee1cd177f7bd98349deecdca67791 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 6 Aug 2025 10:27:25 +0000 Subject: [PATCH 067/221] Replace use of pynvjitlink patch --- .../cuda_cccl/tests/cooperative/examples/block/reduce.py | 3 +-- python/cuda_cccl/tests/cooperative/examples/block/scan.py | 3 +-- python/cuda_cccl/tests/cooperative/examples/warp/reduce.py | 3 +-- python/cuda_cccl/tests/cooperative/test_block_load.py | 3 +-- .../tests/cooperative/test_block_load_store_api.py | 3 +-- 
.../cuda_cccl/tests/cooperative/test_block_merge_sort.py | 3 +-- .../tests/cooperative/test_block_merge_sort_api.py | 6 +++--- .../cuda_cccl/tests/cooperative/test_block_radix_sort.py | 3 +-- .../tests/cooperative/test_block_radix_sort_api.py | 4 ++-- python/cuda_cccl/tests/cooperative/test_block_reduce.py | 5 +---- .../cuda_cccl/tests/cooperative/test_block_reduce_api.py | 7 +++---- python/cuda_cccl/tests/cooperative/test_block_scan.py | 5 +---- python/cuda_cccl/tests/cooperative/test_block_scan_api.py | 6 ++---- python/cuda_cccl/tests/cooperative/test_block_store.py | 3 +-- python/cuda_cccl/tests/cooperative/test_warp_merge_sort.py | 3 +-- .../tests/cooperative/test_warp_merge_sort_api.py | 5 +---- python/cuda_cccl/tests/cooperative/test_warp_reduce.py | 5 +---- python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py | 3 +-- python/cuda_cccl/tests/cooperative/test_warp_scan.py | 5 +---- python/cuda_cccl/tests/cooperative/test_warp_scan_api.py | 3 +-- 20 files changed, 26 insertions(+), 55 deletions(-) diff --git a/python/cuda_cccl/tests/cooperative/examples/block/reduce.py b/python/cuda_cccl/tests/cooperative/examples/block/reduce.py index 52bea0b7dc4..6daf679bd15 100644 --- a/python/cuda_cccl/tests/cooperative/examples/block/reduce.py +++ b/python/cuda_cccl/tests/cooperative/examples/block/reduce.py @@ -9,12 +9,11 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 def custom_reduce_example(): diff --git a/python/cuda_cccl/tests/cooperative/examples/block/scan.py b/python/cuda_cccl/tests/cooperative/examples/block/scan.py index ac01342d653..5b055fa8d6f 100644 --- a/python/cuda_cccl/tests/cooperative/examples/block/scan.py +++ b/python/cuda_cccl/tests/cooperative/examples/block/scan.py @@ -9,12 +9,11 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 def exclusive_sum_multiple_items_example(): diff --git a/python/cuda_cccl/tests/cooperative/examples/warp/reduce.py b/python/cuda_cccl/tests/cooperative/examples/warp/reduce.py index 3267d7c9d87..8afce70c665 100644 --- a/python/cuda_cccl/tests/cooperative/examples/warp/reduce.py +++ b/python/cuda_cccl/tests/cooperative/examples/warp/reduce.py @@ -9,12 +9,11 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 def custom_warp_reduce_example(): diff --git a/python/cuda_cccl/tests/cooperative/test_block_load.py b/python/cuda_cccl/tests/cooperative/test_block_load.py index 4765d19c8cf..79c2f4c522f 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_load.py +++ b/python/cuda_cccl/tests/cooperative/test_block_load.py @@ -9,11 +9,10 @@ import pytest from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid from numba import cuda, types -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git a/python/cuda_cccl/tests/cooperative/test_block_load_store_api.py 
b/python/cuda_cccl/tests/cooperative/test_block_load_store_api.py index 2ec97fd78fd..96458fe3ed8 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_load_store_api.py +++ b/python/cuda_cccl/tests/cooperative/test_block_load_store_api.py @@ -6,11 +6,10 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 # example-end imports numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git a/python/cuda_cccl/tests/cooperative/test_block_merge_sort.py b/python/cuda_cccl/tests/cooperative/test_block_merge_sort.py index 3c2bed0f70a..0bf30c939f9 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_merge_sort.py +++ b/python/cuda_cccl/tests/cooperative/test_block_merge_sort.py @@ -10,11 +10,10 @@ import pytest from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid from numba import cuda, types -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git a/python/cuda_cccl/tests/cooperative/test_block_merge_sort_api.py b/python/cuda_cccl/tests/cooperative/test_block_merge_sort_api.py index 6c3113e672f..99f75255c69 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_merge_sort_api.py +++ b/python/cuda_cccl/tests/cooperative/test_block_merge_sort_api.py @@ -5,14 +5,14 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 - +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 # example-begin imports -patch.patch_numba_linker(lto=True) + + # example-end imports diff --git a/python/cuda_cccl/tests/cooperative/test_block_radix_sort.py b/python/cuda_cccl/tests/cooperative/test_block_radix_sort.py index e65b73dd6e5..a7e65974a31 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_radix_sort.py +++ b/python/cuda_cccl/tests/cooperative/test_block_radix_sort.py @@ -9,12 +9,11 @@ import pytest from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid from numba import cuda, types -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @pytest.mark.parametrize("T", [types.int8, types.int16, types.uint32, types.uint64]) diff --git a/python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py b/python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py index a90d17daf4e..9efa7ff24bf 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py +++ b/python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py @@ -5,14 +5,14 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 # example-begin imports -patch.patch_numba_linker(lto=True) + # example-end imports diff --git a/python/cuda_cccl/tests/cooperative/test_block_reduce.py b/python/cuda_cccl/tests/cooperative/test_block_reduce.py index 2ced3782261..d7bffc14f9f 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_reduce.py +++ b/python/cuda_cccl/tests/cooperative/test_block_reduce.py @@ -16,14 +16,11 @@ row_major_tid, ) from numba import cuda, types 
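# The same two-line substitution is applied throughout this commit: the
# explicit pynvjitlink linker patch is dropped in favour of numba's own
# configuration switch.  Before/after pattern (module level, next to the
# other numba.config settings):
#
#     # before
#     from pynvjitlink import patch
#     patch.patch_numba_linker(lto=True)
#
#     # after
#     numba.config.CUDA_ENABLE_PYNVJITLINK = 1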
-from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 - - -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @pytest.mark.parametrize( diff --git a/python/cuda_cccl/tests/cooperative/test_block_reduce_api.py b/python/cuda_cccl/tests/cooperative/test_block_reduce_api.py index 54824f01f53..edd25e26a9c 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_reduce_api.py +++ b/python/cuda_cccl/tests/cooperative/test_block_reduce_api.py @@ -6,15 +6,14 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) -# example-end imports - +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +# example-end imports + def test_block_reduction(): # example-begin reduce diff --git a/python/cuda_cccl/tests/cooperative/test_block_scan.py b/python/cuda_cccl/tests/cooperative/test_block_scan.py index c3d238b6452..36ad00db053 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_scan.py +++ b/python/cuda_cccl/tests/cooperative/test_block_scan.py @@ -34,7 +34,6 @@ type_callable, typeof_impl, ) -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop from cuda.cccl.cooperative.experimental.block._block_scan import ( @@ -42,9 +41,7 @@ ) numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 - -# Patching the Numba linker to enable LTO as needed. -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 class BlockPrefixCallbackOp: diff --git a/python/cuda_cccl/tests/cooperative/test_block_scan_api.py b/python/cuda_cccl/tests/cooperative/test_block_scan_api.py index cf222872903..d5cbc4f09f2 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_scan_api.py +++ b/python/cuda_cccl/tests/cooperative/test_block_scan_api.py @@ -5,14 +5,12 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 - # example-begin imports -patch.patch_numba_linker(lto=True) +numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 # example-end imports diff --git a/python/cuda_cccl/tests/cooperative/test_block_store.py b/python/cuda_cccl/tests/cooperative/test_block_store.py index fe1b19ed37d..de101df3d07 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_store.py +++ b/python/cuda_cccl/tests/cooperative/test_block_store.py @@ -9,11 +9,10 @@ import pytest from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid from numba import cuda, types -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git a/python/cuda_cccl/tests/cooperative/test_warp_merge_sort.py b/python/cuda_cccl/tests/cooperative/test_warp_merge_sort.py index 085e50f38d7..bda89598441 100644 --- a/python/cuda_cccl/tests/cooperative/test_warp_merge_sort.py +++ b/python/cuda_cccl/tests/cooperative/test_warp_merge_sort.py @@ -6,11 +6,10 @@ import pytest from helpers import NUMBA_TYPES_TO_NP, random_int from numba import cuda, types -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git 
a/python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py b/python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py index 1f062ba53a7..c8cd3b7d544 100644 --- a/python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py +++ b/python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py @@ -5,14 +5,11 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 - # example-begin imports -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 # example-end imports diff --git a/python/cuda_cccl/tests/cooperative/test_warp_reduce.py b/python/cuda_cccl/tests/cooperative/test_warp_reduce.py index 4d2c7c6502b..8900c281815 100644 --- a/python/cuda_cccl/tests/cooperative/test_warp_reduce.py +++ b/python/cuda_cccl/tests/cooperative/test_warp_reduce.py @@ -7,14 +7,11 @@ import pytest from helpers import NUMBA_TYPES_TO_NP, random_int from numba import cuda, types -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 - - -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @pytest.mark.parametrize("T", [types.uint32, types.uint64]) diff --git a/python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py b/python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py index 1c5845512c4..4c09fd26c14 100644 --- a/python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py +++ b/python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py @@ -5,14 +5,13 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 # example-begin imports -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 # example-end imports diff --git a/python/cuda_cccl/tests/cooperative/test_warp_scan.py b/python/cuda_cccl/tests/cooperative/test_warp_scan.py index 5f3e9a310d7..ea35ec77dd5 100644 --- a/python/cuda_cccl/tests/cooperative/test_warp_scan.py +++ b/python/cuda_cccl/tests/cooperative/test_warp_scan.py @@ -7,14 +7,11 @@ import pytest from helpers import NUMBA_TYPES_TO_NP, random_int from numba import cuda, types -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 - - -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @pytest.mark.parametrize("T", [types.uint32, types.uint64]) diff --git a/python/cuda_cccl/tests/cooperative/test_warp_scan_api.py b/python/cuda_cccl/tests/cooperative/test_warp_scan_api.py index 5661635b3a7..108da881ef9 100644 --- a/python/cuda_cccl/tests/cooperative/test_warp_scan_api.py +++ b/python/cuda_cccl/tests/cooperative/test_warp_scan_api.py @@ -5,14 +5,13 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 # example-begin imports -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 # example-end imports From 9db83a2958d87dbeafe4cabbaa7d50c4de251d58 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Wed, 6 Aug 2025 08:30:25 -0400 Subject: [PATCH 068/221] Update pyproject.toml There's a bug in cuda-bindings 12.9.0 that prevents us from using CUDA 13 driver --- python/cuda_cccl/pyproject.toml | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index cce490aa9e3..6b687933827 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -19,7 +19,7 @@ requires-python = ">=3.9" dependencies = [ "numba>=0.60.0", "numpy", - "cuda-python==12.9.0", + "cuda-bindings>=12.9.1,<13.0.0", "cuda-core", "nvidia-cuda-nvrtc-cu12", "nvidia-nvjitlink-cu12", From 865d337894e4558b1f2ca6b692a16bfee588c47b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 7 Aug 2025 10:52:55 +0200 Subject: [PATCH 069/221] better class name --- python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index cd31c3d746d..8855b11efae 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -2,7 +2,7 @@ from cuda.cccl.experimental.stf import context, dep, exec_place -class _CudaSTFKernel: +class stf_kernel_decorator: def __init__(self, numba_kernel): self._nkern = numba_kernel self._launch_cfg = None # (gridDim, blockDim, context, exec_place?) @@ -24,7 +24,7 @@ def __getitem__(self, cfg): def __call__(self, *args, **kwargs): if self._launch_cfg is None: - raise RuntimeError("launch configuration missing – use kernel[grid, block, ctx](…)") + raise RuntimeError("launch configuration missing – use kernel[grid, block, ctx](...)") gridDim, blockDim, ctx, exec_pl = self._launch_cfg @@ -59,5 +59,5 @@ def _decorator(fn): def _build_kernel(pyfunc, jit_args, **jit_kwargs): numba_kernel = cuda.jit(*jit_args, **jit_kwargs)(pyfunc) - return _CudaSTFKernel(numba_kernel) + return stf_kernel_decorator(numba_kernel) From d22396062ed19bd4f14db4b9a4ddd4aaccc900fb Mon Sep 17 00:00:00 2001 From: root Date: Thu, 7 Aug 2025 10:44:07 +0000 Subject: [PATCH 070/221] fixes to make cudastf.jit decorator work --- python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py | 5 +++-- python/cuda_cccl/tests/stf/test_decorator.py | 4 ++-- python/cuda_cccl/tests/stf/test_numba.py | 2 ++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index b7ed155cd2f..87d286b7124 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -1,6 +1,7 @@ from numba import cuda +import numba from cuda.cccl.experimental.stf import context, dep, exec_place - +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 class stf_kernel_decorator: def __init__(self, pyfunc, jit_args, jit_kwargs): @@ -52,7 +53,7 @@ def __call__(self, *args, **kwargs): self._compiled_kernel = cuda.jit(*self._jit_args, **self._jit_kwargs)(self._pyfunc) nb_stream = cuda.external_stream(t.stream_ptr()) - self._compiled_kernel[grid, block, stream](*dev_args, **kwargs) + self._compiled_kernel[gridDim, blockDim, nb_stream](*dev_args, **kwargs) return None diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 269a7ebb70c..1a6db0f721d 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -24,6 +24,6 @@ def scale(a, x): scale[32, 64, ctx](2.0, lX.rw()) axpy[32, 64, ctx](2.0, lX.read(), lY.rw()) # default device -axpy[32, 64, ctx, 
exec_place.device(1)](2.0, lX.read(), lZ.rw()) # explicit exec place -axpy[32, 64, ctx](2.0, lY.read(), lZ.rw(data_place.device(1))) # per-dep placement override +axpy[32, 64, ctx, cudastf.exec_place.device(0)](2.0, lX.read(), lZ.rw()) # explicit exec place +axpy[32, 64, ctx](2.0, lY.read(), lZ.rw(cudastf.data_place.device(0))) # per-dep placement override diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 3a565b0a8d7..59737cd3060 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -4,6 +4,8 @@ import numpy as np from numba import cuda +import numba +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 from cuda.cccl.experimental.stf._stf_bindings_impl import ( context, From 15c2db0d13b4618a66d617dff8d4816eea6e9545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 7 Aug 2025 23:07:59 +0200 Subject: [PATCH 071/221] revert some changes --- python/cuda_cccl/tests/cooperative/examples/block/reduce.py | 1 - python/cuda_cccl/tests/cooperative/examples/block/scan.py | 1 - python/cuda_cccl/tests/cooperative/examples/warp/reduce.py | 1 - python/cuda_cccl/tests/cooperative/test_block_radix_sort.py | 1 - python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py | 1 + python/cuda_cccl/tests/cooperative/test_warp_scan.py | 1 + 6 files changed, 2 insertions(+), 4 deletions(-) diff --git a/python/cuda_cccl/tests/cooperative/examples/block/reduce.py b/python/cuda_cccl/tests/cooperative/examples/block/reduce.py index 6daf679bd15..d6ac819a4d8 100644 --- a/python/cuda_cccl/tests/cooperative/examples/block/reduce.py +++ b/python/cuda_cccl/tests/cooperative/examples/block/reduce.py @@ -13,7 +13,6 @@ import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 def custom_reduce_example(): diff --git a/python/cuda_cccl/tests/cooperative/examples/block/scan.py b/python/cuda_cccl/tests/cooperative/examples/block/scan.py index 5b055fa8d6f..cae62454e04 100644 --- a/python/cuda_cccl/tests/cooperative/examples/block/scan.py +++ b/python/cuda_cccl/tests/cooperative/examples/block/scan.py @@ -13,7 +13,6 @@ import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 def exclusive_sum_multiple_items_example(): diff --git a/python/cuda_cccl/tests/cooperative/examples/warp/reduce.py b/python/cuda_cccl/tests/cooperative/examples/warp/reduce.py index 8afce70c665..357efc13b42 100644 --- a/python/cuda_cccl/tests/cooperative/examples/warp/reduce.py +++ b/python/cuda_cccl/tests/cooperative/examples/warp/reduce.py @@ -13,7 +13,6 @@ import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 def custom_warp_reduce_example(): diff --git a/python/cuda_cccl/tests/cooperative/test_block_radix_sort.py b/python/cuda_cccl/tests/cooperative/test_block_radix_sort.py index a7e65974a31..bb51a020a7c 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_radix_sort.py +++ b/python/cuda_cccl/tests/cooperative/test_block_radix_sort.py @@ -13,7 +13,6 @@ import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @pytest.mark.parametrize("T", [types.int8, types.int16, types.uint32, types.uint64]) diff --git a/python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py 
b/python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py index 98f090b80f4..398abbc6a7b 100644 --- a/python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py +++ b/python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py @@ -8,6 +8,7 @@ import cuda.cccl.cooperative.experimental as coop + def test_warp_merge_sort(): # example-begin merge-sort # Define comparison operator diff --git a/python/cuda_cccl/tests/cooperative/test_warp_scan.py b/python/cuda_cccl/tests/cooperative/test_warp_scan.py index 1b283bebab6..afb81d3fcaa 100644 --- a/python/cuda_cccl/tests/cooperative/test_warp_scan.py +++ b/python/cuda_cccl/tests/cooperative/test_warp_scan.py @@ -12,6 +12,7 @@ numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + @pytest.mark.parametrize("T", [types.uint32, types.uint64]) def test_warp_exclusive_sum(T): warp_exclusive_sum = coop.warp.exclusive_sum(dtype=T) From 011e2919dc5f873f3da065e1f18b9e9a0bdcc6e9 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 7 Aug 2025 22:47:50 +0000 Subject: [PATCH 072/221] support tuple configs --- python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index 87d286b7124..01929e5d4d5 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -23,7 +23,13 @@ def __getitem__(self, cfg): if exec_pl and not isinstance(exec_pl, exec_place): raise TypeError("4th item must be an exec_place") - self._launch_cfg = (int(gridDim), int(blockDim), ctx, exec_pl) + self._launch_cfg = ( + tuple(gridDim) if isinstance(gridDim, tuple) else (int(gridDim),), + tuple(blockDim) if isinstance(blockDim, tuple) else (int(blockDim),), + ctx, + exec_pl, + ) + return self def __call__(self, *args, **kwargs): From 91e9d4698fdfbab0514110083e7ac6176faa0ccc Mon Sep 17 00:00:00 2001 From: root Date: Thu, 7 Aug 2025 22:48:09 +0000 Subject: [PATCH 073/221] new test --- .../tests/stf/test_stencil_decorator.py | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 python/cuda_cccl/tests/stf/test_stencil_decorator.py diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py new file mode 100644 index 00000000000..9c0bc17182b --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -0,0 +1,75 @@ +import numpy as np +from numba import cuda +import cuda.cccl.experimental.stf as cudastf + +@cudastf.jit +def laplacian_5pt_kernel(u_in, u_out, dx, dy): + """ + Compute a 5?~@~Qpoint Laplacian on u_in and write the result to u_out. + + Grid?~@~Qstride 2?~@~QD kernel. Assumes C?~@~Qcontiguous (row?~@~Qmajor) inputs. + Boundary cells are copied unchanged. 
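    For interior points the update below evaluates the standard second-order
    stencil,
        lap_u[i, j] = (u[i-1, j] - 2*u[i, j] + u[i+1, j]) / dx**2
                    + (u[i, j-1] - 2*u[i, j] + u[i, j+1]) / dy**2,
    which is also what the host-side reference loop in test_numba2d recomputes.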
+ """ + coef_x = 1.0 / (dx * dx) + coef_y = 1.0 / (dy * dy) + + i, j = cuda.grid(2) # i ?~F~T row (x?~@~Qindex), j ?~F~T col (y?~@~Qindex) + nx, ny = u_in.shape + + if i >= nx or j >= ny: + return # out?~@~Qof?~@~Qbounds threads do nothing + + if 0 < i < nx - 1 and 0 < j < ny - 1: + u_out[i, j] = (u_in[i - 1, j] - 2.0 * u_in[i, j] + u_in[i + 1, j]) * coef_x + ( + u_in[i, j - 1] - 2.0 * u_in[i, j] + u_in[i, j + 1] + ) * coef_y + else: + # simple Dirichlet/Neumann placeholder: copy input to output + u_out[i, j] = u_in[i, j] + + +def test_numba2d(): + nx, ny = 1024, 1024 + dx = 2.0 * np.pi / (nx - 1) + dy = 2.0 * np.pi / (ny - 1) + + # a smooth test field: f(x,y) = sin(x) * cos(y) + x = np.linspace(0, 2 * np.pi, nx, dtype=np.float64) + y = np.linspace(0, 2 * np.pi, ny, dtype=np.float64) + + u = np.sin(x)[:, None] * np.cos(y)[None, :] # shape = (nx, ny) + u_out = np.zeros_like(u) + + ctx = cudastf.context() + lu = ctx.logical_data(u) + lu_out = ctx.logical_data(u_out) + + threads_per_block = (16, 16) # 256 threads per block is a solid starting point + blocks_per_grid = ( + (nx + threads_per_block[0] - 1) // threads_per_block[0], + (ny + threads_per_block[1] - 1) // threads_per_block[1], + ) + + laplacian_5pt_kernel[blocks_per_grid, threads_per_block, ctx]( + lu.read(), lu_out.write(), dx, dy + ) + + ctx.finalize() + + u_out_ref = np.zeros_like(u) + + for i in range(1, nx - 1): # skip boundaries + for j in range(1, ny - 1): + u_out_ref[i, j] = (u[i - 1, j] - 2.0 * u[i, j] + u[i + 1, j]) / dx**2 + ( + u[i, j - 1] - 2.0 * u[i, j] + u[i, j + 1] + ) / dy**2 + + # copy boundaries + u_out_ref[0, :] = u[0, :] + u_out_ref[-1, :] = u[-1, :] + u_out_ref[:, 0] = u[:, 0] + u_out_ref[:, -1] = u[:, -1] + + # compare with the GPU result + max_abs_diff = np.abs(u_out - u_out_ref).max() + print(f"max(|gpu - ref|) = {max_abs_diff:.3e}") From 8be7401a58c6dc72dd2e0b8caed55a1347082809 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sat, 9 Aug 2025 09:50:21 +0200 Subject: [PATCH 074/221] Add a new test for places (C interface) --- c/experimental/stf/test/test_places.cpp | 81 +++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 c/experimental/stf/test/test_places.cpp diff --git a/c/experimental/stf/test/test_places.cpp b/c/experimental/stf/test/test_places.cpp new file mode 100644 index 00000000000..eeba229c758 --- /dev/null +++ b/c/experimental/stf/test/test_places.cpp @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
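// A quick map of the C-API calls exercised by this test: T1/T2 rely on the
// default placement, T3 pins where the task executes via make_device_place(0)
// and stf_task_set_exec_place, and T4 leaves the execution place alone but
// attaches a per-dependency data place to lZ via make_device_data_place(0)
// and stf_task_add_dep_with_dplace.  Reading the helpers as "exec place =
// where the task runs, data place = where that dependency's instance lives"
// is an assumption based on their names, not something this patch states.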
+// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +C2H_TEST("empty stf tasks", "[task]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lX, lY, lZ; + + float *X, *Y, *Z; + X = (float*) malloc(N * sizeof(float)); + Y = (float*) malloc(N * sizeof(float)); + Z = (float*) malloc(N * sizeof(float)); + + stf_logical_data(ctx, &lX, X, N * sizeof(float)); + stf_logical_data(ctx, &lY, Y, N * sizeof(float)); + stf_logical_data(ctx, &lZ, Z, N * sizeof(float)); + + stf_logical_data_set_symbol(lX, "X"); + stf_logical_data_set_symbol(lY, "Y"); + stf_logical_data_set_symbol(lZ, "Z"); + + stf_task_handle t1; + stf_task_create(ctx, &t1); + stf_task_set_symbol(t1, "T1"); + stf_task_add_dep(t1, lX, STF_RW); + stf_task_start(t1); + stf_task_end(t1); + + stf_task_handle t2; + stf_task_create(ctx, &t2); + stf_task_set_symbol(t2, "T2"); + stf_task_add_dep(t2, lX, STF_READ); + stf_task_add_dep(t2, lY, STF_RW); + stf_task_start(t2); + stf_task_end(t2); + + stf_task_handle t3; + stf_task_create(ctx, &t3); + stf_task_set_symbol(t3, "T3"); + auto e_place_dev0 = make_device_place(0); + stf_task_set_exec_place(t3, &e_place_dev0); + stf_task_add_dep(t3, lX, STF_READ); + stf_task_add_dep(t3, lZ, STF_RW); + stf_task_start(t3); + stf_task_end(t3); + + stf_task_handle t4; + stf_task_create(ctx, &t4); + stf_task_set_symbol(t4, "T4"); + stf_task_add_dep(t4, lY, STF_READ); + auto d_place_dev0 = make_device_data_place(0); + stf_task_add_dep_with_dplace(t4, lZ, STF_RW, &d_place_dev0); + stf_task_start(t4); + stf_task_end(t4); + + stf_logical_data_destroy(lX); + stf_logical_data_destroy(lY); + stf_logical_data_destroy(lZ); + + stf_ctx_finalize(ctx); + + free(X); + free(Y); + free(Z); +} From a7da2554098c94ae6784d57c2975f81bd387d600 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sat, 9 Aug 2025 09:53:24 +0200 Subject: [PATCH 075/221] clang-format --- .../cuda/cccl/experimental/stf/__init__.py | 4 +--- .../cuda/cccl/experimental/stf/decorator.py | 17 +++++++++----- python/cuda_cccl/tests/stf/test_decorator.py | 22 ++++++++++++------- python/cuda_cccl/tests/stf/test_numba.py | 3 ++- .../tests/stf/test_stencil_decorator.py | 2 ++ 5 files changed, 31 insertions(+), 17 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py index bef32849b2a..ce203e09097 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py @@ -1,10 +1,9 @@ from ._stf_bindings_impl import ( context, + data_place, dep, exec_place, - data_place, ) - from .decorator import jit # Python-side kernel launcher __all__ = [ @@ -14,4 +13,3 @@ "data_place", "jit", ] - diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index 01929e5d4d5..42dfc5b774a 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -1,8 +1,11 @@ -from numba import cuda import numba +from numba import cuda + from cuda.cccl.experimental.stf import context, dep, exec_place + numba.config.CUDA_ENABLE_PYNVJITLINK = 1 + class stf_kernel_decorator: def __init__(self, pyfunc, jit_args, jit_kwargs): self._pyfunc = pyfunc @@ -34,13 +37,15 @@ def __getitem__(self, cfg): def __call__(self, *args, **kwargs): if self._launch_cfg is None: - raise RuntimeError("launch configuration 
missing – use kernel[grid, block, ctx](...)") + raise RuntimeError( + "launch configuration missing – use kernel[grid, block, ctx](...)" + ) gridDim, blockDim, ctx, exec_pl = self._launch_cfg dep_items = [] for i, a in enumerate(args): - print(f'got one arg {a} is dep ? {isinstance(a, dep)}') + print(f"got one arg {a} is dep ? {isinstance(a, dep)}") if isinstance(a, dep): dep_items.append((i, a)) @@ -51,12 +56,14 @@ def __call__(self, *args, **kwargs): dev_args = list(args) print(dev_args) for dep_index, (pos, _) in enumerate(dep_items): - print(f'set arg {dep_index} at position {pos}') + print(f"set arg {dep_index} at position {pos}") dev_args[pos] = t.get_arg_numba(dep_index) if self._compiled_kernel is None: print("compile kernel") - self._compiled_kernel = cuda.jit(*self._jit_args, **self._jit_kwargs)(self._pyfunc) + self._compiled_kernel = cuda.jit(*self._jit_args, **self._jit_kwargs)( + self._pyfunc + ) nb_stream = cuda.external_stream(t.stream_ptr()) self._compiled_kernel[gridDim, blockDim, nb_stream](*dev_args, **kwargs) diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 1a6db0f721d..29fc61280eb 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -1,5 +1,6 @@ import numpy as np from numba import cuda + import cuda.cccl.experimental.stf as cudastf @@ -9,21 +10,26 @@ def axpy(a, x, y): if i < x.size: y[i] = a * x[i] + y[i] + @cudastf.jit def scale(a, x): i = cuda.grid(1) if i < x.size: x[i] = a * x[i] + X, Y, Z = (np.ones(16, np.float32) for _ in range(3)) -ctx = cudastf.context() -lX = ctx.logical_data(X) -lY = ctx.logical_data(Y) -lZ = ctx.logical_data(Z) +ctx = cudastf.context() +lX = ctx.logical_data(X) +lY = ctx.logical_data(Y) +lZ = ctx.logical_data(Z) scale[32, 64, ctx](2.0, lX.rw()) -axpy[32, 64, ctx](2.0, lX.read(), lY.rw()) # default device -axpy[32, 64, ctx, cudastf.exec_place.device(0)](2.0, lX.read(), lZ.rw()) # explicit exec place -axpy[32, 64, ctx](2.0, lY.read(), lZ.rw(cudastf.data_place.device(0))) # per-dep placement override - +axpy[32, 64, ctx](2.0, lX.read(), lY.rw()) # default device +axpy[32, 64, ctx, cudastf.exec_place.device(0)]( + 2.0, lX.read(), lZ.rw() +) # explicit exec place +axpy[32, 64, ctx]( + 2.0, lY.read(), lZ.rw(cudastf.data_place.device(0)) +) # per-dep placement override diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 59737cd3060..a77e771abe6 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -2,9 +2,10 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +import numba import numpy as np from numba import cuda -import numba + numba.config.CUDA_ENABLE_PYNVJITLINK = 1 from cuda.cccl.experimental.stf._stf_bindings_impl import ( diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index 9c0bc17182b..ca3b68f2d17 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -1,7 +1,9 @@ import numpy as np from numba import cuda + import cuda.cccl.experimental.stf as cudastf + @cudastf.jit def laplacian_5pt_kernel(u_in, u_out, dx, dy): """ From 537b3b931ebefe51bb8784ef17dde99f76850431 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 25 Aug 2025 02:59:58 -0700 Subject: [PATCH 076/221] Skit test if we have less than 2 devices --- python/cuda_cccl/tests/stf/test_numba.py | 6 ++++++ 1 file 
changed, 6 insertions(+) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index a77e771abe6..b65391c63f4 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -4,6 +4,8 @@ import numba import numpy as np +import unittest +import pytest from numba import cuda numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @@ -192,6 +194,10 @@ def test_numba_exec_place(): def test_numba_places(): + if len(list(cuda.gpus)) < 2: + pytest.skip("Need at least 2 GPUs") + return + X = np.ones(16, dtype=np.float32) Y = np.ones(16, dtype=np.float32) Z = np.ones(16, dtype=np.float32) From d804d1b211f344f137d0cfc0937763bccc98705a Mon Sep 17 00:00:00 2001 From: root Date: Mon, 25 Aug 2025 06:54:50 -0700 Subject: [PATCH 077/221] Save WIP for like_empty (broken) --- .../stf/include/cccl/c/experimental/stf/stf.h | 1 + c/experimental/stf/src/stf.cu | 14 +++++++++++ .../experimental/stf/_stf_bindings_impl.pyx | 25 +++++++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index caa8769a8b6..64260f14bc6 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -158,6 +158,7 @@ typedef struct stf_logical_data_handle_t* stf_logical_data_handle; void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); void stf_logical_data_destroy(stf_logical_data_handle ld); +void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle* from, stf_logical_data_handle* to); // TODO // void stf_logical_data_wait(stf_logical_data_handle ld); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 33b4cf92489..d6294953e17 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -75,6 +75,20 @@ void stf_logical_data_destroy(stf_logical_data_handle ld) delete ld; } +void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle* from, stf_logical_data_handle* to) +{ + assert(ctx); + assert(from); + assert(to); + + auto ld_typed = ctx->ctx.logical_data(from->ld.shape()); + + // Stored in its untyped version + *to = new stf_logical_data_handle_t{ld_typed}; +} + + + void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) { assert(ctx); diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 01e3e2f0132..3c75758cfb7 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -116,6 +116,7 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) void stf_logical_data_destroy(stf_logical_data_handle ld) + void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle* src, stf_logical_data_handle* dst) ctypedef struct stf_task_handle_t ctypedef stf_task_handle_t* stf_task_handle @@ -208,6 +209,30 @@ cdef class logical_data: def rw(self, dplace=None): return dep(self, AccessMode.RW.value, dplace) + def like_empty(self): + """ + Create a new logical_data with the same shape 
(and dtype metadata) + as this object. + """ + if self._ld == NULL: + raise RuntimeError("source logical_data handle is NULL") + + cdef logical_data out = logical_data.__new__(logical_data) + + out._ctx = self._ctx + out._dtype = self._dtype + out._shape = self._shape + out._ndim = self._ndim + + cdef stf_logical_data_handle new_ld = NULL + stf_logical_data_like_empty(self._ctx._ctx, &self._ld, &new_ld) + + if new_ld == NULL: + raise RuntimeError("stf_logical_data_like_empty returned NULL") + + out._ld = new_ld + return out + class dep: __slots__ = ("ld", "mode", "dplace") def __init__(self, logical_data ld, int mode, dplace=None): From ad83a6301f2f0dc98a8b4d43f042bce0fc48d3c6 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 25 Aug 2025 06:55:13 -0700 Subject: [PATCH 078/221] test with and witjout graphs --- python/cuda_cccl/tests/stf/test_numba.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index b65391c63f4..637b0f5a1ff 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -34,12 +34,13 @@ def scale(a, x): x[i] = a * x[i] +@pytest.mark.parametrize("use_graph_val", [False, True]) def test_numba(): X = np.ones(16, dtype=np.float32) Y = np.ones(16, dtype=np.float32) Z = np.ones(16, dtype=np.float32) - ctx = context() + ctx = context(use_graph=use_graph_val) lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) From f74c1d47b81819d98f496a1b3b7b7204c669e12d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 26 Aug 2025 12:55:41 +0000 Subject: [PATCH 079/221] [pre-commit.ci] auto code formatting --- c/experimental/stf/src/stf.cu | 2 -- python/cuda_cccl/tests/stf/test_numba.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index d6294953e17..2a493a77528 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -87,8 +87,6 @@ void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_hand *to = new stf_logical_data_handle_t{ld_typed}; } - - void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) { assert(ctx); diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 637b0f5a1ff..e4f4f66a0eb 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + import numba import numpy as np -import unittest import pytest from numba import cuda From 95c88a3e93eb245dc3974c6d734b6246534f525e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 26 Aug 2025 14:47:28 +0200 Subject: [PATCH 080/221] remove unit test --- .../experimental/__stf/internal/context.cuh | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index c1075ac5b45..53ab63b2ad2 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -1451,23 +1451,6 @@ UNITTEST("make_tuple_indexwise") EXPECT(t2 == ::std::tuple(0, 2)); }; -UNITTEST("auto_dump set/get") -{ - context ctx; - - int A[1024]; - int B[1024]; - auto lA = ctx.logical_data(A); - auto lB = ctx.logical_data(B); - - // 
Disable auto dump - lA.set_auto_dump(false); - EXPECT(lA.get_auto_dump() == false); - - // Enabled by default - EXPECT(lB.get_auto_dump() == true); -}; - UNITTEST("cuda stream place") { cudaStream_t user_stream; From bc94c747768a2a3fd3b3cf09372f61a749c88e91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 26 Aug 2025 15:22:58 +0200 Subject: [PATCH 081/221] remove stf_logical_data_like_empty which is not designed properly yet --- .../stf/include/cccl/c/experimental/stf/stf.h | 3 +- c/experimental/stf/src/stf.cu | 22 ++++----- .../experimental/stf/_stf_bindings_impl.pyx | 48 +++++++++---------- 3 files changed, 37 insertions(+), 36 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 64260f14bc6..a6bb06353f0 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -158,7 +158,8 @@ typedef struct stf_logical_data_handle_t* stf_logical_data_handle; void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); void stf_logical_data_destroy(stf_logical_data_handle ld); -void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle* from, stf_logical_data_handle* to); +// void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle from, stf_logical_data_handle* +// to); // TODO // void stf_logical_data_wait(stf_logical_data_handle ld); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 2a493a77528..46215e4bff3 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -75,17 +75,17 @@ void stf_logical_data_destroy(stf_logical_data_handle ld) delete ld; } -void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle* from, stf_logical_data_handle* to) -{ - assert(ctx); - assert(from); - assert(to); - - auto ld_typed = ctx->ctx.logical_data(from->ld.shape()); - - // Stored in its untyped version - *to = new stf_logical_data_handle_t{ld_typed}; -} +// void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle from, stf_logical_data_handle* to) +// { +// assert(ctx); +// assert(from); +// assert(to); +// +// auto ld_typed = ctx->ctx.logical_data(from->ld.shape()); +// +// // Stored in its untyped version +// *to = new stf_logical_data_handle_t{ld_typed}; +// } void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) { diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 3c75758cfb7..542025b2f6d 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -116,7 +116,7 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) void stf_logical_data_destroy(stf_logical_data_handle ld) - void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle* src, stf_logical_data_handle* dst) +# void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle* src, stf_logical_data_handle* dst) ctypedef struct stf_task_handle_t ctypedef stf_task_handle_t* 
stf_task_handle @@ -209,29 +209,29 @@ cdef class logical_data: def rw(self, dplace=None): return dep(self, AccessMode.RW.value, dplace) - def like_empty(self): - """ - Create a new logical_data with the same shape (and dtype metadata) - as this object. - """ - if self._ld == NULL: - raise RuntimeError("source logical_data handle is NULL") - - cdef logical_data out = logical_data.__new__(logical_data) - - out._ctx = self._ctx - out._dtype = self._dtype - out._shape = self._shape - out._ndim = self._ndim - - cdef stf_logical_data_handle new_ld = NULL - stf_logical_data_like_empty(self._ctx._ctx, &self._ld, &new_ld) - - if new_ld == NULL: - raise RuntimeError("stf_logical_data_like_empty returned NULL") - - out._ld = new_ld - return out +# def like_empty(self): +# """ +# Create a new logical_data with the same shape (and dtype metadata) +# as this object. +# """ +# if self._ld == NULL: +# raise RuntimeError("source logical_data handle is NULL") +# +# cdef logical_data out = logical_data.__new__(logical_data) +# +# out._ctx = self._ctx +# out._dtype = self._dtype +# out._shape = self._shape +# out._ndim = self._ndim +# +# cdef stf_logical_data_handle new_ld = NULL +# stf_logical_data_like_empty(self._ctx, &self._ld, &new_ld) +# +# if new_ld == NULL: +# raise RuntimeError("stf_logical_data_like_empty returned NULL") +# +# out._ld = new_ld +# return out class dep: __slots__ = ("ld", "mode", "dplace") From 3e476481a437cba6bd5d8e05a79fd6b2506e6f91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 27 Aug 2025 14:37:39 +0200 Subject: [PATCH 082/221] Add a missing header --- c/parallel/src/nvrtc/command_list.h | 1 + 1 file changed, 1 insertion(+) diff --git a/c/parallel/src/nvrtc/command_list.h b/c/parallel/src/nvrtc/command_list.h index 3c4f89548f1..303b3f06d3d 100644 --- a/c/parallel/src/nvrtc/command_list.h +++ b/c/parallel/src/nvrtc/command_list.h @@ -16,6 +16,7 @@ #include #include #include +#include #include From f208979c38e7f742d66805242d53b63bb655cbfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 27 Aug 2025 18:11:22 +0200 Subject: [PATCH 083/221] Install in a place that depends on cuda version --- python/cuda_cccl/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 4872ce996b5..0b3f99edc54 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -66,9 +66,12 @@ install( DESTINATION cuda/cccl/parallel/experimental/${CUDA_VERSION_DIR}/cccl ) + +file(MAKE_DIRECTORY "cuda/cccl/experimental/stf/${CUDA_VERSION_DIR}/cccl") + install( TARGETS cccl.c.experimental.stf - DESTINATION cuda/cccl/experimental/stf/cccl + DESTINATION cuda/cccl/experimental/stf/${CUDA_VERSION_DIR}/cccl ) # Build and install Cython extension @@ -144,6 +147,7 @@ add_custom_command( ARGS ${CYTHON_FLAGS_LIST} "${stf_pyx_source_file}" --output-file ${_stf_generated_extension_src} DEPENDS "${stf_pyx_source_file}" DEPFILE "${_stf_depfile}" + COMMENT "Cythonizing ${pyx_source_file} for CUDA ${CUDA_VERSION_MAJOR}" ) set_source_files_properties("${_stf_generated_extension_src}" PROPERTIES GENERATED TRUE) add_custom_target(cythonize_stf_bindings_impl ALL From 2ca0e3d75c7b0ff73256cc6974a580c07101c211 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 27 Aug 2025 18:11:56 +0200 Subject: [PATCH 084/221] fix pytest example --- python/cuda_cccl/tests/stf/test_numba.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff 
--git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index e4f4f66a0eb..73159f82119 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -34,13 +34,12 @@ def scale(a, x): x[i] = a * x[i] -@pytest.mark.parametrize("use_graph_val", [False, True]) def test_numba(): X = np.ones(16, dtype=np.float32) Y = np.ones(16, dtype=np.float32) Z = np.ones(16, dtype=np.float32) - ctx = context(use_graph=use_graph_val) + ctx = context(use_graph=True) lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) From 7cff926577fcf1bf7c815afdbc67a9ab58f9991a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 27 Aug 2025 19:16:25 +0200 Subject: [PATCH 085/221] Try to use an intermediate "shim" module to import cu12 or cu13 versions --- .../cuda/cccl/experimental/stf/__init__.py | 2 +- .../cccl/experimental/stf/_stf_bindings.py | 56 +++++++++++++++++++ python/cuda_cccl/tests/stf/test_context.py | 2 +- python/cuda_cccl/tests/stf/test_numba.py | 2 +- 4 files changed, 59 insertions(+), 3 deletions(-) create mode 100644 python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py index ce203e09097..873b31b7dcb 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py @@ -1,4 +1,4 @@ -from ._stf_bindings_impl import ( +from ._stf_bindings import ( context, data_place, dep, diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings.py b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings.py new file mode 100644 index 00000000000..c61e908fc8d --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings.py @@ -0,0 +1,56 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# _bindings.py is a shim module that imports symbols from a +# _bindings_impl extension module. The shim serves two purposes: +# +# 1. Import a CUDA-specific extension. The cuda.cccl wheel ships with multiple +# extensions, one for each CUDA version. At runtime, this shim chooses the +# appropriate extension based on the detected CUDA version, and imports all +# symbols from it. +# +# 2. Preload `nvrtc` and `nvJitLink` before importing the extension. +# These shared libraries are indirect dependencies, pulled in via the direct +# dependency `cccl.c.parallel`. To ensure reliable symbol resolution at +# runtime, we explicitly load them first using `cuda.pathfinder`. +# Without this step, importing the Cython extension directly may fail or behave +# inconsistently depending on environment setup and dynamic linker behavior. +# This indirection ensures the right loading order, regardless of how +# `_bindings` is first imported across the codebase. 
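+#
+# Downstream code imports the public names from this shim rather than from a
+# version-specific extension module, e.g. (as the tests updated in this patch
+# do):
+#
+#   from cuda.cccl.experimental.stf._stf_bindings import context, read, rw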
+ +import importlib + +from cuda.cccl._cuda_version_utils import detect_cuda_version, get_recommended_extra +from cuda.pathfinder import ( # type: ignore[import-not-found] + load_nvidia_dynamic_lib, +) + + +def _load_cuda_libraries(): + # Load appropriate libraries for the detected CUDA version + for libname in ("nvrtc", "nvJitLink"): + load_nvidia_dynamic_lib(libname) + + +_load_cuda_libraries() + + +# Import the appropriate bindings implementation depending on what +# CUDA version is available: +cuda_version = detect_cuda_version() +if cuda_version not in [12, 13]: + raise RuntimeError( + f"Unsupported CUDA version: {cuda_version}. Only CUDA 12 and 13 are supported." + ) + +try: + extra_name = get_recommended_extra(cuda_version) + bindings_module = importlib.import_module( + f".{extra_name}._stf_bindings_impl", __package__ + ) + # Import all symbols from the module + globals().update(bindings_module.__dict__) +except ImportError as e: + raise ImportError( + f"Failed to import CUDA STF bindings for CUDA {cuda_version}. " + ) from e diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py index 5a29199dc67..b306cf3571f 100644 --- a/python/cuda_cccl/tests/stf/test_context.py +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -4,7 +4,7 @@ import numpy as np -from cuda.cccl.experimental.stf._stf_bindings_impl import context, read, rw +from cuda.cccl.experimental.stf._stf_bindings import context, read, rw def test_ctx(): diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 73159f82119..11eed0b3749 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -10,7 +10,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 -from cuda.cccl.experimental.stf._stf_bindings_impl import ( +from cuda.cccl.experimental.stf._stf_bindings import ( context, data_place, exec_place, From b8d89ed2638cf0294bc268b41d5148f4cf82911b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 27 Aug 2025 21:26:36 +0200 Subject: [PATCH 086/221] Fix tests (do not use graphs ...) 
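The numba-based tests now configure numba before importing the STF bindings
and, for now, stick to the stream backend rather than CUDA graphs. The common
preamble used by these tests is roughly:

    import numba
    numba.config.CUDA_ENABLE_PYNVJITLINK = 1
    numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0

    import cuda.cccl.experimental.stf as cudastf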
--- python/cuda_cccl/tests/stf/test_decorator.py | 6 +++++- python/cuda_cccl/tests/stf/test_numba.py | 3 ++- python/cuda_cccl/tests/stf/test_stencil_decorator.py | 5 ++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 29fc61280eb..701b14a3574 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -1,8 +1,12 @@ import numpy as np + +import numba from numba import cuda -import cuda.cccl.experimental.stf as cudastf +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +import cuda.cccl.experimental.stf as cudastf @cudastf.jit def axpy(a, x, y): diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 11eed0b3749..aa7afac7552 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -9,6 +9,7 @@ from numba import cuda numba.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 from cuda.cccl.experimental.stf._stf_bindings import ( context, @@ -39,7 +40,7 @@ def test_numba(): Y = np.ones(16, dtype=np.float32) Z = np.ones(16, dtype=np.float32) - ctx = context(use_graph=True) + ctx = context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index ca3b68f2d17..c998ca4d00f 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -1,8 +1,11 @@ import numpy as np +import numba from numba import cuda -import cuda.cccl.experimental.stf as cudastf +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +import cuda.cccl.experimental.stf as cudastf @cudastf.jit def laplacian_5pt_kernel(u_in, u_out, dx, dy): From 740dc86c051ceca63e7e5976ccb9025d0aeb5046 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 09:55:53 +0200 Subject: [PATCH 087/221] Introduce an API to enable graph capture with a low level graph_ctx task --- .../experimental/__stf/graph/graph_task.cuh | 25 ++++++++++++++++++- .../experimental/__stf/internal/context.cuh | 7 ++++++ .../cuda/experimental/__stf/internal/task.cuh | 15 +++++++++++ 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh index 2cb5398c259..ac5f3e13bee 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh @@ -100,6 +100,14 @@ public: ready_dependencies.push_back(ge->node); } } + fprintf(stderr, "graph_task::start() end\n"); + + if (is_capture_enabled()) + { + // Select a stream from the pool + capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; + cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal)); + } return *this; } @@ -109,6 +117,13 @@ public: { ::std::lock_guard<::std::mutex> lock(graph_mutex); + if (is_capture_enabled()) + { + cudaGraph_t childGraph = nullptr; + cuda_safe_call(cudaStreamEndCapture(capture_stream, &childGraph)); + set_child_graph(childGraph); + } + cudaGraphNode_t n; auto done_prereqs = event_list(); @@ -273,6 +288,12 @@ public: return dot.is_timing() || (calibrate && 
statistics.is_calibrating()); } + // Only valid if we have defined a capture stream + cudaStream_t get_stream() const + { + return capture_stream; + } + /** * @brief Invokes a lambda that takes either a `cudaStream_t` or a `cudaGraph_t`. Dependencies must be * set with `add_deps` manually before this call. @@ -337,7 +358,7 @@ public: // // Get a stream from the pool associated to the execution place - cudaStream_t capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; + capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; cudaGraph_t childGraph = nullptr; cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal)); @@ -455,6 +476,8 @@ private: cudaGraph_t child_graph = nullptr; bool must_destroy_child_graph = false; + cudaStream_t capture_stream; + /* If the task corresponds to independent graph nodes, we do not use a * child graph, but add nodes directly */ ::std::vector task_nodes; diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index 53ab63b2ad2..56263ca4ead 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -243,6 +243,13 @@ public: return *this; } + void enable_capture() + { + payload->*[&](auto& self) { + self.enable_capture(); + }; + } + /** * @brief Add dependencies to this task. * diff --git a/cudax/include/cuda/experimental/__stf/internal/task.cuh b/cudax/include/cuda/experimental/__stf/internal/task.cuh index 4136d934a16..2b9945c4d81 100644 --- a/cudax/include/cuda/experimental/__stf/internal/task.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/task.cuh @@ -147,6 +147,10 @@ private: // affine data place of the execution place, but this can be a // composite data place when using a grid of places for example. data_place affine_data_place; + + // Automatically capture work when this is a graph task (ignored with a + // CUDA stream backend). 
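+  //
+  // Intended usage, following the unit test added later in this series (the
+  // kernel launch is only illustrative):
+  //
+  //   auto t = ctx.task(lX.rw());
+  //   t.enable_capture();  // no effect with the CUDA stream backend
+  //   t.start();           // graph backend: begins stream capture
+  //   kernel<<<grid, block, 0, t.get_stream()>>>(...);
+  //   t.end();             // graph backend: ends capture, inserts child graph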
+ bool enable_capture = false; }; protected: @@ -363,6 +367,17 @@ public: return ::std::hash()(pimpl.get()); } + void enable_capture() + { + fprintf(stderr, "task enable capture (generic task)\n"); + pimpl->enable_capture = true; + } + + bool is_capture_enabled() const + { + return pimpl->enable_capture; + } + /** * @brief Start a task * From 9687cbb25e8cb3044c3f3a96bb304875c966caba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 09:56:54 +0200 Subject: [PATCH 088/221] Enable graph capture when launching a numba kernel in the graph_ctx backend --- c/experimental/stf/include/cccl/c/experimental/stf/stf.h | 1 + c/experimental/stf/src/stf.cu | 6 ++++++ .../cuda/cccl/experimental/stf/_stf_bindings_impl.pyx | 4 ++++ 3 files changed, 11 insertions(+) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index a6bb06353f0..7542d156d08 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -179,6 +179,7 @@ void stf_task_end(stf_task_handle t); CUstream stf_task_get_custream(stf_task_handle t); void* stf_task_get(stf_task_handle t, int submitted_index); void stf_task_destroy(stf_task_handle t); +void stf_task_enable_capture(stf_task_handle t); typedef struct stf_cuda_kernel_handle_t* stf_cuda_kernel_handle; diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 46215e4bff3..1a27f3be858 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -190,6 +190,12 @@ void stf_task_end(stf_task_handle t) t->t.end(); } +void stf_task_enable_capture(stf_task_handle t) +{ + assert(t); + t->t.enable_capture(); +} + CUstream stf_task_get_custream(stf_task_handle t) { assert(t); diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 542025b2f6d..5f1deb5754d 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -127,6 +127,7 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_task_add_dep_with_dplace(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, stf_data_place* data_p) void stf_task_start(stf_task_handle t) void stf_task_end(stf_task_handle t) + void stf_task_enable_capture(stf_task_handle t) CUstream stf_task_get_custream(stf_task_handle t) # cudaStream_t stf_task_get_stream(stf_task_handle t) void* stf_task_get(stf_task_handle t, int submitted_index) @@ -350,6 +351,9 @@ cdef class task: # self._lds_args.clear() def start(self): + # This is ignored if this is not a graph task + stf_task_enable_capture(self._t) + stf_task_start(self._t) def end(self): From 5246b658321414505ee253bf3ee69ceefe2f184c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 09:58:59 +0200 Subject: [PATCH 089/221] Use a forked version of numba-cuda with work-arounds for CUDA graphs --- python/cuda_cccl/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index 739ae920ea7..08e5a673540 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "numpy", "cuda-pathfinder>=1.1.0", "cuda-core", - "numba-cuda>=0.18.0", + "numba-cuda @ 
git+https://github.com/caugonnet/numba-cuda.git@cuda_graph_future_memory", ] dynamic = ["version"] From 936bc60f69b4af9914a0a9229d197d8485ef71d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 09:59:54 +0200 Subject: [PATCH 090/221] fix formatting issues --- python/cuda_cccl/tests/stf/test_decorator.py | 4 ++-- python/cuda_cccl/tests/stf/test_stencil_decorator.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 701b14a3574..25089684c75 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -1,6 +1,5 @@ -import numpy as np - import numba +import numpy as np from numba import cuda numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @@ -8,6 +7,7 @@ import cuda.cccl.experimental.stf as cudastf + @cudastf.jit def axpy(a, x, y): i = cuda.grid(1) diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index c998ca4d00f..fd845d67c64 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -1,5 +1,5 @@ -import numpy as np import numba +import numpy as np from numba import cuda numba.config.CUDA_ENABLE_PYNVJITLINK = 1 From 7689834d4d0f5e8a2d12607ee664f06275311226 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 11:20:00 +0200 Subject: [PATCH 091/221] Do return a stream even in the graph_ctx when we are capturing --- .../include/cuda/experimental/__stf/internal/context.cuh | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index 56263ca4ead..93d59f27642 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -292,12 +292,9 @@ public: cudaStream_t get_stream() const { - if (auto p = ::std::get_if>(&payload)) - { - return p->get_stream(); - } - - return nullptr; + return payload->*[&](auto& self) { + return self.get_stream(); + }; } private: From dde406dbda24b503e73dcff712b5dd800ee07fd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 11:21:29 +0200 Subject: [PATCH 092/221] test with graphs --- python/cuda_cccl/tests/stf/test_numba.py | 31 ++++++++++++++++++------ 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index aa7afac7552..9e0b7414438 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -35,20 +35,34 @@ def scale(a, x): x[i] = a * x[i] -def test_numba(): +# One test with a single kernel in a CUDA graph +def test_numba_graph(): X = np.ones(16, dtype=np.float32) - Y = np.ones(16, dtype=np.float32) - Z = np.ones(16, dtype=np.float32) + ctx = context(use_graph=True) + lX = ctx.logical_data(X) + with ctx.task(rw(lX)) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = t.get_arg_numba(0) + scale[32, 64, nb_stream](2.0, dX) + pass + ctx.finalize() - ctx = context() + +def test_numba(): + n=1024*1024 + X = np.ones(n, dtype=np.float32) + Y = np.ones(n, dtype=np.float32) + Z = np.ones(n, dtype=np.float32) + + ctx = context(use_graph=True) lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = 
ctx.logical_data(Z) with ctx.task(rw(lX)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - # dX = t.get_arg_numba(0) - dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) + dX = t.get_arg_numba(0) + # dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) scale[32, 64, nb_stream](2.0, dX) pass @@ -74,6 +88,8 @@ def test_numba(): axpy[32, 64, nb_stream](2.0, dY, dZ) pass + ctx.finalize() + @cuda.jit def laplacian_5pt_kernel(u_in, u_out, dx, dy): @@ -239,4 +255,5 @@ def test_numba_places(): if __name__ == "__main__": print("Running CUDASTF examples...") - test_numba_exec_place() + # test_numba_graph() + test_numba() From 75630141ffc8562f47a042b179b29bba798f28b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 11:53:06 +0200 Subject: [PATCH 093/221] parametrized tests --- python/cuda_cccl/tests/stf/test_decorator.py | 34 +++++++++++--------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 25089684c75..f36017d9883 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -1,5 +1,7 @@ import numba import numpy as np +import pytest + from numba import cuda numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @@ -22,18 +24,20 @@ def scale(a, x): x[i] = a * x[i] -X, Y, Z = (np.ones(16, np.float32) for _ in range(3)) - -ctx = cudastf.context() -lX = ctx.logical_data(X) -lY = ctx.logical_data(Y) -lZ = ctx.logical_data(Z) - -scale[32, 64, ctx](2.0, lX.rw()) -axpy[32, 64, ctx](2.0, lX.read(), lY.rw()) # default device -axpy[32, 64, ctx, cudastf.exec_place.device(0)]( - 2.0, lX.read(), lZ.rw() -) # explicit exec place -axpy[32, 64, ctx]( - 2.0, lY.read(), lZ.rw(cudastf.data_place.device(0)) -) # per-dep placement override +@pytest.mark.parametrize("use_graph", [True, False]) +def test_decorator(use_graph): + X, Y, Z = (np.ones(16, np.float32) for _ in range(3)) + + ctx = cudastf.context(use_graph=use_graph) + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) + + scale[32, 64, ctx](2.0, lX.rw()) + axpy[32, 64, ctx](2.0, lX.read(), lY.rw()) # default device + axpy[32, 64, ctx, cudastf.exec_place.device(0)]( + 2.0, lX.read(), lZ.rw() + ) # explicit exec place + axpy[32, 64, ctx]( + 2.0, lY.read(), lZ.rw(cudastf.data_place.device(0)) + ) # per-dep placement override From b094c27275ade18068dc526ef88ae80b9c577c8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 12:04:06 +0200 Subject: [PATCH 094/221] test that we get a stream in graph_task when capturing --- .../cuda/experimental/__stf/internal/context.cuh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index 93d59f27642..8b50f924f65 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -1574,8 +1574,18 @@ UNITTEST("get_stream graph") auto t = ctx.task(token.write()); t.start(); cudaStream_t s = t.get_stream(); + // We are not capturing so there is no stream associated EXPECT(s == nullptr); t.end(); + + auto t2 = ctx.task(token.write()); + t2.enable_capture(); + t2.start(); + cudaStream_t s = t2.get_stream(); + // We are capturing so the stream used for capture is associated to the task + EXPECT(s != nullptr); + t2.end(); + 
ctx.finalize(); }; From 222c21608d6f15b1f13736ddbbe51b0004cdd333 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 14:14:29 +0200 Subject: [PATCH 095/221] Save WIP: add a mockup of FHE example, which needs a like_empty method --- python/cuda_cccl/tests/stf/test_fhe.py | 152 +++++++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 python/cuda_cccl/tests/stf/test_fhe.py diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py new file mode 100644 index 00000000000..0fead4fb000 --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -0,0 +1,152 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# A toy example to illustrate how we can compose logical operations + +import numba +import numpy as np +import pytest +from numba import cuda + +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + +from cuda.cccl.experimental.stf._stf_bindings import ( + context, + read, + rw, + write, +) + +class Plaintext: + def __init__(self, ctx, values=None, ld=None): + self.ctx = ctx + if not ld is None: + self.l = ld + if not values is None: + self.values = bytearray(values) + self.l = ctx.logical_data(self.values) + self.symbol = None + + def set_symbol(self, symbol: str): + self.l.set_symbol(symbol) + self.symbol = symbol + + def convert_to_vector(self) -> bytearray: + result = bytearray(self.l.buffer) + return result + + def encrypt(self) -> "Ciphertext": + # stub: should return a Ciphertext object wrapping a LogicalData + encrypted = bytearray([c ^ 0x42 for c in self.values]) # toy XOR + return Ciphertext(self.ctx, encrypted) + +@cuda.jit +def and_kernel(a, b, out): + i = cuda.grid(1) + if i < out.size: + out[i] = a[i] & b[i] + +@cuda.jit +def or_kernel(a, b, out): + i = cuda.grid(1) + if i < out.size: + out[i] = a[i] | b[i] + +@cuda.jit +def not_kernel(a, out): + i = cuda.grid(1) + if i < out.size: + out[i] = ~a[i] + +class Ciphertext: + def __init__(self, ctx, values=None, ld=None): + self.ctx = ctx + if not ld is None: + self.l = ld + if values is not None: + self.values = bytearray(values) + self.l = ctx.logical_data(self.values) + self.symbol = None + + # ~ operator + def __invert__(self): + result=Ciphertext(ctx, ld=self.l) + # result=Ciphertext(ctx, ld=self.l.like_empty()) + + with ctx.task(self.l.read(), result.l.write()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + da = t.get_arg_numba(0) + dresult = t.get_arg_numba(1) + not_kernel[32, 16, nb_stream](da, dresult) + + return result + + # | operator + def __or__(self, other): + if not isinstance(other, Ciphertext): + return NotImplemented + + result=Ciphertext(ctx, ld=self.l) + # result=Ciphertext(ctx, ld=self.l.like_empty()) + + with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + da = t.get_arg_numba(0) + db = t.get_arg_numba(1) + dresult = t.get_arg_numba(2) + or_kernel[32, 16, nb_stream](da, db, dresult) + + return result + + + # & operator + def __and__(self, other): + if not isinstance(other, Ciphertext): + return NotImplemented + + result=Ciphertext(ctx, ld=self.l) + # result=Ciphertext(ctx, ld=self.l.like_empty()) + + with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + da = t.get_arg_numba(0) + db = t.get_arg_numba(1) + dresult = t.get_arg_numba(2) + 
and_kernel[32, 16, nb_stream](da, db, dresult) + + return result + + def set_symbol(self, symbol: str): + self.l.set_symbol(symbol) + self.symbol = symbol + + def decrypt(self): + # reverse the toy XOR "encryption" + decrypted = bytearray([c ^ 0x42 for c in self.values]) + return Plaintext(self.ctx, decrypted) + +def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: + return (~((eA | ~eB) & (~eA | eB))) + +ctx = context(use_graph=False) + +vA = [3, 3, 2, 2, 17] +pA = Plaintext(ctx, vA) +pA.set_symbol("A") + +vB = [1, 7, 7, 7, 49] +pB = Plaintext(ctx, vB) +pB.set_symbol("B") + +eA = pA.encrypt() +eB = pB.encrypt() +out = circuit(eA, eB) + +ctx.finalize() + +# v_out = out.decrypt().values +# print("Output vector:", list(v_out)) + + From b04cebf6cd4c8b7484e0e71e55e1bf3222adc141 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 15:49:54 +0200 Subject: [PATCH 096/221] Implement like_empty --- .../stf/include/cccl/c/experimental/stf/stf.h | 3 +- c/experimental/stf/src/stf.cu | 9 +++ .../experimental/stf/_stf_bindings_impl.pyx | 59 +++++++++++-------- python/cuda_cccl/tests/stf/test_fhe.py | 11 ++-- 4 files changed, 48 insertions(+), 34 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 7542d156d08..6b05a18b158 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -158,8 +158,7 @@ typedef struct stf_logical_data_handle_t* stf_logical_data_handle; void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); void stf_logical_data_destroy(stf_logical_data_handle ld); -// void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle from, stf_logical_data_handle* -// to); +void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle *to); // TODO // void stf_logical_data_wait(stf_logical_data_handle ld); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 1a27f3be858..e5a7e7368ec 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -75,6 +75,15 @@ void stf_logical_data_destroy(stf_logical_data_handle ld) delete ld; } +void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle *to) +{ + assert(ctx); + assert(to); + + auto ld_typed = ctx->ctx.logical_data(shape_of>(length)); + *to = new stf_logical_data_handle_t{ld_typed}; +} + // void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle from, stf_logical_data_handle* to) // { // assert(ctx); diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 5f1deb5754d..e8088183ae0 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -116,7 +116,7 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) void stf_logical_data_destroy(stf_logical_data_handle ld) -# void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle* src, stf_logical_data_handle* dst) + void 
stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle *to) ctypedef struct stf_task_handle_t ctypedef stf_task_handle_t* stf_task_handle @@ -162,20 +162,35 @@ class stf_arg_cai: cdef class logical_data: cdef stf_logical_data_handle _ld + cdef stf_ctx_handle _ctx cdef object _dtype cdef tuple _shape cdef int _ndim + cdef size_t _len + + def __cinit__(self, context ctx=None, object buf=None): + if ctx is None or buf is None: + # allow creation via __new__ (eg. in like_empty) + self._ld = NULL + self._ctx = NULL + self._len = 0 + self._dtype = None + self._shape = () + self._ndim = 0 + return - def __cinit__(self, context ctx, object buf): cdef Py_buffer view cdef int flags = PyBUF_FORMAT | PyBUF_ND # request dtype + shape + self._ctx = ctx._ctx + if PyObject_GetBuffer(buf, &view, flags) != 0: raise ValueError("object doesn’t support the full buffer protocol") try: self._ndim = view.ndim + self._len = view.len self._shape = tuple(view.shape[i] for i in range(view.ndim)) self._dtype = np.dtype(view.format) stf_logical_data(ctx._ctx, &self._ld, view.buf, view.len) @@ -210,29 +225,23 @@ cdef class logical_data: def rw(self, dplace=None): return dep(self, AccessMode.RW.value, dplace) -# def like_empty(self): -# """ -# Create a new logical_data with the same shape (and dtype metadata) -# as this object. -# """ -# if self._ld == NULL: -# raise RuntimeError("source logical_data handle is NULL") -# -# cdef logical_data out = logical_data.__new__(logical_data) -# -# out._ctx = self._ctx -# out._dtype = self._dtype -# out._shape = self._shape -# out._ndim = self._ndim -# -# cdef stf_logical_data_handle new_ld = NULL -# stf_logical_data_like_empty(self._ctx, &self._ld, &new_ld) -# -# if new_ld == NULL: -# raise RuntimeError("stf_logical_data_like_empty returned NULL") -# -# out._ld = new_ld -# return out + def like_empty(self): + """ + Create a new logical_data with the same shape (and dtype metadata) + as this object. 
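+
+        The new handle only reuses the metadata (length, shape, dtype); its
+        contents are uninitialized, so it is normally first accessed in write
+        mode. For example:
+
+            lB = lA.like_empty()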
+ """ + if self._ld == NULL: + raise RuntimeError("source logical_data handle is NULL") + + cdef logical_data out = logical_data.__new__(logical_data) + stf_logical_data_empty(self._ctx, self._len, &out._ld) + out._ctx = self._ctx + out._dtype = self._dtype + out._shape = self._shape + out._ndim = self._ndim + out._len = self._len + + return out class dep: __slots__ = ("ld", "mode", "dplace") diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index 0fead4fb000..0b17d1d791f 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -40,7 +40,7 @@ def convert_to_vector(self) -> bytearray: def encrypt(self) -> "Ciphertext": # stub: should return a Ciphertext object wrapping a LogicalData encrypted = bytearray([c ^ 0x42 for c in self.values]) # toy XOR - return Ciphertext(self.ctx, encrypted) + return Ciphertext(self.ctx, values=encrypted) @cuda.jit def and_kernel(a, b, out): @@ -72,8 +72,7 @@ def __init__(self, ctx, values=None, ld=None): # ~ operator def __invert__(self): - result=Ciphertext(ctx, ld=self.l) - # result=Ciphertext(ctx, ld=self.l.like_empty()) + result=Ciphertext(ctx, values=None, ld=self.l.like_empty()) with ctx.task(self.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -88,8 +87,7 @@ def __or__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result=Ciphertext(ctx, ld=self.l) - # result=Ciphertext(ctx, ld=self.l.like_empty()) + result=Ciphertext(ctx, ld=self.l.like_empty()) with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -106,8 +104,7 @@ def __and__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result=Ciphertext(ctx, ld=self.l) - # result=Ciphertext(ctx, ld=self.l.like_empty()) + result=Ciphertext(ctx, ld=self.l.like_empty()) with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) From 9ed5ace8e61c1e2d34fe05c1f97b09a7b442a5df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 16:16:14 +0200 Subject: [PATCH 097/221] More comprehensive FHE test --- .../stf/include/cccl/c/experimental/stf/stf.h | 2 +- c/experimental/stf/src/stf.cu | 10 +-- .../experimental/__stf/internal/context.cuh | 4 +- python/cuda_cccl/tests/stf/test_decorator.py | 5 +- python/cuda_cccl/tests/stf/test_fhe.py | 85 +++++++++++-------- python/cuda_cccl/tests/stf/test_numba.py | 4 +- .../tests/stf/test_stencil_decorator.py | 1 + 7 files changed, 63 insertions(+), 48 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 6b05a18b158..6f2f903e6c8 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -158,7 +158,7 @@ typedef struct stf_logical_data_handle_t* stf_logical_data_handle; void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); void stf_logical_data_destroy(stf_logical_data_handle ld); -void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle *to); +void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle* to); // TODO // void stf_logical_data_wait(stf_logical_data_handle ld); diff --git 
a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index e5a7e7368ec..b82afed3526 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -75,13 +75,13 @@ void stf_logical_data_destroy(stf_logical_data_handle ld) delete ld; } -void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle *to) +void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle* to) { - assert(ctx); - assert(to); + assert(ctx); + assert(to); - auto ld_typed = ctx->ctx.logical_data(shape_of>(length)); - *to = new stf_logical_data_handle_t{ld_typed}; + auto ld_typed = ctx->ctx.logical_data(shape_of>(length)); + *to = new stf_logical_data_handle_t{ld_typed}; } // void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle from, stf_logical_data_handle* to) diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index 8b50f924f65..05950afcd5e 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -293,7 +293,7 @@ public: cudaStream_t get_stream() const { return payload->*[&](auto& self) { - return self.get_stream(); + return self.get_stream(); }; } @@ -1578,7 +1578,7 @@ UNITTEST("get_stream graph") EXPECT(s == nullptr); t.end(); - auto t2 = ctx.task(token.write()); + auto t2 = ctx.task(token.write()); t2.enable_capture(); t2.start(); cudaStream_t s = t2.get_stream(); diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index f36017d9883..79a198d1c4c 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -1,7 +1,6 @@ import numba import numpy as np import pytest - from numba import cuda numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @@ -27,12 +26,12 @@ def scale(a, x): @pytest.mark.parametrize("use_graph", [True, False]) def test_decorator(use_graph): X, Y, Z = (np.ones(16, np.float32) for _ in range(3)) - + ctx = cudastf.context(use_graph=use_graph) lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) - + scale[32, 64, ctx](2.0, lX.rw()) axpy[32, 64, ctx](2.0, lX.read(), lY.rw()) # default device axpy[32, 64, ctx, cudastf.exec_place.device(0)]( diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index 0b17d1d791f..e34bfd42834 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -5,8 +5,6 @@ # A toy example to illustrate how we can compose logical operations import numba -import numpy as np -import pytest from numba import cuda numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @@ -14,65 +12,78 @@ from cuda.cccl.experimental.stf._stf_bindings import ( context, - read, - rw, - write, + data_place, + exec_place, ) + class Plaintext: + # Initialize from actual values, or from a logical data def __init__(self, ctx, values=None, ld=None): self.ctx = ctx - if not ld is None: - self.l = ld - if not values is None: - self.values = bytearray(values) - self.l = ctx.logical_data(self.values) + if ld is not None: + self.l = ld + if values is not None: + self.values = bytearray(values) + self.l = ctx.logical_data(self.values) self.symbol = None def set_symbol(self, symbol: str): self.l.set_symbol(symbol) self.symbol = symbol - def convert_to_vector(self) -> bytearray: - result = bytearray(self.l.buffer) - return result - def encrypt(self) -> "Ciphertext": - # 
stub: should return a Ciphertext object wrapping a LogicalData encrypted = bytearray([c ^ 0x42 for c in self.values]) # toy XOR return Ciphertext(self.ctx, values=encrypted) + def print_values(self): + with ctx.task(exec_place.host(), self.l.read(data_place.managed())) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + hvalues = t.get_arg_numba(0) + print([v for v in hvalues]) + + @cuda.jit def and_kernel(a, b, out): i = cuda.grid(1) if i < out.size: out[i] = a[i] & b[i] + @cuda.jit def or_kernel(a, b, out): i = cuda.grid(1) if i < out.size: out[i] = a[i] | b[i] + @cuda.jit def not_kernel(a, out): i = cuda.grid(1) if i < out.size: out[i] = ~a[i] + +@cuda.jit +def xor_kernel(a, out, v): + i = cuda.grid(1) + if i < out.size: + out[i] = a[i] ^ v + + class Ciphertext: def __init__(self, ctx, values=None, ld=None): self.ctx = ctx - if not ld is None: - self.l = ld + if ld is not None: + self.l = ld if values is not None: - self.values = bytearray(values) - self.l = ctx.logical_data(self.values) + self.values = bytearray(values) + self.l = ctx.logical_data(self.values) self.symbol = None - # ~ operator + # ~ operator def __invert__(self): - result=Ciphertext(ctx, values=None, ld=self.l.like_empty()) + result = Ciphertext(ctx, values=None, ld=self.l.like_empty()) with ctx.task(self.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -82,12 +93,12 @@ def __invert__(self): return result - # | operator + # | operator def __or__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result=Ciphertext(ctx, ld=self.l.like_empty()) + result = Ciphertext(ctx, ld=self.l.like_empty()) with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -98,16 +109,16 @@ def __or__(self, other): return result - - # & operator + # & operator def __and__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result=Ciphertext(ctx, ld=self.l.like_empty()) + result = Ciphertext(ctx, ld=self.l.like_empty()) with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) + nb_stream.synchronize() da = t.get_arg_numba(0) db = t.get_arg_numba(1) dresult = t.get_arg_numba(2) @@ -120,12 +131,20 @@ def set_symbol(self, symbol: str): self.symbol = symbol def decrypt(self): - # reverse the toy XOR "encryption" - decrypted = bytearray([c ^ 0x42 for c in self.values]) - return Plaintext(self.ctx, decrypted) + result = Ciphertext(ctx, ld=self.l.like_empty()) + with ctx.task(self.l.read(), result.l.write()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + da = t.get_arg_numba(0) + dresult = t.get_arg_numba(1) + # reverse the toy XOR "encryption" + xor_kernel[32, 16, nb_stream](da, dresult, 0x42) + + return Plaintext(self.ctx, ld=result.l) + def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: - return (~((eA | ~eB) & (~eA | eB))) + return ~((eA | ~eB) & (~eA | eB)) + ctx = context(use_graph=False) @@ -141,9 +160,5 @@ def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: eB = pB.encrypt() out = circuit(eA, eB) +out.decrypt().print_values() ctx.finalize() - -# v_out = out.decrypt().values -# print("Output vector:", list(v_out)) - - diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 9e0b7414438..a52276295ac 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -49,7 +49,7 @@ def test_numba_graph(): def test_numba(): - 
n=1024*1024 + n = 1024 * 1024 X = np.ones(n, dtype=np.float32) Y = np.ones(n, dtype=np.float32) Z = np.ones(n, dtype=np.float32) @@ -255,5 +255,5 @@ def test_numba_places(): if __name__ == "__main__": print("Running CUDASTF examples...") - # test_numba_graph() + # test_numba_graph() test_numba() diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index fd845d67c64..c20414190db 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -7,6 +7,7 @@ import cuda.cccl.experimental.stf as cudastf + @cudastf.jit def laplacian_5pt_kernel(u_in, u_out, dx, dy): """ From e27ef5b75fca980d4ff8afc0dc3e90157a5346e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 16:31:36 +0200 Subject: [PATCH 098/221] test fhe with stf decorator --- .../cuda_cccl/tests/stf/test_fhe_decorator.py | 144 ++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 python/cuda_cccl/tests/stf/test_fhe_decorator.py diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py new file mode 100644 index 00000000000..ae8c6734f5e --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -0,0 +1,144 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# A toy example to illustrate how we can compose logical operations + +import numba +from numba import cuda + +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + +import cuda.cccl.experimental.stf as cudastf + +from cuda.cccl.experimental.stf._stf_bindings import ( + context, + data_place, + exec_place, +) + + +class Plaintext: + # Initialize from actual values, or from a logical data + def __init__(self, ctx, values=None, ld=None): + self.ctx = ctx + if ld is not None: + self.l = ld + if values is not None: + self.values = bytearray(values) + self.l = ctx.logical_data(self.values) + self.symbol = None + + def set_symbol(self, symbol: str): + self.l.set_symbol(symbol) + self.symbol = symbol + + def encrypt(self) -> "Ciphertext": + encrypted = bytearray([c ^ 0x42 for c in self.values]) # toy XOR + return Ciphertext(self.ctx, values=encrypted) + + def print_values(self): + with ctx.task(exec_place.host(), self.l.read(data_place.managed())) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + hvalues = t.get_arg_numba(0) + print([v for v in hvalues]) + + +@cudastf.jit +def and_kernel(a, b, out): + i = cuda.grid(1) + if i < out.size: + out[i] = a[i] & b[i] + + +@cudastf.jit +def or_kernel(a, b, out): + i = cuda.grid(1) + if i < out.size: + out[i] = a[i] | b[i] + + +@cudastf.jit +def not_kernel(a, out): + i = cuda.grid(1) + if i < out.size: + out[i] = ~a[i] + + +@cudastf.jit +def xor_kernel(a, out, v): + i = cuda.grid(1) + if i < out.size: + out[i] = a[i] ^ v + + +class Ciphertext: + def __init__(self, ctx, values=None, ld=None): + self.ctx = ctx + if ld is not None: + self.l = ld + if values is not None: + self.values = bytearray(values) + self.l = ctx.logical_data(self.values) + self.symbol = None + + # ~ operator + def __invert__(self): + result = Ciphertext(ctx, values=None, ld=self.l.like_empty()) + + not_kernel[32, 16, ctx](self.l.read(), result.l.write()) + + return result + + # | operator + def __or__(self, other): + if not isinstance(other, Ciphertext): + return NotImplemented + + result = 
Ciphertext(ctx, ld=self.l.like_empty()) + or_kernel[32, 16, ctx](self.l.read(), other.l.read(), result.l.write()) + + return result + + # & operator + def __and__(self, other): + if not isinstance(other, Ciphertext): + return NotImplemented + + result = Ciphertext(ctx, ld=self.l.like_empty()) + and_kernel[32, 16, ctx](self.l.read(), other.l.read(), result.l.write()) + + return result + + def set_symbol(self, symbol: str): + self.l.set_symbol(symbol) + self.symbol = symbol + + def decrypt(self): + result = Ciphertext(ctx, ld=self.l.like_empty()) + xor_kernel[32, 16, ctx](self.l.read(), result.l.write(), 0x42) + + return Plaintext(self.ctx, ld=result.l) + + +def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: + return ~((eA | ~eB) & (~eA | eB)) + + +ctx = context(use_graph=False) + +vA = [3, 3, 2, 2, 17] +pA = Plaintext(ctx, vA) +pA.set_symbol("A") + +vB = [1, 7, 7, 7, 49] +pB = Plaintext(ctx, vB) +pB.set_symbol("B") + +eA = pA.encrypt() +eB = pB.encrypt() +out = circuit(eA, eB) + +out.decrypt().print_values() +ctx.finalize() From 6963ec0d97ba605896002a2ea50a21dc1def82bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 21:49:04 +0200 Subject: [PATCH 099/221] fix merge error --- .../include/cuda/experimental/__stf/graph/graph_task.cuh | 8 -------- .../cuda/cccl/experimental/stf/_stf_bindings_impl.pyx | 8 ++++++++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh index 10ce8628d88..4896faa28c2 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh @@ -100,14 +100,6 @@ public: ready_dependencies.push_back(ge->node); } } - fprintf(stderr, "graph_task::start() end\n"); - - if (is_capture_enabled()) - { - // Select a stream from the pool - capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; - cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal)); - } if (is_capture_enabled()) { diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index e8088183ae0..c13ad04c233 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -403,6 +403,14 @@ cdef class task: cdef CUstream s = stf_task_get_custream(self._t) return s # cast pointer -> Py int + def stream_cdata(self): + """ + Return the raw CUstream as a ctypes void pointer. + This can be passed directly to torch.cuda.Stream(cdata=...). 
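+
+        For example, given a task `t` (and assuming a CUDA-enabled torch):
+
+            torch_stream = torch.cuda.Stream(cdata=t.stream_cdata())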
+ """ + cdef CUstream s = stf_task_get_custream(self._t) + return ctypes.c_void_p( s) + def get_arg(self, index) -> int: cdef void *ptr = stf_task_get(self._t, index) return ptr From 06fab11a074df6f98563fa8baee0e816d0fb8234 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 09:58:53 +0200 Subject: [PATCH 100/221] Appropriate checks --- c/experimental/stf/src/stf.cu | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index b82afed3526..4a9cb1815c8 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -24,18 +24,14 @@ struct stf_task_handle_t void stf_ctx_create(stf_ctx_handle* ctx) { - if (ctx) - { - *ctx = new stf_ctx_handle_t{context{}}; - } + assert(ctx) + *ctx = new stf_ctx_handle_t{context{}}; } void stf_ctx_create_graph(stf_ctx_handle* ctx) { - if (ctx) - { - *ctx = new stf_ctx_handle_t{context{graph_ctx()}}; - } + assert(ctx) + *ctx = new stf_ctx_handle_t{context{graph_ctx()}}; } void stf_ctx_finalize(stf_ctx_handle ctx) From 2fc802e35b415eed01b77759e447ebfd4a00c080 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 10:42:30 +0200 Subject: [PATCH 101/221] Add missing ; --- c/experimental/stf/src/stf.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 4a9cb1815c8..d2abedc66d6 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -24,13 +24,13 @@ struct stf_task_handle_t void stf_ctx_create(stf_ctx_handle* ctx) { - assert(ctx) + assert(ctx); *ctx = new stf_ctx_handle_t{context{}}; } void stf_ctx_create_graph(stf_ctx_handle* ctx) { - assert(ctx) + assert(ctx); *ctx = new stf_ctx_handle_t{context{graph_ctx()}}; } From a43db62a8901558095f61dfe240c416a3d14573c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 14:14:32 +0200 Subject: [PATCH 102/221] - Make it possible to create a borrowed context from a handle - Infer the context in the decorator from dependencies if possible --- .../experimental/stf/_stf_bindings_impl.pyx | 45 +++++++++++++++---- .../cuda/cccl/experimental/stf/decorator.py | 45 +++++++++++++------ python/cuda_cccl/tests/stf/test_decorator.py | 11 +++-- .../cuda_cccl/tests/stf/test_fhe_decorator.py | 1 - 4 files changed, 76 insertions(+), 26 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index c13ad04c233..e2c9fa29e26 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -243,6 +243,11 @@ cdef class logical_data: return out + def borrow_ctx_handle(self): + ctx = context(borrowed=True) + ctx.borrow_from_handle(self._ctx) + return ctx + class dep: __slots__ = ("ld", "mode", "dplace") def __init__(self, logical_data ld, int mode, dplace=None): @@ -255,6 +260,8 @@ class dep: yield self.dplace def __repr__(self): return f"dep({self.ld!r}, {self.mode}, {self.dplace!r})" + def get_ld(self): + return self.ld def read(ld, dplace=None): return dep(ld, AccessMode.READ.value, dplace) def write(ld, dplace=None): return dep(ld, AccessMode.WRITE.value, dplace) @@ -437,20 +444,42 @@ cdef class task: cdef class context: cdef stf_ctx_handle _ctx + # Is this a context that we have borrowed ? 
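+    # A borrowed context wraps an stf_ctx_handle owned elsewhere (for example
+    # one recovered from a logical_data via borrow_ctx_handle()); it is never
+    # created nor finalized here.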
+ cdef bint _borrowed + + def __cinit__(self, bint use_graph=False, bint borrowed=False): + self._ctx = NULL + self._borrowed = borrowed + if not borrowed: + if use_graph: + stf_ctx_create_graph(&self._ctx) + else: + stf_ctx_create(&self._ctx) - def __cinit__(self, bint use_graph=False): - if use_graph: - stf_ctx_create_graph(&self._ctx) - else: - stf_ctx_create(&self._ctx) + cdef borrow_from_handle(self, stf_ctx_handle ctx_handle): + if not self._ctx == NULL: + raise RuntimeError("context already initialized") + + if not self._borrowed: + raise RuntimeError("cannot call borrow_from_handle on this context") + + self._ctx = ctx_handle + print(f"borrowing ... new ctx handle = {ctx_handle} self={self}") + + def __repr__(self): + return f"context(handle={self._ctx}, borrowed={self._borrowed})" def __dealloc__(self): - self.finalize() + if not self._borrowed: + self.finalize() def finalize(self): + if self._borrowed: + raise RuntimeError("cannot finalize borrowed context") + if self._ctx != NULL: - stf_ctx_finalize(self._ctx) - self._ctx = NULL + stf_ctx_finalize(self._ctx) + self._ctx = NULL def logical_data(self, object buf): """ diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index 42dfc5b774a..7fb5e1a0337 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -12,26 +12,38 @@ def __init__(self, pyfunc, jit_args, jit_kwargs): self._jit_args = jit_args self._jit_kwargs = jit_kwargs self._compiled_kernel = None - self._launch_cfg = None # (gridDim, blockDim, context, exec_place?) + # (grid_dim, block_dim, ctx_or_none, exec_place_or_none) + self._launch_cfg = None def __getitem__(self, cfg): - if not (len(cfg) == 3 or len(cfg) == 4): - raise TypeError("use kernel[gridDim, blockDim, ctx (, exec_place)]") + # Normalize cfg into (grid_dim, block_dim, ctx, exec_pl) + if not (isinstance(cfg, tuple) or isinstance(cfg, list)): + raise TypeError("use kernel[grid, block (, ctx [, exec_place])]") + n = len(cfg) + if n not in (2, 3, 4): + raise TypeError( + "use kernel[grid, block], kernel[grid, block, ctx], or kernel[grid, block, ctx, exec_place]" + ) + + grid_dim = cfg[0] + block_dim = cfg[1] + ctx = None + exec_pl = None + + if n >= 3: + ctx = cfg[2] + + if n == 4: + exec_pl = cfg[3] - gridDim, blockDim, ctx, *rest = cfg - if not isinstance(ctx, context): - raise TypeError("3rd item must be an STF context") + # Type checks (ctx can be None; exec_pl can be None) + if ctx is not None and not isinstance(ctx, context): + raise TypeError("3rd item must be an STF context (or None to infer)") - exec_pl = rest[0] if rest else None - if exec_pl and not isinstance(exec_pl, exec_place): + if exec_pl is not None and not isinstance(exec_pl, exec_place): raise TypeError("4th item must be an exec_place") - self._launch_cfg = ( - tuple(gridDim) if isinstance(gridDim, tuple) else (int(gridDim),), - tuple(blockDim) if isinstance(blockDim, tuple) else (int(blockDim),), - ctx, - exec_pl, - ) + self._launch_cfg = (grid_dim, block_dim, ctx, exec_pl) return self @@ -47,6 +59,11 @@ def __call__(self, *args, **kwargs): for i, a in enumerate(args): print(f"got one arg {a} is dep ? 
{isinstance(a, dep)}") if isinstance(a, dep): + if ctx == None: + ld = a.get_ld() + # This context will be used in the __call__ method itself + # so we can create a temporary object from the handle + ctx = ld.borrow_ctx_handle() dep_items.append((i, a)) task_args = [exec_pl] if exec_pl else [] diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 79a198d1c4c..7a6ab7f3378 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -32,11 +32,16 @@ def test_decorator(use_graph): lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) - scale[32, 64, ctx](2.0, lX.rw()) - axpy[32, 64, ctx](2.0, lX.read(), lY.rw()) # default device + scale[32, 64](2.0, lX.rw()) + axpy[32, 64](2.0, lX.read(), lY.rw()) axpy[32, 64, ctx, cudastf.exec_place.device(0)]( 2.0, lX.read(), lZ.rw() ) # explicit exec place - axpy[32, 64, ctx]( + axpy[32, 64]( 2.0, lY.read(), lZ.rw(cudastf.data_place.device(0)) ) # per-dep placement override + + +if __name__ == "__main__": + print("Running CUDASTF examples...") + test_decorator(False) diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index ae8c6734f5e..d80733249fb 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -11,7 +11,6 @@ numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 import cuda.cccl.experimental.stf as cudastf - from cuda.cccl.experimental.stf._stf_bindings import ( context, data_place, From 9c0767916be1879c477a4bc82f2736d75b694926 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 14:19:17 +0200 Subject: [PATCH 103/221] invert ctx and exec place in the decorator --- .../cuda/cccl/experimental/stf/decorator.py | 19 ++++++++++--------- python/cuda_cccl/tests/stf/test_decorator.py | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index 7fb5e1a0337..eaf07a12610 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -12,17 +12,17 @@ def __init__(self, pyfunc, jit_args, jit_kwargs): self._jit_args = jit_args self._jit_kwargs = jit_kwargs self._compiled_kernel = None - # (grid_dim, block_dim, ctx_or_none, exec_place_or_none) + # (grid_dim, block_dim, exec_place_or_none, ctx_or_none) self._launch_cfg = None def __getitem__(self, cfg): - # Normalize cfg into (grid_dim, block_dim, ctx, exec_pl) + # Normalize cfg into (grid_dim, block_dim, exec_pl, ctx) if not (isinstance(cfg, tuple) or isinstance(cfg, list)): - raise TypeError("use kernel[grid, block (, ctx [, exec_place])]") + raise TypeError("use kernel[grid, block ([, exec_place, ctx])]") n = len(cfg) if n not in (2, 3, 4): raise TypeError( - "use kernel[grid, block], kernel[grid, block, ctx], or kernel[grid, block, ctx, exec_place]" + "use kernel[grid, block], kernel[grid, block, exec_place], or kernel[grid, block, exec_place, ctx]" ) grid_dim = cfg[0] @@ -31,17 +31,18 @@ def __getitem__(self, cfg): exec_pl = None if n >= 3: - ctx = cfg[2] + exec_pl = cfg[2] if n == 4: - exec_pl = cfg[3] + ctx = cfg[3] + + if exec_pl is not None and not isinstance(exec_pl, exec_place): + raise TypeError("3rd item must be an exec_place") # Type checks (ctx can be None; exec_pl can be None) if ctx is not None and not isinstance(ctx, context): - raise 
TypeError("3rd item must be an STF context (or None to infer)") + raise TypeError("4th item must be an STF context (or None to infer)") - if exec_pl is not None and not isinstance(exec_pl, exec_place): - raise TypeError("4th item must be an exec_place") self._launch_cfg = (grid_dim, block_dim, ctx, exec_pl) diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 7a6ab7f3378..49605ced878 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -34,7 +34,7 @@ def test_decorator(use_graph): scale[32, 64](2.0, lX.rw()) axpy[32, 64](2.0, lX.read(), lY.rw()) - axpy[32, 64, ctx, cudastf.exec_place.device(0)]( + axpy[32, 64, cudastf.exec_place.device(0)]( 2.0, lX.read(), lZ.rw() ) # explicit exec place axpy[32, 64]( From 947bbcc513de7397fc0c9b36cf4b3b73ba53469e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 14:20:45 +0200 Subject: [PATCH 104/221] fix decorator api --- python/cuda_cccl/tests/stf/test_fhe_decorator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index d80733249fb..024586f743c 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -86,7 +86,7 @@ def __init__(self, ctx, values=None, ld=None): def __invert__(self): result = Ciphertext(ctx, values=None, ld=self.l.like_empty()) - not_kernel[32, 16, ctx](self.l.read(), result.l.write()) + not_kernel[32, 16](self.l.read(), result.l.write()) return result @@ -96,7 +96,7 @@ def __or__(self, other): return NotImplemented result = Ciphertext(ctx, ld=self.l.like_empty()) - or_kernel[32, 16, ctx](self.l.read(), other.l.read(), result.l.write()) + or_kernel[32, 16](self.l.read(), other.l.read(), result.l.write()) return result @@ -106,7 +106,7 @@ def __and__(self, other): return NotImplemented result = Ciphertext(ctx, ld=self.l.like_empty()) - and_kernel[32, 16, ctx](self.l.read(), other.l.read(), result.l.write()) + and_kernel[32, 16](self.l.read(), other.l.read(), result.l.write()) return result @@ -116,7 +116,7 @@ def set_symbol(self, symbol: str): def decrypt(self): result = Ciphertext(ctx, ld=self.l.like_empty()) - xor_kernel[32, 16, ctx](self.l.read(), result.l.write(), 0x42) + xor_kernel[32, 16](self.l.read(), result.l.write(), 0x42) return Plaintext(self.ctx, ld=result.l) From 22b2d191608a7a8efde7e288050febbb6dbaeaca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 14:26:10 +0200 Subject: [PATCH 105/221] Add ciphertext.like_empty() --- python/cuda_cccl/tests/stf/test_fhe.py | 11 +++++++---- python/cuda_cccl/tests/stf/test_fhe_decorator.py | 11 ++++++----- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index e34bfd42834..c42742e952c 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -83,7 +83,7 @@ def __init__(self, ctx, values=None, ld=None): # ~ operator def __invert__(self): - result = Ciphertext(ctx, values=None, ld=self.l.like_empty()) + result = self.like_empty() with ctx.task(self.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -98,7 +98,7 @@ def __or__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result = Ciphertext(ctx, 
ld=self.l.like_empty()) + result = self.like_empty() with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -114,7 +114,7 @@ def __and__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result = Ciphertext(ctx, ld=self.l.like_empty()) + result = self.like_empty() with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -131,7 +131,8 @@ def set_symbol(self, symbol: str): self.symbol = symbol def decrypt(self): - result = Ciphertext(ctx, ld=self.l.like_empty()) + result = self.like_empty() + with ctx.task(self.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) da = t.get_arg_numba(0) @@ -141,6 +142,8 @@ def decrypt(self): return Plaintext(self.ctx, ld=result.l) + def like_empty(self): + return Ciphertext(self.ctx, ld=self.l.like_empty()) def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: return ~((eA | ~eB) & (~eA | eB)) diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index 024586f743c..8ed6ebbade5 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -84,8 +84,7 @@ def __init__(self, ctx, values=None, ld=None): # ~ operator def __invert__(self): - result = Ciphertext(ctx, values=None, ld=self.l.like_empty()) - + result = self.like_empty() not_kernel[32, 16](self.l.read(), result.l.write()) return result @@ -95,7 +94,7 @@ def __or__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result = Ciphertext(ctx, ld=self.l.like_empty()) + result = self.like_empty() or_kernel[32, 16](self.l.read(), other.l.read(), result.l.write()) return result @@ -105,7 +104,7 @@ def __and__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result = Ciphertext(ctx, ld=self.l.like_empty()) + result = self.like_empty() and_kernel[32, 16](self.l.read(), other.l.read(), result.l.write()) return result @@ -115,11 +114,13 @@ def set_symbol(self, symbol: str): self.symbol = symbol def decrypt(self): - result = Ciphertext(ctx, ld=self.l.like_empty()) + result = self.like_empty() xor_kernel[32, 16](self.l.read(), result.l.write(), 0x42) return Plaintext(self.ctx, ld=result.l) + def like_empty(self): + return Ciphertext(self.ctx, ld=self.l.like_empty()) def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: return ~((eA | ~eB) & (~eA | eB)) From 66bcde3eef60d9c87987987129946e914fc116fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 14:30:42 +0200 Subject: [PATCH 106/221] Removing prints --- cudax/include/cuda/experimental/__stf/internal/task.cuh | 1 - .../cuda/cccl/experimental/stf/_stf_bindings_impl.pyx | 2 +- python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py | 8 ++++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/task.cuh b/cudax/include/cuda/experimental/__stf/internal/task.cuh index 2b9945c4d81..7c2535e6a97 100644 --- a/cudax/include/cuda/experimental/__stf/internal/task.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/task.cuh @@ -369,7 +369,6 @@ public: void enable_capture() { - fprintf(stderr, "task enable capture (generic task)\n"); pimpl->enable_capture = true; } diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 
e2c9fa29e26..49453be4ad3 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -464,7 +464,7 @@ cdef class context: raise RuntimeError("cannot call borrow_from_handle on this context") self._ctx = ctx_handle - print(f"borrowing ... new ctx handle = {ctx_handle} self={self}") + # print(f"borrowing ... new ctx handle = {ctx_handle} self={self}") def __repr__(self): return f"context(handle={self._ctx}, borrowed={self._borrowed})" diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index eaf07a12610..37ed671ae00 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -58,7 +58,7 @@ def __call__(self, *args, **kwargs): dep_items = [] for i, a in enumerate(args): - print(f"got one arg {a} is dep ? {isinstance(a, dep)}") + # print(f"got one arg {a} is dep ? {isinstance(a, dep)}") if isinstance(a, dep): if ctx == None: ld = a.get_ld() @@ -72,13 +72,13 @@ def __call__(self, *args, **kwargs): with ctx.task(*task_args) as t: dev_args = list(args) - print(dev_args) + # print(dev_args) for dep_index, (pos, _) in enumerate(dep_items): - print(f"set arg {dep_index} at position {pos}") + # print(f"set arg {dep_index} at position {pos}") dev_args[pos] = t.get_arg_numba(dep_index) if self._compiled_kernel is None: - print("compile kernel") + # print("compile kernel") self._compiled_kernel = cuda.jit(*self._jit_args, **self._jit_kwargs)( self._pyfunc ) From 84534c8a2724a6b26cdf870ea46f169049126605 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 14:54:22 +0200 Subject: [PATCH 107/221] do not import specific methods --- python/cuda_cccl/tests/stf/test_fhe.py | 11 +++-------- python/cuda_cccl/tests/stf/test_fhe_decorator.py | 10 ++-------- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index c42742e952c..63f2ec8dc02 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -10,12 +10,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -from cuda.cccl.experimental.stf._stf_bindings import ( - context, - data_place, - exec_place, -) - +import cuda.cccl.experimental.stf as cudastf class Plaintext: # Initialize from actual values, or from a logical data @@ -37,7 +32,7 @@ def encrypt(self) -> "Ciphertext": return Ciphertext(self.ctx, values=encrypted) def print_values(self): - with ctx.task(exec_place.host(), self.l.read(data_place.managed())) as t: + with ctx.task(cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed())) as t: nb_stream = cuda.external_stream(t.stream_ptr()) hvalues = t.get_arg_numba(0) print([v for v in hvalues]) @@ -149,7 +144,7 @@ def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: return ~((eA | ~eB) & (~eA | eB)) -ctx = context(use_graph=False) +ctx = cudastf.context(use_graph=False) vA = [3, 3, 2, 2, 17] pA = Plaintext(ctx, vA) diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index 8ed6ebbade5..a84ed8687c6 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -11,12 +11,6 @@ numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 import cuda.cccl.experimental.stf as cudastf -from 
cuda.cccl.experimental.stf._stf_bindings import ( - context, - data_place, - exec_place, -) - class Plaintext: # Initialize from actual values, or from a logical data @@ -38,7 +32,7 @@ def encrypt(self) -> "Ciphertext": return Ciphertext(self.ctx, values=encrypted) def print_values(self): - with ctx.task(exec_place.host(), self.l.read(data_place.managed())) as t: + with ctx.task(cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed())) as t: nb_stream = cuda.external_stream(t.stream_ptr()) hvalues = t.get_arg_numba(0) print([v for v in hvalues]) @@ -126,7 +120,7 @@ def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: return ~((eA | ~eB) & (~eA | eB)) -ctx = context(use_graph=False) +ctx = cudastf.context(use_graph=False) vA = [3, 3, 2, 2, 17] pA = Plaintext(ctx, vA) From acf0cce6035f71407f9bc102df09135b2eba3f92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 14:56:26 +0200 Subject: [PATCH 108/221] fix decorator api --- python/cuda_cccl/tests/stf/test_stencil_decorator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index c20414190db..8e52a72f00a 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -56,7 +56,7 @@ def test_numba2d(): (ny + threads_per_block[1] - 1) // threads_per_block[1], ) - laplacian_5pt_kernel[blocks_per_grid, threads_per_block, ctx]( + laplacian_5pt_kernel[blocks_per_grid, threads_per_block]( lu.read(), lu_out.write(), dx, dy ) From 6a6e84fbe0377ae6b152acd4f34754510039635b Mon Sep 17 00:00:00 2001 From: root Date: Fri, 29 Aug 2025 08:05:28 -0700 Subject: [PATCH 109/221] Add a pytorch experiment --- python/cuda_cccl/tests/stf/test_pytorch.py | 85 ++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 python/cuda_cccl/tests/stf/test_pytorch.py diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py new file mode 100644 index 00000000000..41d1b2f1ca9 --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -0,0 +1,85 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + + +import numba +import numpy as np +import pytest +import torch +from numba import cuda + +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + +from cuda.cccl.experimental.stf._stf_bindings import ( + context, + data_place, + exec_place, + read, + rw, + write, +) + +import torch + +def torch_from_cai(obj): + """ + Convert an object exposing the CUDA Array Interface (__cuda_array_interface__) + into a torch.Tensor (on GPU). Zero-copy if possible. + + Strategy: + 1. If obj has .to_dlpack(), use it directly. + 2. Otherwise, try to wrap with CuPy (which understands CAI) and then use DLPack. + """ + # Path 1: direct DLPack (Numba >=0.53, some other libs) + if hasattr(obj, "to_dlpack"): + return torch.utils.dlpack.from_dlpack(obj.to_dlpack()) + + # Path 2: via CuPy bridge + try: + import cupy as cp + except ImportError as e: + raise RuntimeError( + "Object does not support .to_dlpack and CuPy is not installed. " + "Cannot convert __cuda_array_interface__ to torch.Tensor." 
+ ) from e + + # CuPy knows how to wrap CAI + cupy_arr = cp.asarray(obj) + return torch.utils.dlpack.from_dlpack(cupy_arr.toDlpack()) + + +def test_pytorch(): + n = 1024 * 1024 + X = np.ones(n, dtype=np.float32) + Y = np.ones(n, dtype=np.float32) + Z = np.ones(n, dtype=np.float32) + + ctx = context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) + + with ctx.task(rw(lX)) as t: + sptr = t.stream_ptr() + torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) + with torch.cuda.stream(torch_stream): + # dX = t.get_arg_numba(0) + dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) + tX = torch_from_cai(dX) + # capsule = dX.toDlpack() + # tX = torch.utils.dlpack.from_dlpack(capsule) + tX = tX*2 + pass +# nb_stream = cuda.external_stream(t.stream_ptr()) + + # dX = dX * 2 + pass + + ctx.finalize() + +if __name__ == "__main__": + print("Running CUDASTF examples...") + test_pytorch() + From 297a69bd6a562cbd49595d92e98e35aa61874f0a Mon Sep 17 00:00:00 2001 From: root Date: Fri, 29 Aug 2025 08:17:18 -0700 Subject: [PATCH 110/221] more pytorch test --- python/cuda_cccl/tests/stf/test_pytorch.py | 40 +++++++++++++++++++--- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index 41d1b2f1ca9..6949292c53c 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -65,16 +65,46 @@ def test_pytorch(): sptr = t.stream_ptr() torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) with torch.cuda.stream(torch_stream): - # dX = t.get_arg_numba(0) dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) tX = torch_from_cai(dX) - # capsule = dX.toDlpack() - # tX = torch.utils.dlpack.from_dlpack(capsule) tX = tX*2 pass -# nb_stream = cuda.external_stream(t.stream_ptr()) + pass - # dX = dX * 2 + with ctx.task(lX.read(), lY.write()) as t: + sptr = t.stream_ptr() + torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) + with torch.cuda.stream(torch_stream): + dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) + tX = torch_from_cai(dX) + dY = cuda.from_cuda_array_interface(t.get_arg_cai(1), owner=None, sync=False) + tY = torch_from_cai(dY) + tY = tX*2 + pass + pass + + with ctx.task(lX.read(), lZ.write()) as t: + sptr = t.stream_ptr() + torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) + with torch.cuda.stream(torch_stream): + dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) + tX = torch_from_cai(dX) + dZ = cuda.from_cuda_array_interface(t.get_arg_cai(1), owner=None, sync=False) + tZ = torch_from_cai(dY) + tZ = tX*4 + 1 + pass + pass + + with ctx.task(lY.read(), lZ.rw()) as t: + sptr = t.stream_ptr() + torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) + with torch.cuda.stream(torch_stream): + dY = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) + tY = torch_from_cai(dX) + dZ = cuda.from_cuda_array_interface(t.get_arg_cai(1), owner=None, sync=False) + tZ = torch_from_cai(dY) + tZ = tY*2 - 3 + pass pass ctx.finalize() From 533ca5a67af0d9e8e7e7ff74196cac6efdaf204d Mon Sep 17 00:00:00 2001 From: root Date: Fri, 29 Aug 2025 08:41:01 -0700 Subject: [PATCH 111/221] better interop with pytorch --- .../experimental/stf/_stf_bindings_impl.pyx | 38 +++++++++++++++++++ 
python/cuda_cccl/tests/stf/test_pytorch.py | 19 ++++------ 2 files changed, 45 insertions(+), 12 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 49453be4ad3..185b0e03467 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -160,6 +160,40 @@ class stf_arg_cai: 'stream': self.stream, # CUDA stream for access } +import torch + +class CAIWrapper: + def __init__(self, cai_dict): + self.__cuda_array_interface__ = cai_dict + +def torch_from_cai(obj): + """ + Convert an object exposing the CUDA Array Interface (__cuda_array_interface__) + into a torch.Tensor (on GPU). Zero-copy if possible. + + Strategy: + 1. If obj has .to_dlpack(), use it directly. + 2. Otherwise, try to wrap with CuPy (which understands CAI) and then use DLPack. + """ + # Path 1: direct DLPack (Numba >=0.53, some other libs) + if hasattr(obj, "to_dlpack"): + return torch.utils.dlpack.from_dlpack(obj.to_dlpack()) + + # Path 2: via CuPy bridge + try: + import cupy as cp + except ImportError as e: + raise RuntimeError( + "Object does not support .to_dlpack and CuPy is not installed. " + "Cannot convert __cuda_array_interface__ to torch.Tensor." + ) from e + + #if isinstance(obj, dict) and "__cuda_array_interface__" in obj: + obj = CAIWrapper(obj) # wrap the dict + + cupy_arr = cp.asarray(obj) + return torch.utils.dlpack.from_dlpack(cupy_arr.toDlpack()) + cdef class logical_data: cdef stf_logical_data_handle _ld cdef stf_ctx_handle _ctx @@ -430,6 +464,10 @@ cdef class task: cai = self.get_arg_cai(index) return cuda.from_cuda_array_interface(cai, owner=None, sync=False) + def get_arg_as_tensor(self, index): + cai = self.get_arg_cai(index) + return torch_from_cai(cai) + # ---- context‑manager helpers ------------------------------- def __enter__(self): self.start() diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index 6949292c53c..d5d4c913aea 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -67,6 +67,7 @@ def test_pytorch(): with torch.cuda.stream(torch_stream): dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) tX = torch_from_cai(dX) + # same as tX =t.get_arg_as_tensor(0) tX = tX*2 pass pass @@ -75,10 +76,8 @@ def test_pytorch(): sptr = t.stream_ptr() torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) with torch.cuda.stream(torch_stream): - dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) - tX = torch_from_cai(dX) - dY = cuda.from_cuda_array_interface(t.get_arg_cai(1), owner=None, sync=False) - tY = torch_from_cai(dY) + tX =t.get_arg_as_tensor(0) + tY =t.get_arg_as_tensor(1) tY = tX*2 pass pass @@ -87,10 +86,8 @@ def test_pytorch(): sptr = t.stream_ptr() torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) with torch.cuda.stream(torch_stream): - dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) - tX = torch_from_cai(dX) - dZ = cuda.from_cuda_array_interface(t.get_arg_cai(1), owner=None, sync=False) - tZ = torch_from_cai(dY) + tX =t.get_arg_as_tensor(0) + tZ =t.get_arg_as_tensor(1) tZ = tX*4 + 1 pass pass @@ -99,10 +96,8 @@ def test_pytorch(): sptr = t.stream_ptr() torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) with 
torch.cuda.stream(torch_stream): - dY = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) - tY = torch_from_cai(dX) - dZ = cuda.from_cuda_array_interface(t.get_arg_cai(1), owner=None, sync=False) - tZ = torch_from_cai(dY) + tY =t.get_arg_as_tensor(0) + tZ =t.get_arg_as_tensor(1) tZ = tY*2 - 3 pass pass From 9aa749f038d37068698f4c36824e48a578a560f5 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 29 Aug 2025 08:49:28 -0700 Subject: [PATCH 112/221] remove useless pass --- python/cuda_cccl/tests/stf/test_pytorch.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index d5d4c913aea..f44226f476f 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -69,8 +69,6 @@ def test_pytorch(): tX = torch_from_cai(dX) # same as tX =t.get_arg_as_tensor(0) tX = tX*2 - pass - pass with ctx.task(lX.read(), lY.write()) as t: sptr = t.stream_ptr() @@ -79,8 +77,6 @@ def test_pytorch(): tX =t.get_arg_as_tensor(0) tY =t.get_arg_as_tensor(1) tY = tX*2 - pass - pass with ctx.task(lX.read(), lZ.write()) as t: sptr = t.stream_ptr() @@ -89,8 +85,6 @@ def test_pytorch(): tX =t.get_arg_as_tensor(0) tZ =t.get_arg_as_tensor(1) tZ = tX*4 + 1 - pass - pass with ctx.task(lY.read(), lZ.rw()) as t: sptr = t.stream_ptr() @@ -99,8 +93,6 @@ def test_pytorch(): tY =t.get_arg_as_tensor(0) tZ =t.get_arg_as_tensor(1) tZ = tY*2 - 3 - pass - pass ctx.finalize() From b11aa4b3e53faa7b8c18240d34498463ffe3c837 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 29 Aug 2025 09:28:00 -0700 Subject: [PATCH 113/221] tensor_arguments --- .../cuda/cccl/experimental/stf/_stf_bindings_impl.pyx | 4 ++++ python/cuda_cccl/tests/stf/test_pytorch.py | 6 ++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 185b0e03467..9d04737753e 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -468,6 +468,10 @@ cdef class task: cai = self.get_arg_cai(index) return torch_from_cai(cai) + def tensor_arguments(self): + arg_cnt=len(self._lds_args) + return tuple(self.get_arg_as_tensor(i) for i in range(arg_cnt)) + # ---- context‑manager helpers ------------------------------- def __enter__(self): self.start() diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index f44226f476f..8ca1585dd24 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -82,16 +82,14 @@ def test_pytorch(): sptr = t.stream_ptr() torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) with torch.cuda.stream(torch_stream): - tX =t.get_arg_as_tensor(0) - tZ =t.get_arg_as_tensor(1) + tX, tY = t.tensor_arguments() tZ = tX*4 + 1 with ctx.task(lY.read(), lZ.rw()) as t: sptr = t.stream_ptr() torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) with torch.cuda.stream(torch_stream): - tY =t.get_arg_as_tensor(0) - tZ =t.get_arg_as_tensor(1) + tX, tZ = t.tensor_arguments() tZ = tY*2 - 3 ctx.finalize() From 0af151f2a6a27d95f7201567226dac1faa0176c9 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 29 Aug 2025 10:19:51 -0700 Subject: [PATCH 114/221] simpler code --- python/cuda_cccl/tests/stf/test_pytorch.py | 24 ++++++++-------------- 1 file 
changed, 8 insertions(+), 16 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index 8ca1585dd24..98c0f9cb220 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -62,8 +62,7 @@ def test_pytorch(): lZ = ctx.logical_data(Z) with ctx.task(rw(lX)) as t: - sptr = t.stream_ptr() - torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) + torch_stream = torch.cuda.ExternalStream(t.stream_ptr()) with torch.cuda.stream(torch_stream): dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) tX = torch_from_cai(dX) @@ -71,26 +70,19 @@ def test_pytorch(): tX = tX*2 with ctx.task(lX.read(), lY.write()) as t: - sptr = t.stream_ptr() - torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) + torch_stream = torch.cuda.ExternalStream(t.stream_ptr()) with torch.cuda.stream(torch_stream): tX =t.get_arg_as_tensor(0) tY =t.get_arg_as_tensor(1) tY = tX*2 - with ctx.task(lX.read(), lZ.write()) as t: - sptr = t.stream_ptr() - torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) - with torch.cuda.stream(torch_stream): - tX, tY = t.tensor_arguments() - tZ = tX*4 + 1 + with ctx.task(lX.read(), lZ.write()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + tX, tY = t.tensor_arguments() + tZ = tX*4 + 1 - with ctx.task(lY.read(), lZ.rw()) as t: - sptr = t.stream_ptr() - torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) - with torch.cuda.stream(torch_stream): - tX, tZ = t.tensor_arguments() - tZ = tY*2 - 3 + with ctx.task(lY.read(), lZ.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + tX, tZ = t.tensor_arguments() + tZ = tY*2 - 3 ctx.finalize() From 746d308b4a82b682a3dcf2b963107a47a63110d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 21:34:28 +0200 Subject: [PATCH 115/221] pre-commit hooks --- .../cuda/cccl/experimental/stf/decorator.py | 9 ++--- python/cuda_cccl/tests/stf/test_fhe.py | 6 ++- .../cuda_cccl/tests/stf/test_fhe_decorator.py | 6 ++- python/cuda_cccl/tests/stf/test_pytorch.py | 40 ++++++++++--------- 4 files changed, 35 insertions(+), 26 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index 37ed671ae00..c7179d2a6fc 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -43,7 +43,6 @@ def __getitem__(self, cfg): if ctx is not None and not isinstance(ctx, context): raise TypeError("4th item must be an STF context (or None to infer)") - self._launch_cfg = (grid_dim, block_dim, ctx, exec_pl) return self @@ -58,7 +57,7 @@ def __call__(self, *args, **kwargs): dep_items = [] for i, a in enumerate(args): - # print(f"got one arg {a} is dep ? {isinstance(a, dep)}") + # print(f"got one arg {a} is dep ? 
{isinstance(a, dep)}") if isinstance(a, dep): if ctx == None: ld = a.get_ld() @@ -72,13 +71,13 @@ def __call__(self, *args, **kwargs): with ctx.task(*task_args) as t: dev_args = list(args) - # print(dev_args) + # print(dev_args) for dep_index, (pos, _) in enumerate(dep_items): - # print(f"set arg {dep_index} at position {pos}") + # print(f"set arg {dep_index} at position {pos}") dev_args[pos] = t.get_arg_numba(dep_index) if self._compiled_kernel is None: - # print("compile kernel") + # print("compile kernel") self._compiled_kernel = cuda.jit(*self._jit_args, **self._jit_kwargs)( self._pyfunc ) diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index 63f2ec8dc02..9ec86af51c8 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -12,6 +12,7 @@ import cuda.cccl.experimental.stf as cudastf + class Plaintext: # Initialize from actual values, or from a logical data def __init__(self, ctx, values=None, ld=None): @@ -32,7 +33,9 @@ def encrypt(self) -> "Ciphertext": return Ciphertext(self.ctx, values=encrypted) def print_values(self): - with ctx.task(cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed())) as t: + with ctx.task( + cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) + ) as t: nb_stream = cuda.external_stream(t.stream_ptr()) hvalues = t.get_arg_numba(0) print([v for v in hvalues]) @@ -140,6 +143,7 @@ def decrypt(self): def like_empty(self): return Ciphertext(self.ctx, ld=self.l.like_empty()) + def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: return ~((eA | ~eB) & (~eA | eB)) diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index a84ed8687c6..bb369b6f250 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -12,6 +12,7 @@ import cuda.cccl.experimental.stf as cudastf + class Plaintext: # Initialize from actual values, or from a logical data def __init__(self, ctx, values=None, ld=None): @@ -32,7 +33,9 @@ def encrypt(self) -> "Ciphertext": return Ciphertext(self.ctx, values=encrypted) def print_values(self): - with ctx.task(cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed())) as t: + with ctx.task( + cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) + ) as t: nb_stream = cuda.external_stream(t.stream_ptr()) hvalues = t.get_arg_numba(0) print([v for v in hvalues]) @@ -116,6 +119,7 @@ def decrypt(self): def like_empty(self): return Ciphertext(self.ctx, ld=self.l.like_empty()) + def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: return ~((eA | ~eB) & (~eA | eB)) diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index 98c0f9cb220..a487eaca1c1 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -5,7 +5,6 @@ import numba import numpy as np -import pytest import torch from numba import cuda @@ -13,15 +12,10 @@ numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 from cuda.cccl.experimental.stf._stf_bindings import ( - context, - data_place, - exec_place, - read, + context, rw, - write, ) -import torch def torch_from_cai(obj): """ @@ -64,29 +58,37 @@ def test_pytorch(): with ctx.task(rw(lX)) as t: torch_stream = torch.cuda.ExternalStream(t.stream_ptr()) with torch.cuda.stream(torch_stream): - dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) + dX = 
cuda.from_cuda_array_interface( + t.get_arg_cai(0), owner=None, sync=False + ) tX = torch_from_cai(dX) - # same as tX =t.get_arg_as_tensor(0) - tX = tX*2 + # same as tX =t.get_arg_as_tensor(0) + tX = tX * 2 with ctx.task(lX.read(), lY.write()) as t: torch_stream = torch.cuda.ExternalStream(t.stream_ptr()) with torch.cuda.stream(torch_stream): - tX =t.get_arg_as_tensor(0) - tY =t.get_arg_as_tensor(1) - tY = tX*2 - - with ctx.task(lX.read(), lZ.write()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + tX = t.get_arg_as_tensor(0) + tY = t.get_arg_as_tensor(1) + tY = tX * 2 + + with ( + ctx.task(lX.read(), lZ.write()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): tX, tY = t.tensor_arguments() - tZ = tX*4 + 1 + tZ = tX * 4 + 1 - with ctx.task(lY.read(), lZ.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(lY.read(), lZ.rw()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): tX, tZ = t.tensor_arguments() - tZ = tY*2 - 3 + tZ = tY * 2 - 3 ctx.finalize() + if __name__ == "__main__": print("Running CUDASTF examples...") test_pytorch() - From d9195f581410c8e1fb6ea9ddfdae8cc649c0c27e Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 31 Aug 2025 16:50:23 +0200 Subject: [PATCH 116/221] try to remove dependency on torch and have adapters (WIP) --- .../cuda/cccl/experimental/stf/__init__.py | 8 ++++ .../stf/_adapters/numba_bridge.py | 3 ++ .../stf/_adapters/torch_bridge.py | 36 +++++++++++++++++ .../experimental/stf/_stf_bindings_impl.pyx | 40 +++---------------- 4 files changed, 52 insertions(+), 35 deletions(-) create mode 100644 python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py create mode 100644 python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py index 873b31b7dcb..2e63ff6f856 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py @@ -13,3 +13,11 @@ "data_place", "jit", ] + +def has_torch() -> bool: + import importlib.util + return importlib.util.find_spec("torch") is not None + +def has_numba() -> bool: + import importlib.util + return importlib.util.find_spec("numba") is not None diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py new file mode 100644 index 00000000000..ac42377dcd2 --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py @@ -0,0 +1,3 @@ +def cai_to_numba(cai: dict): + from numba import cuda as _cuda + return _cuda.from_cuda_array_interface(cai) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py new file mode 100644 index 00000000000..9bf6feea784 --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +def cai_to_torch(cai: dict): + """ + Convert a __cuda_array_interface__ dict to a torch.Tensor + without making PyTorch a hard dependency of the core extension. + + Strategy (in order): + 1) Try Numba -> DLPack -> torch (fast & common). + 2) Try CuPy -> DLPack -> torch (common on CUDA setups). + 3) Otherwise, error with a clear message. 
+ """ + import torch + + # 1) Numba bridge + try: + from numba import cuda as _cuda + dev_array = _cuda.from_cuda_array_interface(cai) + return torch.utils.dlpack.from_dlpack(dev_array.to_dlpack()) + except Exception: + pass + + # 2) CuPy bridge + try: + import cupy as cp + + class _cai_wrapper: + def __init__(self, d): self.__cuda_array_interface__ = d + + cp_arr = cp.asarray(_cai_wrapper(cai)) + return torch.utils.dlpack.from_dlpack(cp_arr.toDlpack()) + except Exception as e: + raise RuntimeError( + "Could not convert __cuda_array_interface__ to torch.Tensor. " + "Install numba or cupy (or expose a DLPack capsule natively)." + ) from e diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 9d04737753e..be4cce311b9 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -160,40 +160,6 @@ class stf_arg_cai: 'stream': self.stream, # CUDA stream for access } -import torch - -class CAIWrapper: - def __init__(self, cai_dict): - self.__cuda_array_interface__ = cai_dict - -def torch_from_cai(obj): - """ - Convert an object exposing the CUDA Array Interface (__cuda_array_interface__) - into a torch.Tensor (on GPU). Zero-copy if possible. - - Strategy: - 1. If obj has .to_dlpack(), use it directly. - 2. Otherwise, try to wrap with CuPy (which understands CAI) and then use DLPack. - """ - # Path 1: direct DLPack (Numba >=0.53, some other libs) - if hasattr(obj, "to_dlpack"): - return torch.utils.dlpack.from_dlpack(obj.to_dlpack()) - - # Path 2: via CuPy bridge - try: - import cupy as cp - except ImportError as e: - raise RuntimeError( - "Object does not support .to_dlpack and CuPy is not installed. " - "Cannot convert __cuda_array_interface__ to torch.Tensor." - ) from e - - #if isinstance(obj, dict) and "__cuda_array_interface__" in obj: - obj = CAIWrapper(obj) # wrap the dict - - cupy_arr = cp.asarray(obj) - return torch.utils.dlpack.from_dlpack(cupy_arr.toDlpack()) - cdef class logical_data: cdef stf_logical_data_handle _ld cdef stf_ctx_handle _ctx @@ -466,7 +432,11 @@ cdef class task: def get_arg_as_tensor(self, index): cai = self.get_arg_cai(index) - return torch_from_cai(cai) + try: + from cuda.cccl.experimental.stf._adapters.torch_bridge import cai_to_torch + except Exception as e: + raise RuntimeError("PyTorch support is not available") from e + return cai_to_torch(cai) def tensor_arguments(self): arg_cnt=len(self._lds_args) From f5ac828ff7bf6b90cf844f737844f0f9e8ba540c Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 31 Aug 2025 16:51:52 +0200 Subject: [PATCH 117/221] remove unused code --- .../cuda/cccl/experimental/stf/_stf_bindings_impl.pyx | 8 -------- 1 file changed, 8 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index be4cce311b9..3948ff9a4b4 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -410,14 +410,6 @@ cdef class task: cdef CUstream s = stf_task_get_custream(self._t) return s # cast pointer -> Py int - def stream_cdata(self): - """ - Return the raw CUstream as a ctypes void pointer. - This can be passed directly to torch.cuda.Stream(cdata=...). 
- """ - cdef CUstream s = stf_task_get_custream(self._t) - return ctypes.c_void_p( s) - def get_arg(self, index) -> int: cdef void *ptr = stf_task_get(self._t, index) return ptr From 454a5dac1af2d32e724d181174d8e48b7ccc23ab Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 31 Aug 2025 18:13:22 +0200 Subject: [PATCH 118/221] cleanups --- .../experimental/stf/_stf_bindings_impl.pyx | 9 +++++--- python/cuda_cccl/tests/stf/test_numba.py | 22 ++++--------------- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 3948ff9a4b4..d4cda1107c4 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -13,9 +13,8 @@ from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t from libc.stdint cimport uintptr_t from libc.string cimport memset, memcpy +# TODO remove that dependency import numpy as np -from numba import cuda - from cpython.buffer cimport ( Py_buffer, PyBUF_SIMPLE, PyBUF_ANY_CONTIGUOUS, @@ -420,7 +419,11 @@ cdef class task: def get_arg_numba(self, index): cai = self.get_arg_cai(index) - return cuda.from_cuda_array_interface(cai, owner=None, sync=False) + try: + from cuda.cccl.experimental.stf._adapters.numba_bridge import cai_to_numba + except Exception as e: + raise RuntimeError("numba support is not available") from e + return cai_to_numba(cai) def get_arg_as_tensor(self, index): cai = self.get_arg_cai(index) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index a52276295ac..ed7105386ea 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -44,7 +44,7 @@ def test_numba_graph(): nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) scale[32, 64, nb_stream](2.0, dX) - pass + ctx.finalize() @@ -54,7 +54,7 @@ def test_numba(): Y = np.ones(n, dtype=np.float32) Z = np.ones(n, dtype=np.float32) - ctx = context(use_graph=True) + ctx = context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) @@ -64,7 +64,6 @@ def test_numba(): dX = t.get_arg_numba(0) # dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) scale[32, 64, nb_stream](2.0, dX) - pass with ctx.task(read(lX), rw(lY)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -72,21 +71,18 @@ def test_numba(): dX = t.get_arg_numba(0) dY = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dY) - pass with ctx.task(read(lX), rw(lZ)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dZ) - pass with ctx.task(read(lY), rw(lZ)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dY = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dY, dZ) - pass ctx.finalize() @@ -145,7 +141,6 @@ def test_numba2d(): laplacian_5pt_kernel[blocks_per_grid, threads_per_block, nb_stream]( du, du_out, dx, dy ) - pass ctx.finalize() @@ -183,7 +178,6 @@ def test_numba_exec_place(): # dX = t.get_arg_numba(0) dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) scale[32, 64, nb_stream](2.0, dX) - pass with ctx.task(exec_place.device(0), lX.read(), lY.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -191,7 +185,6 @@ def test_numba_exec_place(): dX = t.get_arg_numba(0) dY = 
t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dY) - pass with ctx.task( exec_place.device(0), lX.read(data_place.managed()), lZ.rw(data_place.managed()) @@ -200,14 +193,12 @@ def test_numba_exec_place(): dX = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dZ) - pass with ctx.task(exec_place.device(0), lY.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dY = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dY, dZ) - pass def test_numba_places(): @@ -228,7 +219,6 @@ def test_numba_places(): nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) scale[32, 64, nb_stream](2.0, dX) - pass with ctx.task(lX.read(), lY.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -236,24 +226,20 @@ def test_numba_places(): dX = t.get_arg_numba(0) dY = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dY) - pass with ctx.task(exec_place.device(1), lX.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dZ) - pass with ctx.task(lY.read(), lZ.rw(data_place.device(1))) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dY = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dY, dZ) - pass - if __name__ == "__main__": print("Running CUDASTF examples...") - # test_numba_graph() - test_numba() + test_numba_graph() + # test_numba() From ccfbb6b016b60afe7d2a543f11fb19c6aa6faedd Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 31 Aug 2025 18:23:02 +0200 Subject: [PATCH 119/221] fix numba adapter --- .../cuda/cccl/experimental/stf/_adapters/numba_bridge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py index ac42377dcd2..7e411e91f12 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py @@ -1,3 +1,3 @@ def cai_to_numba(cai: dict): - from numba import cuda as _cuda - return _cuda.from_cuda_array_interface(cai) + from numba import cuda + return cuda.from_cuda_array_interface(cai, owner=None, sync=False) From c6e7c07ad3641cfa96f24e8a8b83a4417eeed5db Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 31 Aug 2025 18:26:48 +0200 Subject: [PATCH 120/221] skip torch test if torch is not available --- python/cuda_cccl/tests/stf/test_pytorch.py | 36 ++-------------------- 1 file changed, 3 insertions(+), 33 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index a487eaca1c1..f5bf448b430 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -5,7 +5,8 @@ import numba import numpy as np -import torch +import pytest +torch = pytest.importorskip("torch") from numba import cuda numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @@ -17,33 +18,6 @@ ) -def torch_from_cai(obj): - """ - Convert an object exposing the CUDA Array Interface (__cuda_array_interface__) - into a torch.Tensor (on GPU). Zero-copy if possible. - - Strategy: - 1. If obj has .to_dlpack(), use it directly. - 2. Otherwise, try to wrap with CuPy (which understands CAI) and then use DLPack. 
- """ - # Path 1: direct DLPack (Numba >=0.53, some other libs) - if hasattr(obj, "to_dlpack"): - return torch.utils.dlpack.from_dlpack(obj.to_dlpack()) - - # Path 2: via CuPy bridge - try: - import cupy as cp - except ImportError as e: - raise RuntimeError( - "Object does not support .to_dlpack and CuPy is not installed. " - "Cannot convert __cuda_array_interface__ to torch.Tensor." - ) from e - - # CuPy knows how to wrap CAI - cupy_arr = cp.asarray(obj) - return torch.utils.dlpack.from_dlpack(cupy_arr.toDlpack()) - - def test_pytorch(): n = 1024 * 1024 X = np.ones(n, dtype=np.float32) @@ -58,11 +32,7 @@ def test_pytorch(): with ctx.task(rw(lX)) as t: torch_stream = torch.cuda.ExternalStream(t.stream_ptr()) with torch.cuda.stream(torch_stream): - dX = cuda.from_cuda_array_interface( - t.get_arg_cai(0), owner=None, sync=False - ) - tX = torch_from_cai(dX) - # same as tX =t.get_arg_as_tensor(0) + tX = t.tensor_arguments() tX = tX * 2 with ctx.task(lX.read(), lY.write()) as t: From 842a6516d64d68e1d0cac06971f2ecaf2a5dca6d Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 31 Aug 2025 19:05:00 +0200 Subject: [PATCH 121/221] add dot vertex even in the low level api --- .../cuda/experimental/__stf/graph/graph_task.cuh | 11 ++++++----- .../experimental/__stf/stream/stream_task.cuh | 16 ++++++---------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh index 4896faa28c2..c9426f71c53 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh @@ -108,6 +108,12 @@ public: cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal)); } + auto& dot = ctx.get_dot(); + if (dot.is_tracing()) + { + dot.template add_vertex(*this); + } + return *this; } @@ -598,11 +604,6 @@ public: clear(); }; - if (dot.is_tracing()) - { - dot.template add_vertex(*this); - } - constexpr bool fun_invocable_stream_deps = ::std::is_invocable_v; constexpr bool fun_invocable_stream_non_void_deps = reserved::is_applicable_v>; diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh index ae5dd3fb77b..4f89ff3d191 100644 --- a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh @@ -203,6 +203,12 @@ public: insert_dependencies(stream_grid); } + auto& dot = ctx.get_dot(); + if (dot->is_tracing()) + { + dot->template add_vertex(*this); + } + return *this; } @@ -308,11 +314,6 @@ public: clear(); }; - if (dot->is_tracing()) - { - dot->template add_vertex(*this); - } - // Default for the first argument is a `cudaStream_t`. 
if constexpr (::std::is_invocable_v) { @@ -575,11 +576,6 @@ public: clear(); }; - if (dot->is_tracing()) - { - dot->template add_vertex(*this); - } - if constexpr (::std::is_invocable_v) { // Invoke passing this task's stream as the first argument, followed by the slices From 00c649cd068f3cc072e2c904cdf91b7b3098991a Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 31 Aug 2025 19:11:02 +0200 Subject: [PATCH 122/221] fix types --- cudax/include/cuda/experimental/__stf/graph/graph_task.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh index c9426f71c53..d243cb32634 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh @@ -108,7 +108,7 @@ public: cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal)); } - auto& dot = ctx.get_dot(); + auto& dot = *ctx.get_dot(); if (dot.is_tracing()) { dot.template add_vertex(*this); From b0fc18dbb4a70ea81e33ab8d0c6e16f58ab60489 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 31 Aug 2025 19:11:28 +0200 Subject: [PATCH 123/221] pre-commit hooks --- python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py | 4 ++++ .../cuda/cccl/experimental/stf/_adapters/numba_bridge.py | 1 + .../cuda/cccl/experimental/stf/_adapters/torch_bridge.py | 5 ++++- python/cuda_cccl/tests/stf/test_numba.py | 1 + python/cuda_cccl/tests/stf/test_pytorch.py | 2 +- 5 files changed, 11 insertions(+), 2 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py index 2e63ff6f856..6ca687dfcb3 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py @@ -14,10 +14,14 @@ "jit", ] + def has_torch() -> bool: import importlib.util + return importlib.util.find_spec("torch") is not None + def has_numba() -> bool: import importlib.util + return importlib.util.find_spec("numba") is not None diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py index 7e411e91f12..32b160ba879 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py @@ -1,3 +1,4 @@ def cai_to_numba(cai: dict): from numba import cuda + return cuda.from_cuda_array_interface(cai, owner=None, sync=False) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py index 9bf6feea784..eda137fb577 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py @@ -1,5 +1,6 @@ from __future__ import annotations + def cai_to_torch(cai: dict): """ Convert a __cuda_array_interface__ dict to a torch.Tensor @@ -15,6 +16,7 @@ def cai_to_torch(cai: dict): # 1) Numba bridge try: from numba import cuda as _cuda + dev_array = _cuda.from_cuda_array_interface(cai) return torch.utils.dlpack.from_dlpack(dev_array.to_dlpack()) except Exception: @@ -25,7 +27,8 @@ def cai_to_torch(cai: dict): import cupy as cp class _cai_wrapper: - def __init__(self, d): self.__cuda_array_interface__ = d + def __init__(self, d): + self.__cuda_array_interface__ = d cp_arr = 
cp.asarray(_cai_wrapper(cai)) return torch.utils.dlpack.from_dlpack(cp_arr.toDlpack()) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index ed7105386ea..35fb749c68c 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -239,6 +239,7 @@ def test_numba_places(): dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dY, dZ) + if __name__ == "__main__": print("Running CUDASTF examples...") test_numba_graph() diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index f5bf448b430..8c1349b89e5 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -6,8 +6,8 @@ import numba import numpy as np import pytest + torch = pytest.importorskip("torch") -from numba import cuda numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 From 04cc07ac73a750b1b4598594dbb75e1dae08e6fa Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 31 Aug 2025 19:29:29 +0200 Subject: [PATCH 124/221] dot add_vertex is done in start() now --- .../cuda/experimental/__stf/internal/cuda_kernel_scope.cuh | 6 ------ .../cuda/experimental/__stf/internal/host_launch_scope.cuh | 5 ----- cudax/include/cuda/experimental/__stf/internal/launch.cuh | 5 ----- .../cuda/experimental/__stf/internal/parallel_for_scope.cuh | 5 ----- 4 files changed, 21 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index a5443643f40..38011a1e844 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -376,12 +376,6 @@ public: } } - auto& dot = *ctx.get_dot(); - if (dot.is_tracing()) - { - dot.template add_vertex(t); - } - return *this; } diff --git a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh index 9783dc53f38..df02b67f20c 100644 --- a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh @@ -139,11 +139,6 @@ public: t.clear(); }; - if (dot.is_tracing()) - { - dot.template add_vertex(t); - } - auto payload = [&]() { if constexpr (called_from_launch) { diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh index 6c42032a949..b0e34edc253 100644 --- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh @@ -360,11 +360,6 @@ public: nvtx_range nr(t.get_symbol().c_str()); t.start(); - if (dot.is_tracing()) - { - dot.template add_vertex(t); - } - int device; cudaEvent_t start_event, end_event; diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh index 6c66e918015..7bf0a819415 100644 --- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh @@ -583,11 +583,6 @@ public: } } - if (dot.is_tracing()) - { - dot.template add_vertex(t); - } - static constexpr bool need_reduction = (deps_ops_t::does_work || ...); # if __NVCOMPILER From bce25b8f9e220b39afe8803c97e92edf9cacb2a3 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 1 Sep 2025 11:47:04 +0200 Subject: [PATCH 125/221] Start to implement the FDTD example in pytorch --- .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 python/cuda_cccl/tests/stf/test_fdtd_pytorch.py diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py new file mode 100644 index 00000000000..1a0c9483f6b --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -0,0 +1,112 @@ +import math +from typing import Tuple, Optional + +import torch + +def fdtd_3d_pytorch( + size_x: int = 100, + size_y: int = 100, + size_z: int = 100, + timesteps: int = 10, + output_freq: int = 0, + dx: float = 0.01, + dy: float = 0.01, + dz: float = 0.01, + epsilon0: float = 8.85e-12, + mu0: float = 1.256e-6, + device: Optional[torch.device] = None, + dtype: torch.dtype = torch.float64, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + if device is None: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # allocate fields + shape = (size_x, size_y, size_z) + ex = torch.zeros(shape, dtype=dtype, device=device) + ey = torch.zeros_like(ex) + ez = torch.zeros_like(ex) + + hx = torch.zeros_like(ex) + hy = torch.zeros_like(ex) + hz = torch.zeros_like(ex) + + epsilon = torch.full(shape, float(epsilon0), dtype=dtype, device=device) + mu = torch.full(shape, float(mu0), dtype=dtype, device=device) + + # CFL (same formula as example) + dt = 0.25 * min(dx, dy, dz) * math.sqrt(epsilon0 * mu0) + + # Es (interior) = [1..N-2] along all dims -> enables i-1, j-1, k-1 + i_es, j_es, k_es = slice(1, -1), slice(1, -1), slice(1, -1) + i_es_m, j_es_m, k_es_m = slice(0, -2), slice(0, -2), slice(0, -2) + + # Hs (base) = [0..N-2] along all dims -> enables i+1, j+1, k+1 + i_hs, j_hs, k_hs = slice(0, -1), slice(0, -1), slice(0, -1) + i_hs_p, j_hs_p, k_hs_p = slice(1, None), slice(1, None), slice(1, None) + + # source location (single cell at center) + cx, cy, cz = size_x // 2, size_y // 2, size_z // 2 + + def source(t: float, x: float, y: float, z: float) -> float: + # sin(k*x - omega*t) with f = 1e9 Hz + pi = math.pi + freq = 1.0e9 + omega = 2.0 * pi * freq + wavelength = 3.0e8 / freq + k = 2.0 * pi / wavelength + return math.sin(k * x - omega * t) + + for n in range(int(timesteps)): + # ------------------------- + # update electric fields (Es) + # Ex(i,j,k) += (dt/(ε*dx)) * [(Hz(i,j,k)-Hz(i,j-1,k)) - (Hy(i,j,k)-Hy(i,j,k-1))] + ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dx)) * ( + (hz[i_es, j_es, k_es] - hz[i_es, j_es_m, k_es]) + - (hy[i_es, j_es, k_es] - hy[i_es, j_es, k_es_m]) + ) + + # Ey(i,j,k) += (dt/(ε*dy)) * [(Hx(i,j,k)-Hx(i,j,k-1)) - (Hz(i,j,k)-Hz(i-1,j,k))] + ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dy)) * ( + (hx[i_es, j_es, k_es] - hx[i_es, j_es, k_es_m]) + - (hz[i_es, j_es, k_es] - hz[i_es_m, j_es, k_es]) + ) + + # Ez(i,j,k) += (dt/(ε*dz)) * [(Hy(i,j,k)-Hy(i-1,j,k)) - (Hx(i,j,k)-Hx(i,j-1,k))] + ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dz)) * ( + (hy[i_es, j_es, k_es] - hy[i_es_m, j_es, k_es]) + - (hx[i_es, j_es, k_es] - hx[i_es, j_es_m, k_es]) + ) + + # source at center cell + ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) + + # ------------------------- + # update magnetic fields (Hs) + # Hx(i,j,k) -= (dt/(μ*dy)) * 
[(Ez(i,j+1,k)-Ez(i,j,k)) - (Ey(i,j,k+1)-Ey(i,j,k))] + hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dy)) * ( + (ez[i_hs, j_hs_p, k_hs] - ez[i_hs, j_hs, k_hs]) + - (ey[i_hs, j_hs, k_hs_p] - ey[i_hs, j_hs, k_hs]) + ) + + # Hy(i,j,k) -= (dt/(μ*dz)) * [(Ex(i,j,k+1)-Ex(i,j,k)) - (Ez(i+1,j,k)-Ez(i,j,k))] + hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dz)) * ( + (ex[i_hs, j_hs, k_hs_p] - ex[i_hs, j_hs, k_hs]) + - (ez[i_hs_p, j_hs, k_hs] - ez[i_hs, j_hs, k_hs]) + ) + + # Hz(i,j,k) -= (dt/(μ*dx)) * [(Ey(i+1,j,k)-Ey(i,j,k)) - (Ex(i,j+1,k)-Ex(i,j,k))] + hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dx)) * ( + (ey[i_hs_p, j_hs, k_hs] - ey[i_hs, j_hs, k_hs]) + - (ex[i_hs, j_hs_p, k_hs] - ex[i_hs, j_hs, k_hs]) + ) + + if output_freq > 0 and (n % output_freq) == 0: + print(f"{n}\t{ez[cx, cy, cz].item():.6e}") + + return ex, ey, ez, hx, hy, hz + + +if __name__ == "__main__": + # quick check + ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=1000, output_freq=5) + print("done; Ez(center) =", ez[50, 50, 50].item()) From d9c5f1194e3d0f35d1871e94c0dc2654d711841f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 1 Sep 2025 12:02:48 +0200 Subject: [PATCH 126/221] Start to port in STF version of pytorch --- .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 106 ++++++++++++------ 1 file changed, 69 insertions(+), 37 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 1a0c9483f6b..f8cc500a026 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -1,6 +1,11 @@ import math from typing import Tuple, Optional +from cuda.cccl.experimental.stf._stf_bindings import ( + context, + rw, +) + import torch def fdtd_3d_pytorch( @@ -17,21 +22,32 @@ def fdtd_3d_pytorch( device: Optional[torch.device] = None, dtype: torch.dtype = torch.float64, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - if device is None: - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + ctx = context() # allocate fields shape = (size_x, size_y, size_z) - ex = torch.zeros(shape, dtype=dtype, device=device) - ey = torch.zeros_like(ex) - ez = torch.zeros_like(ex) + ex_ = torch.zeros(shape, dtype=dtype, device=device) + ey_ = torch.zeros_like(ex_) + ez_ = torch.zeros_like(ex_) + + hx_ = torch.zeros_like(ex_) + hy_ = torch.zeros_like(ex_) + hz_ = torch.zeros_like(ex_) - hx = torch.zeros_like(ex) - hy = torch.zeros_like(ex) - hz = torch.zeros_like(ex) + epsilon_ = torch.full(shape, float(epsilon0), dtype=dtype, device=device) + mu_ = torch.full(shape, float(mu0), dtype=dtype, device=device) - epsilon = torch.full(shape, float(epsilon0), dtype=dtype, device=device) - mu = torch.full(shape, float(mu0), dtype=dtype, device=device) + lex = ctx.logical_data(ex_) + ley = ctx.logical_data(ey_) + lez = ctx.logical_data(ez_) + + lhx = ctx.logical_data(hx_) + lhy = ctx.logical_data(hy_) + lhz = ctx.logical_data(hz_) + + lepsilon = ctx.logical_data(epsilon_) + lmu = ctx.logical_data(mu_) # CFL (same formula as example) dt = 0.25 * min(dx, dy, dz) * math.sqrt(epsilon0 * mu0) @@ -60,48 +76,64 @@ def source(t: float, x: float, y: float, z: float) -> float: # ------------------------- # update electric fields (Es) # Ex(i,j,k) += (dt/(ε*dx)) * [(Hz(i,j,k)-Hz(i,j-1,k)) - (Hy(i,j,k)-Hy(i,j,k-1))] - ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + (dt / 
(epsilon[i_es, j_es, k_es] * dx)) * ( - (hz[i_es, j_es, k_es] - hz[i_es, j_es_m, k_es]) - - (hy[i_es, j_es, k_es] - hy[i_es, j_es, k_es_m]) - ) + with ctx.task(lex.rw(), lhy.read(), lhz.read(), lepsilon.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + ex, hy, hz, epsilon = t.tensor_arguments() + ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dx)) * ( + (hz[i_es, j_es, k_es] - hz[i_es, j_es_m, k_es]) + - (hy[i_es, j_es, k_es] - hy[i_es, j_es, k_es_m]) + ) # Ey(i,j,k) += (dt/(ε*dy)) * [(Hx(i,j,k)-Hx(i,j,k-1)) - (Hz(i,j,k)-Hz(i-1,j,k))] - ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dy)) * ( - (hx[i_es, j_es, k_es] - hx[i_es, j_es, k_es_m]) - - (hz[i_es, j_es, k_es] - hz[i_es_m, j_es, k_es]) - ) + with ctx.task(ley.rw(), lhx.read(), lhz.read(), lepsilon.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + ey, hx, hz, epsilon = t.tensor_arguments() + ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dy)) * ( + (hx[i_es, j_es, k_es] - hx[i_es, j_es, k_es_m]) + - (hz[i_es, j_es, k_es] - hz[i_es_m, j_es, k_es]) + ) # Ez(i,j,k) += (dt/(ε*dz)) * [(Hy(i,j,k)-Hy(i-1,j,k)) - (Hx(i,j,k)-Hx(i,j-1,k))] - ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dz)) * ( - (hy[i_es, j_es, k_es] - hy[i_es_m, j_es, k_es]) - - (hx[i_es, j_es, k_es] - hx[i_es, j_es_m, k_es]) - ) + with ctx.task(lez.rw(), lhx.read(), lhy.read(), lepsilon.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + ez, hx, hy, epsilon = t.tensor_arguments() + ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dz)) * ( + (hy[i_es, j_es, k_es] - hy[i_es_m, j_es, k_es]) + - (hx[i_es, j_es, k_es] - hx[i_es, j_es_m, k_es]) + ) # source at center cell - ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) + with ctx.task(lez.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + ez = t.tensor_arguments() + ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) # ------------------------- # update magnetic fields (Hs) # Hx(i,j,k) -= (dt/(μ*dy)) * [(Ez(i,j+1,k)-Ez(i,j,k)) - (Ey(i,j,k+1)-Ey(i,j,k))] - hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dy)) * ( - (ez[i_hs, j_hs_p, k_hs] - ez[i_hs, j_hs, k_hs]) - - (ey[i_hs, j_hs, k_hs_p] - ey[i_hs, j_hs, k_hs]) - ) + with ctx.task(lhx.rw(), ley.read(), lez.read(), lmu.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + hx, ey, ez, mu = t.tensor_arguments() + hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dy)) * ( + (ez[i_hs, j_hs_p, k_hs] - ez[i_hs, j_hs, k_hs]) + - (ey[i_hs, j_hs, k_hs_p] - ey[i_hs, j_hs, k_hs]) + ) # Hy(i,j,k) -= (dt/(μ*dz)) * [(Ex(i,j,k+1)-Ex(i,j,k)) - (Ez(i+1,j,k)-Ez(i,j,k))] - hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dz)) * ( - (ex[i_hs, j_hs, k_hs_p] - ex[i_hs, j_hs, k_hs]) - - (ez[i_hs_p, j_hs, k_hs] - ez[i_hs, j_hs, k_hs]) - ) + with ctx.task(lhy.rw(), lex.read(), lez.read(), lmu.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + hy, ex, ez, mu = t.tensor_arguments() + hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dz)) * ( + (ex[i_hs, j_hs, k_hs_p] - ex[i_hs, j_hs, k_hs]) + - (ez[i_hs_p, j_hs, k_hs] - ez[i_hs, j_hs, k_hs]) + ) # Hz(i,j,k) -= (dt/(μ*dx)) * [(Ey(i+1,j,k)-Ey(i,j,k)) - (Ex(i,j+1,k)-Ex(i,j,k))] - 
hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dx)) * ( - (ey[i_hs_p, j_hs, k_hs] - ey[i_hs, j_hs, k_hs]) - - (ex[i_hs, j_hs_p, k_hs] - ex[i_hs, j_hs, k_hs]) - ) + with ctx.task(lhz.rw(), lex.read(), ley.read(), lmu.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + hz, ex, ey, mu = t.tensor_arguments() + hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dx)) * ( + (ey[i_hs_p, j_hs, k_hs] - ey[i_hs, j_hs, k_hs]) + - (ex[i_hs, j_hs_p, k_hs] - ex[i_hs, j_hs, k_hs]) + ) + +#  if output_freq > 0 and (n % output_freq) == 0: +#  print(f"{n}\t{ez[cx, cy, cz].item():.6e}") - if output_freq > 0 and (n % output_freq) == 0: - print(f"{n}\t{ez[cx, cy, cz].item():.6e}") + ctx.finalize() return ex, ey, ez, hx, hy, hz From 70fa5d8c001b51e0d6de180d22ffc9b3fa8d9c3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 1 Sep 2025 13:54:51 +0200 Subject: [PATCH 127/221] Adapt the FDTD example to use STF constructs and add methods to initialize a logical data by its shape --- .../experimental/stf/_stf_bindings_impl.pyx | 21 +++++- .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 67 +++++++++++-------- 2 files changed, 59 insertions(+), 29 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index d4cda1107c4..3b4ba4db39b 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -12,6 +12,7 @@ from cpython.bytes cimport PyBytes_FromStringAndSize from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t from libc.stdint cimport uintptr_t from libc.string cimport memset, memcpy +import math # for math.prod # TODO remove that dependency import numpy as np @@ -168,7 +169,7 @@ cdef class logical_data: cdef int _ndim cdef size_t _len - def __cinit__(self, context ctx=None, object buf=None): + def __cinit__(self, context ctx=None, object buf=None, shape=None, dtype=None): if ctx is None or buf is None: # allow creation via __new__ (eg. 
in like_empty) self._ld = NULL @@ -242,6 +243,21 @@ cdef class logical_data: return out + @staticmethod + def init_by_shape(context ctx, shape, dtype): + """ + Create a new logical_data from a shape and a dtype + """ + cdef logical_data out = logical_data.__new__(logical_data) + out._ctx = ctx._ctx + out._dtype = np.dtype(dtype) + out._shape = shape + out._ndim = len(shape) + out._len = math.prod(shape) * out._dtype.itemsize + stf_logical_data_empty(ctx._ctx, out._len, &out._ld) + + return out + def borrow_ctx_handle(self): ctx = context(borrowed=True) ctx.borrow_from_handle(self._ctx) @@ -499,6 +515,9 @@ cdef class context: """ return logical_data(self, buf) + def logical_data_by_shape(self, shape, dtype): + return logical_data.init_by_shape(self, shape, dtype) + def task(self, *args): """ Create a `task` diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index f8cc500a026..fa7d49d976b 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -1,5 +1,6 @@ import math from typing import Tuple, Optional +import numpy as np from cuda.cccl.experimental.stf._stf_bindings import ( context, @@ -8,6 +9,11 @@ import torch +def init_field(ctx, ld, value): + with ctx.task(ld.write()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + field = t.get_arg_as_tensor(0) + field[:,:,:] = value + def fdtd_3d_pytorch( size_x: int = 100, size_y: int = 100, @@ -27,28 +33,33 @@ def fdtd_3d_pytorch( # allocate fields shape = (size_x, size_y, size_z) - ex_ = torch.zeros(shape, dtype=dtype, device=device) - ey_ = torch.zeros_like(ex_) - ez_ = torch.zeros_like(ex_) - - hx_ = torch.zeros_like(ex_) - hy_ = torch.zeros_like(ex_) - hz_ = torch.zeros_like(ex_) - - epsilon_ = torch.full(shape, float(epsilon0), dtype=dtype, device=device) - mu_ = torch.full(shape, float(mu0), dtype=dtype, device=device) - - lex = ctx.logical_data(ex_) - ley = ctx.logical_data(ey_) - lez = ctx.logical_data(ez_) - - lhx = ctx.logical_data(hx_) - lhy = ctx.logical_data(hy_) - lhz = ctx.logical_data(hz_) - - lepsilon = ctx.logical_data(epsilon_) - lmu = ctx.logical_data(mu_) - +# ex_ = torch.zeros(shape, dtype=dtype, device=device) + lex = ctx.logical_data_by_shape(shape, np.float64) + ley = ctx.logical_data_by_shape(shape, np.float64) + lez = ctx.logical_data_by_shape(shape, np.float64) + + # epsilon_ = torch.full(shape, float(epsilon0), np.float64=np.float64, device=device) + # mu_ = torch.full(shape, float(mu0), np.float64=np.float64, device=device) + + lhx = ctx.logical_data_by_shape(shape, np.float64) + lhy = ctx.logical_data_by_shape(shape, np.float64) + lhz = ctx.logical_data_by_shape(shape, np.float64) + + # lepsilon = ctx.logical_data() + # lmu = ctx.logical_data(mu_) + lepsilon = ctx.logical_data_by_shape(shape, np.float64) + lmu = ctx.logical_data_by_shape(shape, np.float64) + + # TODO ctx.full(...) 
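    # Sketch for the TODO above: a ctx.full(...)-style helper could combine the
    # two building blocks introduced in this patch (logical_data_by_shape and
    # init_field); the helper name is hypothetical and not part of the bindings.
    #
    #     def full_logical_data(ctx, shape, dtype, value):
    #         ld = ctx.logical_data_by_shape(shape, dtype)
    #         init_field(ctx, ld, value)
    #         return ld
    #
    #     lepsilon = full_logical_data(ctx, shape, np.float64, epsilon0)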
+ init_field(ctx, lex, float(0.0)) + init_field(ctx, ley, float(0.0)) + init_field(ctx, lez, float(0.0)) + init_field(ctx, lhx, float(0.0)) + init_field(ctx, lhy, float(0.0)) + init_field(ctx, lhz, float(0.0)) + init_field(ctx, lepsilon, float(epsilon0)) + init_field(ctx, lmu, float(mu0)) + # CFL (same formula as example) dt = 0.25 * min(dx, dy, dz) * math.sqrt(epsilon0 * mu0) @@ -99,10 +110,10 @@ def source(t: float, x: float, y: float, z: float) -> float: - (hx[i_es, j_es, k_es] - hx[i_es, j_es_m, k_es]) ) - # source at center cell - with ctx.task(lez.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): - ez = t.tensor_arguments() - ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) +  # source at center cell +  with ctx.task(lez.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): +  ez = t.get_arg_as_tensor(0) +  ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) # ------------------------- # update magnetic fields (Hs) @@ -140,5 +151,5 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # quick check - ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=1000, output_freq=5) - print("done; Ez(center) =", ez[50, 50, 50].item()) + ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=20, output_freq=5) + #  print("done; Ez(center) =", ez[50, 50, 50].item()) From 5587a8d256f39324b49aab6317062f85de93b0b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 1 Sep 2025 14:05:04 +0200 Subject: [PATCH 128/221] format issue --- python/cuda_cccl/tests/stf/test_fdtd_pytorch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index fa7d49d976b..2434e60abf1 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -38,8 +38,8 @@ def fdtd_3d_pytorch( ley = ctx.logical_data_by_shape(shape, np.float64) lez = ctx.logical_data_by_shape(shape, np.float64) - # epsilon_ = torch.full(shape, float(epsilon0), np.float64=np.float64, device=device) - # mu_ = torch.full(shape, float(mu0), np.float64=np.float64, device=device) + # epsilon_ = torch.full(shape, float(epsilon0), dtype=np.float64, device=device) + # mu_ = torch.full(shape, float(mu0), dtype=np.float64, device=device) lhx = ctx.logical_data_by_shape(shape, np.float64) lhy = ctx.logical_data_by_shape(shape, np.float64) @@ -59,7 +59,7 @@ def fdtd_3d_pytorch( init_field(ctx, lhz, float(0.0)) init_field(ctx, lepsilon, float(epsilon0)) init_field(ctx, lmu, float(mu0)) - + # CFL (same formula as example) dt = 0.25 * min(dx, dy, dz) * math.sqrt(epsilon0 * mu0) From 5ea524360b8c146bc496034d387154dc75d9baca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 1 Sep 2025 14:08:51 +0200 Subject: [PATCH 129/221] charset issue --- python/cuda_cccl/tests/stf/test_fdtd_pytorch.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 2434e60abf1..2049c8ada49 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -110,10 +110,10 @@ def source(t: float, x: float, y: float, z: float) -> float: - (hx[i_es, j_es, k_es] - hx[i_es, j_es_m, k_es]) ) -  # source at center cell -  with ctx.task(lez.rw()) as t, 
torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): -  ez = t.get_arg_as_tensor(0) -  ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) + # source at center cell + with ctx.task(lez.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + ez = t.get_arg_as_tensor(0) + ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) # ------------------------- # update magnetic fields (Hs) From f7fbd346e412f86a5597a30a6776b99edb1b6dc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 1 Sep 2025 14:30:23 +0200 Subject: [PATCH 130/221] rank agnostic method to init --- python/cuda_cccl/tests/stf/test_fdtd_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 2049c8ada49..bd6ee8a9c30 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -12,7 +12,7 @@ def init_field(ctx, ld, value): with ctx.task(ld.write()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): field = t.get_arg_as_tensor(0) - field[:,:,:] = value + field.fill_(value) def fdtd_3d_pytorch( size_x: int = 100, From aec2d711c2dac0c5e0c34258aa798fd230f6e486 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 1 Sep 2025 14:34:40 +0200 Subject: [PATCH 131/221] use .zero_() to blank fields --- python/cuda_cccl/tests/stf/test_fdtd_pytorch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index bd6ee8a9c30..86712c4754a 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -12,7 +12,10 @@ def init_field(ctx, ld, value): with ctx.task(ld.write()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): field = t.get_arg_as_tensor(0) - field.fill_(value) + if value == 0: + field.zero_() + else: + field.fill_(value) def fdtd_3d_pytorch( size_x: int = 100, From eb71880f77ae294217b13f5e112e2d4d19b44cdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 1 Sep 2025 14:41:54 +0200 Subject: [PATCH 132/221] print values --- python/cuda_cccl/tests/stf/test_fdtd_pytorch.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 86712c4754a..788cb699140 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -144,8 +144,11 @@ def source(t: float, x: float, y: float, z: float) -> float: - (ex[i_hs, j_hs_p, k_hs] - ex[i_hs, j_hs, k_hs]) ) -#  if output_freq > 0 and (n % output_freq) == 0: -#  print(f"{n}\t{ez[cx, cy, cz].item():.6e}") + if output_freq > 0 and (n % output_freq) == 0: + with ctx.task(lez.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + ez = t.get_arg_as_tensor(0) + print(f"{n}\t{ez[cx, cy, cz].item():.6e}") + pass ctx.finalize() @@ -154,5 +157,5 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # quick check - ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=20, output_freq=5) + ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=200, output_freq=5) #  print("done; Ez(center) =", ez[50, 50, 50].item()) From aaf6ec6bba1c7c5ddb8388145772986efe966f56 Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 1 Sep 2025 15:38:07 +0200 Subject: [PATCH 133/221] Experiment to display output as an image --- .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 46 +++++++++++++++++-- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 788cb699140..59f1583cd07 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -1,6 +1,9 @@ import math from typing import Tuple, Optional import numpy as np +import matplotlib.pyplot as plt +from matplotlib.colors import SymLogNorm, LogNorm +from typing import Literal, Optional from cuda.cccl.experimental.stf._stf_bindings import ( context, @@ -9,6 +12,38 @@ import torch +Plane = Literal["xy", "xz", "yz"] + +def show_slice(t3d, plane="xy", index=None): + # grab a 2D view + if plane == "xy": + idx = t3d.shape[2] // 2 if index is None else index + slice2d = t3d[:, :, idx] + elif plane == "xz": + idx = t3d.shape[1] // 2 if index is None else index + slice2d = t3d[:, idx, :] + elif plane == "yz": + idx = t3d.shape[0] // 2 if index is None else index + slice2d = t3d[idx, :, :] + else: + raise ValueError("plane must be 'xy', 'xz' or 'yz'") + + # move to cpu numpy array + arr = slice2d.detach().cpu().numpy() + + # imshow = "imshow" not "imread" + plt.imshow( + arr, + origin="lower", + cmap="seismic", + vmin=-1e-2, vmax=1e-2 +# norm=SymLogNorm(linthresh=1e-8, vmin=-1e-0, vmax=1e-0) +# norm=LogNorm(vmin=1e-12, vmax=1e-6) + ) + # plt.colorbar() + plt.show(block=False) + plt.pause(0.01) + def init_field(ctx, ld, value): with ctx.task(ld.write()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): field = t.get_arg_as_tensor(0) @@ -18,9 +53,9 @@ def init_field(ctx, ld, value): field.fill_(value) def fdtd_3d_pytorch( - size_x: int = 100, - size_y: int = 100, - size_z: int = 100, + size_x: int = 150, + size_y: int = 150, + size_z: int = 150, timesteps: int = 10, output_freq: int = 0, dx: float = 0.01, @@ -75,7 +110,7 @@ def fdtd_3d_pytorch( i_hs_p, j_hs_p, k_hs_p = slice(1, None), slice(1, None), slice(1, None) # source location (single cell at center) - cx, cy, cz = size_x // 2, size_y // 2, size_z // 2 + cx, cy, cz = size_x // 2, size_y // 10, size_z // 2 def source(t: float, x: float, y: float, z: float) -> float: # sin(k*x - omega*t) with f = 1e9 Hz @@ -148,6 +183,7 @@ def source(t: float, x: float, y: float, z: float) -> float: with ctx.task(lez.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): ez = t.get_arg_as_tensor(0) print(f"{n}\t{ez[cx, cy, cz].item():.6e}") + show_slice(ez, plane="xy") pass ctx.finalize() @@ -157,5 +193,5 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # quick check - ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=200, output_freq=5) + ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=1000, output_freq=5) #  print("done; Ez(center) =", ez[50, 50, 50].item()) From ae4c6d6c77930bd3ea9ee6912b09bfbc76ebb6b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 09:21:02 +0200 Subject: [PATCH 134/221] Use non blocking API --- .../cuda/cccl/experimental/stf/_adapters/torch_bridge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py index 
eda137fb577..945597fb360 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py @@ -17,7 +17,7 @@ def cai_to_torch(cai: dict): try: from numba import cuda as _cuda - dev_array = _cuda.from_cuda_array_interface(cai) + dev_array = _cuda.from_cuda_array_interface(cai, owner=None, sync=False) return torch.utils.dlpack.from_dlpack(dev_array.to_dlpack()) except Exception: pass From 9029fda7684f639d60ba323265fa8e191ba78569 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 13:07:29 +0200 Subject: [PATCH 135/221] remove dead code --- c/experimental/stf/CMakeLists.txt | 6 ------ 1 file changed, 6 deletions(-) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index f151e8bf766..11599edaec7 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -36,12 +36,6 @@ target_link_libraries(cccl.c.experimental.stf PRIVATE CUDA::cuda_driver CCCL::cudax ) -# target_compile_definitions(cccl.c.experimental.stf PUBLIC CCCL_C_EXPERIMENTAL=1) -# target_compile_definitions(cccl.c.experimental.stf PRIVATE -# NVRTC_GET_TYPE_NAME=1 -# CUB_DISABLE_CDP=1 -# CUB_DEFINE_RUNTIME_POLICIES -# ) target_compile_options(cccl.c.experimental.stf PRIVATE $<$:--expt-relaxed-constexpr>) target_compile_options(cccl.c.experimental.stf PRIVATE $<$:--extended-lambda>) From ce7a33bdf3ddba5ddd4a6b8698a6832eb08649ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 13:08:18 +0200 Subject: [PATCH 136/221] remove dead code --- c/experimental/stf/src/stf.cu | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index d2abedc66d6..cec00fbca6f 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -273,27 +273,6 @@ void stf_cuda_kernel_start(stf_cuda_kernel_handle k) k->k.start(); } -#if 0 -// -// template -// void configure_raw(Fun func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, int arg_cnt, const void** args) -void stf_cuda_kernel_add_desc(stf_cuda_kernel_handle k, const void *func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, int arg_cnt, const void** args) -{ - /* We convert the function to a CUfunction because this code is a shared - * library which cannot launch kernels using cudaLaunchKernel directly, or we - * will get invalid device function. 
*/ - //CUfunction cufunc; - //cudaGetFuncBySymbol(&cufunc, (void *)func); - CUkernel cukernel; - cudaGetKernel(&cukernel, (void *)func); - - cuda_kernel_desc desc; - desc.configure_raw(cukernel, gridDim_, blockDim_, sharedMem_, arg_cnt, args); - - k->k.add_kernel_desc(mv(desc)); -} -#endif - void stf_cuda_kernel_add_desc_cufunc( stf_cuda_kernel_handle k, CUfunction cufunc, From cbde742266e2544a787d76104356fa5bdc405a1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 13:24:39 +0200 Subject: [PATCH 137/221] minor cleanup --- c/experimental/stf/src/stf.cu | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index cec00fbca6f..0a92d86b677 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -80,18 +80,6 @@ void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_ *to = new stf_logical_data_handle_t{ld_typed}; } -// void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle from, stf_logical_data_handle* to) -// { -// assert(ctx); -// assert(from); -// assert(to); -// -// auto ld_typed = ctx->ctx.logical_data(from->ld.shape()); -// -// // Stored in its untyped version -// *to = new stf_logical_data_handle_t{ld_typed}; -// } - void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) { assert(ctx); @@ -305,4 +293,5 @@ void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t) assert(t); delete t; } -} + +} // extern "C" From c91e814304f9b063acc229d1fd6e8e5cf04a0e33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 13:58:39 +0200 Subject: [PATCH 138/221] clang-format --- .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 111 ++++++++++++------ 1 file changed, 76 insertions(+), 35 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 59f1583cd07..ccac389d6a6 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -1,19 +1,17 @@ import math -from typing import Tuple, Optional -import numpy as np +from typing import Literal, Optional, Tuple + import matplotlib.pyplot as plt -from matplotlib.colors import SymLogNorm, LogNorm -from typing import Literal, Optional +import numpy as np +import torch from cuda.cccl.experimental.stf._stf_bindings import ( context, - rw, ) -import torch - Plane = Literal["xy", "xz", "yz"] + def show_slice(t3d, plane="xy", index=None): # grab a 2D view if plane == "xy": @@ -36,22 +34,28 @@ def show_slice(t3d, plane="xy", index=None): arr, origin="lower", cmap="seismic", - vmin=-1e-2, vmax=1e-2 -# norm=SymLogNorm(linthresh=1e-8, vmin=-1e-0, vmax=1e-0) -# norm=LogNorm(vmin=1e-12, vmax=1e-6) + vmin=-1e-2, + vmax=1e-2, + # norm=SymLogNorm(linthresh=1e-8, vmin=-1e-0, vmax=1e-0) + # norm=LogNorm(vmin=1e-12, vmax=1e-6) ) - # plt.colorbar() + # plt.colorbar() plt.show(block=False) plt.pause(0.01) + def init_field(ctx, ld, value): - with ctx.task(ld.write()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(ld.write()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): field = t.get_arg_as_tensor(0) if value == 0: field.zero_() else: field.fill_(value) + def fdtd_3d_pytorch( size_x: int = 150, size_y: int = 150, @@ -65,30 +69,31 @@ def fdtd_3d_pytorch( mu0: float = 1.256e-6, device: Optional[torch.device] = None, dtype: torch.dtype = torch.float64, 
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - +) -> Tuple[ + torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor +]: ctx = context() # allocate fields shape = (size_x, size_y, size_z) -# ex_ = torch.zeros(shape, dtype=dtype, device=device) + # ex_ = torch.zeros(shape, dtype=dtype, device=device) lex = ctx.logical_data_by_shape(shape, np.float64) ley = ctx.logical_data_by_shape(shape, np.float64) lez = ctx.logical_data_by_shape(shape, np.float64) - # epsilon_ = torch.full(shape, float(epsilon0), dtype=np.float64, device=device) - # mu_ = torch.full(shape, float(mu0), dtype=np.float64, device=device) + # epsilon_ = torch.full(shape, float(epsilon0), dtype=np.float64, device=device) + # mu_ = torch.full(shape, float(mu0), dtype=np.float64, device=device) lhx = ctx.logical_data_by_shape(shape, np.float64) lhy = ctx.logical_data_by_shape(shape, np.float64) lhz = ctx.logical_data_by_shape(shape, np.float64) - # lepsilon = ctx.logical_data() - # lmu = ctx.logical_data(mu_) + # lepsilon = ctx.logical_data() + # lmu = ctx.logical_data(mu_) lepsilon = ctx.logical_data_by_shape(shape, np.float64) lmu = ctx.logical_data_by_shape(shape, np.float64) - # TODO ctx.full(...) + # TODO ctx.full(...) init_field(ctx, lex, float(0.0)) init_field(ctx, ley, float(0.0)) init_field(ctx, lez, float(0.0)) @@ -125,62 +130,98 @@ def source(t: float, x: float, y: float, z: float) -> float: # ------------------------- # update electric fields (Es) # Ex(i,j,k) += (dt/(ε*dx)) * [(Hz(i,j,k)-Hz(i,j-1,k)) - (Hy(i,j,k)-Hy(i,j,k-1))] - with ctx.task(lex.rw(), lhy.read(), lhz.read(), lepsilon.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(lex.rw(), lhy.read(), lhz.read(), lepsilon.read()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): ex, hy, hz, epsilon = t.tensor_arguments() - ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dx)) * ( + ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + ( + dt / (epsilon[i_es, j_es, k_es] * dx) + ) * ( (hz[i_es, j_es, k_es] - hz[i_es, j_es_m, k_es]) - (hy[i_es, j_es, k_es] - hy[i_es, j_es, k_es_m]) ) # Ey(i,j,k) += (dt/(ε*dy)) * [(Hx(i,j,k)-Hx(i,j,k-1)) - (Hz(i,j,k)-Hz(i-1,j,k))] - with ctx.task(ley.rw(), lhx.read(), lhz.read(), lepsilon.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(ley.rw(), lhx.read(), lhz.read(), lepsilon.read()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): ey, hx, hz, epsilon = t.tensor_arguments() - ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dy)) * ( + ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + ( + dt / (epsilon[i_es, j_es, k_es] * dy) + ) * ( (hx[i_es, j_es, k_es] - hx[i_es, j_es, k_es_m]) - (hz[i_es, j_es, k_es] - hz[i_es_m, j_es, k_es]) ) # Ez(i,j,k) += (dt/(ε*dz)) * [(Hy(i,j,k)-Hy(i-1,j,k)) - (Hx(i,j,k)-Hx(i,j-1,k))] - with ctx.task(lez.rw(), lhx.read(), lhy.read(), lepsilon.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(lez.rw(), lhx.read(), lhy.read(), lepsilon.read()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): ez, hx, hy, epsilon = t.tensor_arguments() - ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dz)) * ( + ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + ( + dt / (epsilon[i_es, j_es, k_es] * dz) + ) * ( (hy[i_es, j_es, k_es] - hy[i_es_m, j_es, 
k_es]) - (hx[i_es, j_es, k_es] - hx[i_es, j_es_m, k_es]) ) # source at center cell - with ctx.task(lez.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(lez.rw()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): ez = t.get_arg_as_tensor(0) ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) # ------------------------- # update magnetic fields (Hs) # Hx(i,j,k) -= (dt/(μ*dy)) * [(Ez(i,j+1,k)-Ez(i,j,k)) - (Ey(i,j,k+1)-Ey(i,j,k))] - with ctx.task(lhx.rw(), ley.read(), lez.read(), lmu.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(lhx.rw(), ley.read(), lez.read(), lmu.read()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): hx, ey, ez, mu = t.tensor_arguments() - hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dy)) * ( + hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - ( + dt / (mu[i_hs, j_hs, k_hs] * dy) + ) * ( (ez[i_hs, j_hs_p, k_hs] - ez[i_hs, j_hs, k_hs]) - (ey[i_hs, j_hs, k_hs_p] - ey[i_hs, j_hs, k_hs]) ) # Hy(i,j,k) -= (dt/(μ*dz)) * [(Ex(i,j,k+1)-Ex(i,j,k)) - (Ez(i+1,j,k)-Ez(i,j,k))] - with ctx.task(lhy.rw(), lex.read(), lez.read(), lmu.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(lhy.rw(), lex.read(), lez.read(), lmu.read()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): hy, ex, ez, mu = t.tensor_arguments() - hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dz)) * ( + hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - ( + dt / (mu[i_hs, j_hs, k_hs] * dz) + ) * ( (ex[i_hs, j_hs, k_hs_p] - ex[i_hs, j_hs, k_hs]) - (ez[i_hs_p, j_hs, k_hs] - ez[i_hs, j_hs, k_hs]) ) # Hz(i,j,k) -= (dt/(μ*dx)) * [(Ey(i+1,j,k)-Ey(i,j,k)) - (Ex(i,j+1,k)-Ex(i,j,k))] - with ctx.task(lhz.rw(), lex.read(), ley.read(), lmu.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(lhz.rw(), lex.read(), ley.read(), lmu.read()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): hz, ex, ey, mu = t.tensor_arguments() - hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dx)) * ( + hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - ( + dt / (mu[i_hs, j_hs, k_hs] * dx) + ) * ( (ey[i_hs_p, j_hs, k_hs] - ey[i_hs, j_hs, k_hs]) - (ex[i_hs, j_hs_p, k_hs] - ex[i_hs, j_hs, k_hs]) ) if output_freq > 0 and (n % output_freq) == 0: - with ctx.task(lez.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(lez.read()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): ez = t.get_arg_as_tensor(0) print(f"{n}\t{ez[cx, cy, cz].item():.6e}") show_slice(ez, plane="xy") @@ -194,4 +235,4 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # quick check ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=1000, output_freq=5) - #  print("done; Ez(center) =", ez[50, 50, 50].item()) +# print("done; Ez(center) =", ez[50, 50, 50].item()) From 3fe6178b90d39853cd26cfd910aa54fe8de2eccc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 15:12:06 +0200 Subject: [PATCH 139/221] Add a C library for CUDASTF (to be used in the python bindings) --- c/CMakeLists.txt | 1 + c/experimental/stf/CMakeLists.txt | 52 +++ .../stf/include/cccl/c/experimental/stf/stf.h | 224 +++++++++++++ c/experimental/stf/src/stf.cu | 297 ++++++++++++++++++ 
c/experimental/stf/test/CMakeLists.txt | 39 +++ c/experimental/stf/test/test_ctx.cpp | 21 ++ c/experimental/stf/test/test_cuda_kernel.cu | 90 ++++++ c/experimental/stf/test/test_logical_data.cpp | 39 +++ c/experimental/stf/test/test_places.cpp | 81 +++++ c/experimental/stf/test/test_task.cpp | 78 +++++ c/experimental/stf/test/test_token.cpp | 78 +++++ 11 files changed, 1000 insertions(+) create mode 100644 c/experimental/stf/CMakeLists.txt create mode 100644 c/experimental/stf/include/cccl/c/experimental/stf/stf.h create mode 100644 c/experimental/stf/src/stf.cu create mode 100644 c/experimental/stf/test/CMakeLists.txt create mode 100644 c/experimental/stf/test/test_ctx.cpp create mode 100644 c/experimental/stf/test/test_cuda_kernel.cu create mode 100644 c/experimental/stf/test/test_logical_data.cpp create mode 100644 c/experimental/stf/test/test_places.cpp create mode 100644 c/experimental/stf/test/test_task.cpp create mode 100644 c/experimental/stf/test/test_token.cpp diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index 7f1dbf4507b..364494da7a0 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -1 +1,2 @@ add_subdirectory(parallel) +add_subdirectory(experimental/stf/) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt new file mode 100644 index 00000000000..11599edaec7 --- /dev/null +++ b/c/experimental/stf/CMakeLists.txt @@ -0,0 +1,52 @@ +cmake_minimum_required(VERSION 3.21) + +project(CCCL_C_EXPERIMENTAL_STF LANGUAGES CUDA CXX C) + +option(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING "Build cccl.experimental.c.stf tests." OFF) + +# FIXME Ideally this would be handled by presets and install rules, but for now +# consumers may override this to control the target location of cccl.c.experimental.stf. +set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY "" CACHE PATH "Override output directory for the cccl.c.experimental.stf library") +mark_as_advanced(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY) + +file(GLOB_RECURSE srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + "src/*.cu" "src/*.cuh" +) + +add_library(cccl.c.experimental.stf SHARED ${srcs}) +set_property(TARGET cccl.c.experimental.stf PROPERTY POSITION_INDEPENDENT_CODE ON) +cccl_configure_target(cccl.c.experimental.stf DIALECT 17) + +# Override the properties set by cccl_configure_target: +if (CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY) + set_target_properties(cccl.c.experimental.stf PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY}" + ARCHIVE_OUTPUT_DIRECTORY "${CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY}" + ) +endif() + +find_package(CUDAToolkit REQUIRED) +set_target_properties(cccl.c.experimental.stf PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) +target_link_libraries(cccl.c.experimental.stf PRIVATE + CUDA::cudart_static + CUDA::nvrtc + CUDA::nvJitLink + CUDA::cuda_driver + CCCL::cudax +) + +target_compile_options(cccl.c.experimental.stf PRIVATE $<$:--expt-relaxed-constexpr>) +target_compile_options(cccl.c.experimental.stf PRIVATE $<$:--extended-lambda>) + +target_include_directories(cccl.c.experimental.stf PUBLIC "include") +target_include_directories(cccl.c.experimental.stf PRIVATE "src") + +if (CCCL_C_Parallel_ENABLE_TESTING) + add_subdirectory(test) +endif() + +# if (CCCL_C_Parallel_ENABLE_HEADER_TESTING) +# include(cmake/CParallelHeaderTesting.cmake) +# endif() diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h new file mode 100644 index 
00000000000..6f2f903e6c8 --- /dev/null +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -0,0 +1,224 @@ +#include +#include +#include + +// TODO use CCCL_C_EXTERN_C_BEGIN/CCCL_C_EXTERN_C_END +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum stf_access_mode +{ + STF_NONE = 0, + STF_READ = 1 << 0, + STF_WRITE = 1 << 1, + STF_RW = STF_READ | STF_WRITE +} stf_access_mode; + +struct stf_exec_place_device +{ + int dev_id; +}; + +struct stf_exec_place_host +{ + char dummy; /* dummy to keep it standard C which does not allow empty structs */ +}; + +typedef enum stf_exec_place_kind +{ + STF_EXEC_PLACE_DEVICE, + STF_EXEC_PLACE_HOST +} stf_exec_place_kind; + +struct stf_exec_place +{ + enum stf_exec_place_kind kind; + union + { + struct stf_exec_place_device device; + struct stf_exec_place_host host; + } u; +}; + +static inline struct stf_exec_place make_device_place(int dev_id) +{ + struct stf_exec_place p; + p.kind = STF_EXEC_PLACE_DEVICE; + p.u.device.dev_id = dev_id; + return p; +} + +static inline struct stf_exec_place make_host_place() +{ + struct stf_exec_place p; + p.kind = STF_EXEC_PLACE_HOST; + p.u.host.dummy = 0; /* to avoid uninitialized memory warnings */ + return p; +} + +typedef struct stf_exec_place_device stf_exec_place_device; +typedef struct stf_exec_place_host stf_exec_place_host; +typedef union stf_exec_place_u stf_exec_place_u; +typedef struct stf_exec_place stf_exec_place; + +struct stf_data_place_device +{ + int dev_id; +}; + +struct stf_data_place_host +{ + char dummy; /* dummy to keep it standard C which does not allow empty structs */ +}; + +struct stf_data_place_managed +{ + char dummy; /* dummy to keep it standard C which does not allow empty structs */ +}; + +struct stf_data_place_affine +{ + char dummy; /* dummy to keep it standard C which does not allow empty structs */ +}; + +typedef enum stf_data_place_kind +{ + STF_DATA_PLACE_DEVICE, + STF_DATA_PLACE_HOST, + STF_DATA_PLACE_MANAGED, + STF_DATA_PLACE_AFFINE +} stf_data_place_kind; + +struct stf_data_place +{ + enum stf_data_place_kind kind; + union + { + struct stf_data_place_device device; + struct stf_data_place_host host; + struct stf_data_place_managed managed; + struct stf_data_place_affine affine; + } u; +}; + +static inline struct stf_data_place make_device_data_place(int dev_id) +{ + struct stf_data_place p; + p.kind = STF_DATA_PLACE_DEVICE; + p.u.device.dev_id = dev_id; + return p; +} + +static inline struct stf_data_place make_host_data_place() +{ + struct stf_data_place p; + p.kind = STF_DATA_PLACE_HOST; + p.u.host.dummy = 0; /* to avoid uninitialized memory warnings */ + return p; +} + +static inline struct stf_data_place make_managed_data_place() +{ + struct stf_data_place p; + p.kind = STF_DATA_PLACE_MANAGED; + p.u.managed.dummy = 0; /* to avoid uninitialized memory warnings */ + return p; +} + +static inline struct stf_data_place make_affine_data_place() +{ + struct stf_data_place p; + p.kind = STF_DATA_PLACE_AFFINE; + p.u.affine.dummy = 0; /* to avoid uninitialized memory warnings */ + return p; +} + +typedef struct stf_data_place_device stf_data_place_device; +typedef struct stf_data_place_host stf_data_place_host; +typedef struct stf_data_place_managed stf_data_place_managed; +typedef struct stf_data_place_affine stf_data_place_affine; +typedef union stf_data_place_u stf_data_place_u; +typedef struct stf_data_place stf_data_place; + +typedef struct stf_ctx_handle_t* stf_ctx_handle; + +void stf_ctx_create(stf_ctx_handle* ctx); +// TODO stf_ctx_create_with_flags and an enum 
instead ? +void stf_ctx_create_graph(stf_ctx_handle* ctx); +void stf_ctx_finalize(stf_ctx_handle ctx); + +// TODO stf_ctx_set_mode() + define enum with GRAPH, STREAM, ... +// TODO stf_ctx_is_graph() + +cudaStream_t stf_fence(stf_ctx_handle ctx); + +typedef struct stf_logical_data_handle_t* stf_logical_data_handle; + +void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); +void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); +void stf_logical_data_destroy(stf_logical_data_handle ld); +void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle* to); + +// TODO +// void stf_logical_data_wait(stf_logical_data_handle ld); + +void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld); + +typedef struct stf_task_handle_t* stf_task_handle; + +void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t); +void stf_task_set_exec_place(stf_task_handle t, struct stf_exec_place* exec_p); +void stf_task_set_symbol(stf_task_handle t, const char* symbol); +void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m); +void stf_task_add_dep_with_dplace( + stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, struct stf_data_place* data_p); +void stf_task_start(stf_task_handle t); +void stf_task_end(stf_task_handle t); +CUstream stf_task_get_custream(stf_task_handle t); +void* stf_task_get(stf_task_handle t, int submitted_index); +void stf_task_destroy(stf_task_handle t); +void stf_task_enable_capture(stf_task_handle t); + +typedef struct stf_cuda_kernel_handle_t* stf_cuda_kernel_handle; + +void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k); +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, struct stf_exec_place* exec_p); +void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol); +void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); +void stf_cuda_kernel_start(stf_cuda_kernel_handle k); + +void stf_cuda_kernel_add_desc_cufunc( + stf_cuda_kernel_handle k, + CUfunction cufunc, + dim3 gridDim_, + dim3 blockDim_, + size_t sharedMem_, + int arg_cnt, + const void** args); + +/* Convert CUDA kernel address to CUfunction because we may use them from a + * shared library where this would be invalid in the runtime API. */ +static inline void stf_cuda_kernel_add_desc( + stf_cuda_kernel_handle k, + const void* func, + dim3 gridDim_, + dim3 blockDim_, + size_t sharedMem_, + int arg_cnt, + const void** args) +{ + CUfunction cufunc; + [[maybe_unused]] cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); + assert(res == cudaSuccess); + + stf_cuda_kernel_add_desc_cufunc(k, cufunc, gridDim_, blockDim_, sharedMem_, arg_cnt, args); +} + +void* stf_cuda_kernel_get_arg(stf_cuda_kernel_handle k, int index); +void stf_cuda_kernel_end(stf_cuda_kernel_handle k); +void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t); + +#ifdef __cplusplus +} +#endif diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu new file mode 100644 index 00000000000..0a92d86b677 --- /dev/null +++ b/c/experimental/stf/src/stf.cu @@ -0,0 +1,297 @@ +#include +// #include +#include + +using namespace cuda::experimental::stf; + +extern "C" { + +struct stf_ctx_handle_t +{ + context ctx; +}; + +struct stf_logical_data_handle_t +{ + // XXX should we always store a logical_data> instead ? 
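  // Sketch of the trade-off raised above, assuming the slice<char> view used
  // elsewhere in this file: with the untyped handle, accessors re-type the
  // data at each use site, e.g. stf_task_get() below does
  //   auto s = t->t.get<slice<char>>(index);
  // whereas storing a logical_data<slice<char>> here would fix the element
  // type once at creation, at the cost of generality.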
+ logical_data_untyped ld; +}; + +struct stf_task_handle_t +{ + context::unified_task<> t; +}; + +void stf_ctx_create(stf_ctx_handle* ctx) +{ + assert(ctx); + *ctx = new stf_ctx_handle_t{context{}}; +} + +void stf_ctx_create_graph(stf_ctx_handle* ctx) +{ + assert(ctx); + *ctx = new stf_ctx_handle_t{context{graph_ctx()}}; +} + +void stf_ctx_finalize(stf_ctx_handle ctx) +{ + ctx->ctx.finalize(); + assert(ctx); + delete ctx; +} + +cudaStream_t stf_fence(stf_ctx_handle ctx) +{ + assert(ctx); + return ctx->ctx.fence(); +} + +void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) +{ + assert(ld); + assert(ctx); + + // Create a slice logical data + auto ld_typed = ctx->ctx.logical_data(make_slice((char*) addr, sz)); + + // Stored in its untyped version + *ld = new stf_logical_data_handle_t{ld_typed}; +} + +void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) +{ + assert(ld); + ld->ld.set_symbol(symbol); +} + +void stf_logical_data_destroy(stf_logical_data_handle ld) +{ + assert(ld); + delete ld; +} + +void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle* to) +{ + assert(ctx); + assert(to); + + auto ld_typed = ctx->ctx.logical_data(shape_of>(length)); + *to = new stf_logical_data_handle_t{ld_typed}; +} + +void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) +{ + assert(ctx); + assert(ld); + + *ld = new stf_logical_data_handle_t{ctx->ctx.token()}; +} + +/* Convert the C-API stf_exec_place to a C++ exec_place object */ +exec_place to_exec_place(struct stf_exec_place* exec_p) +{ + if (exec_p->kind == STF_EXEC_PLACE_HOST) + { + return exec_place::host(); + } + + assert(exec_p->kind == STF_EXEC_PLACE_DEVICE); + return exec_place::device(exec_p->u.device.dev_id); +} + +/* Convert the C-API stf_data_place to a C++ data_place object */ +data_place to_data_place(struct stf_data_place* data_p) +{ + assert(data_p); + + if (data_p->kind == STF_DATA_PLACE_HOST) + { + return data_place::host(); + } + + if (data_p->kind == STF_DATA_PLACE_MANAGED) + { + return data_place::managed(); + } + + if (data_p->kind == STF_DATA_PLACE_AFFINE) + { + return data_place::affine(); + } + + assert(data_p->kind == STF_DATA_PLACE_DEVICE); + return data_place::device(data_p->u.device.dev_id); +} + +void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) +{ + assert(t); + assert(ctx); + + *t = new stf_task_handle_t{ctx->ctx.task()}; +} + +void stf_task_set_exec_place(stf_task_handle t, struct stf_exec_place* exec_p) +{ + assert(t); + t->t.set_exec_place(to_exec_place(exec_p)); +} + +void stf_task_set_symbol(stf_task_handle t, const char* symbol) +{ + assert(t); + assert(symbol); + + t->t.set_symbol(symbol); +} + +void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m) +{ + assert(t); + assert(ld); + + t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m))); +} + +void stf_task_add_dep_with_dplace( + stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, struct stf_data_place* data_p) +{ + assert(t); + assert(ld); + assert(data_p); + + t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m), to_data_place(data_p))); +} + +void* stf_task_get(stf_task_handle t, int index) +{ + assert(t); + auto s = t->t.template get>(index); + return (void*) s.data_handle(); +} + +void stf_task_start(stf_task_handle t) +{ + assert(t); + t->t.start(); +} + +void stf_task_end(stf_task_handle t) +{ + assert(t); + t->t.end(); +} + +void stf_task_enable_capture(stf_task_handle t) +{ + assert(t); + 
t->t.enable_capture(); +} + +CUstream stf_task_get_custream(stf_task_handle t) +{ + assert(t); + return (CUstream) t->t.get_stream(); +} + +void stf_task_destroy(stf_task_handle t) +{ + assert(t); + delete t; +} + +/** + * Low level example of cuda_kernel(_chain) + * auto t = ctx.cuda_kernel_chain(); + t.add_deps(lX.read()); + t.add_deps(lY.rw()); + t->*[&]() { + auto dX = t.template get>(0); + auto dY = t.template get>(1); + return std::vector { + { axpy, 16, 128, 0, alpha, dX, dY }, + { axpy, 16, 128, 0, beta, dX, dY }, + { axpy, 16, 128, 0, gamma, dX, dY } + }; + }; + + * + */ +struct stf_cuda_kernel_handle_t +{ + // return type of ctx.cuda_kernel() + using kernel_type = decltype(::std::declval().cuda_kernel()); + kernel_type k; +}; + +void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k) +{ + assert(k); + assert(ctx); + + *k = new stf_cuda_kernel_handle_t{ctx->ctx.cuda_kernel()}; +} + +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, struct stf_exec_place* exec_p) +{ + assert(k); + k->k.set_exec_place(to_exec_place(exec_p)); +} + +void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol) +{ + assert(k); + assert(symbol); + + k->k.set_symbol(symbol); +} + +void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m) +{ + assert(k); + assert(ld); + + k->k.add_deps(task_dep_untyped(ld->ld, access_mode(m))); +} + +void stf_cuda_kernel_start(stf_cuda_kernel_handle k) +{ + assert(k); + k->k.start(); +} + +void stf_cuda_kernel_add_desc_cufunc( + stf_cuda_kernel_handle k, + CUfunction cufunc, + dim3 gridDim_, + dim3 blockDim_, + size_t sharedMem_, + int arg_cnt, + const void** args) +{ + cuda_kernel_desc desc; + desc.configure_raw(cufunc, gridDim_, blockDim_, sharedMem_, arg_cnt, args); + + k->k.add_kernel_desc(mv(desc)); +} + +void* stf_cuda_kernel_get_arg(stf_cuda_kernel_handle k, int index) +{ + auto s = k->k.template get>(index); + return (void*) s.data_handle(); +} + +void stf_cuda_kernel_end(stf_cuda_kernel_handle k) +{ + assert(k); + k->k.end(); +} + +void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t) +{ + assert(t); + delete t; +} + +} // extern "C" diff --git a/c/experimental/stf/test/CMakeLists.txt b/c/experimental/stf/test/CMakeLists.txt new file mode 100644 index 00000000000..f5613253a81 --- /dev/null +++ b/c/experimental/stf/test/CMakeLists.txt @@ -0,0 +1,39 @@ +cccl_get_c2h() + +function(cccl_c_experimental_stf_add_test target_name_var source) + string(REGEX REPLACE "test_([^.]*)" "cccl.c.experimental.stf.test.\\1" target_name "${source}") + set(target_name_var ${target_name} PARENT_SCOPE) + + add_executable(${target_name} "${source}") + cccl_configure_target(${target_name} DIALECT 20) + + set_target_properties(${target_name} PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) + target_link_libraries(${target_name} PRIVATE + cccl.c.experimental.stf + CUDA::cudart_static + CUDA::nvrtc + cccl.c2h.main + cccl.compiler_interface_cpp20 + CUDA::cuda_driver + CCCL::cudax + ) + + target_compile_definitions(${target_name} PRIVATE + TEST_CUB_PATH="-I${CCCL_SOURCE_DIR}/cub" + TEST_THRUST_PATH="-I${CCCL_SOURCE_DIR}/thrust" + TEST_LIBCUDACXX_PATH="-I${CCCL_SOURCE_DIR}/libcudacxx/include" + TEST_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}" + ) + + add_test(NAME ${target_name} COMMAND ${target_name}) +endfunction() + +file(GLOB test_srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + *.cu *.cpp +) + +foreach(test_src IN LISTS test_srcs) + cccl_c_experimental_stf_add_test(test_target "${test_src}") 
+endforeach() diff --git a/c/experimental/stf/test/test_ctx.cpp b/c/experimental/stf/test/test_ctx.cpp new file mode 100644 index 00000000000..86225ad91c7 --- /dev/null +++ b/c/experimental/stf/test/test_ctx.cpp @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +C2H_TEST("basic stf context", "[context]") +{ + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + stf_ctx_finalize(ctx); +} diff --git a/c/experimental/stf/test/test_cuda_kernel.cu b/c/experimental/stf/test/test_cuda_kernel.cu new file mode 100644 index 00000000000..b5ba66b0f3a --- /dev/null +++ b/c/experimental/stf/test/test_cuda_kernel.cu @@ -0,0 +1,90 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +__global__ void axpy(int cnt, double a, const double* x, double* y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < cnt; i += nthreads) + { + y[i] += a * x[i]; + } +} + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lX, lY; + + double *X, *Y; + X = (double*) malloc(N * sizeof(double)); + Y = (double*) malloc(N * sizeof(double)); + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + + const double alpha = 3.14; + + stf_logical_data(ctx, &lX, X, N * sizeof(double)); + stf_logical_data(ctx, &lY, Y, N * sizeof(double)); + + stf_logical_data_set_symbol(lX, "X"); + stf_logical_data_set_symbol(lY, "Y"); + + stf_cuda_kernel_handle k; + stf_cuda_kernel_create(ctx, &k); + stf_cuda_kernel_set_symbol(k, "axpy"); + stf_cuda_kernel_add_dep(k, lX, STF_READ); + stf_cuda_kernel_add_dep(k, lY, STF_RW); + stf_cuda_kernel_start(k); + double* dX = (double*) stf_cuda_kernel_get_arg(k, 0); + double* dY = (double*) stf_cuda_kernel_get_arg(k, 1); + const void* args[4] = {&N, &alpha, &dX, &dY}; + stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); + stf_cuda_kernel_end(k); + stf_cuda_kernel_destroy(k); + + stf_logical_data_destroy(lX); + stf_logical_data_destroy(lY); + + stf_ctx_finalize(ctx); + + for (size_t i = 0; i < N; i++) + { + assert(fabs(Y[i] - (Y0(i) + alpha * X0(i))) < 0.0001); + assert(fabs(X[i] - X0(i)) < 0.0001); + } + + free(X); + free(Y); +} diff --git a/c/experimental/stf/test/test_logical_data.cpp b/c/experimental/stf/test/test_logical_data.cpp new file mode 100644 index 00000000000..168ca8dabbc --- /dev/null +++ b/c/experimental/stf/test/test_logical_data.cpp @@ -0,0 +1,39 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +C2H_TEST("basic stf logical_data", "[logical_data]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lA, lB; + + float *A, *B; + A = (float*) malloc(N * sizeof(float)); + B = (float*) malloc(N * sizeof(float)); + + stf_logical_data(ctx, &lA, A, N * sizeof(float)); + stf_logical_data(ctx, &lB, B, N * sizeof(float)); + + stf_logical_data_destroy(lA); + stf_logical_data_destroy(lB); + + stf_ctx_finalize(ctx); + + free(A); + free(B); +} diff --git a/c/experimental/stf/test/test_places.cpp b/c/experimental/stf/test/test_places.cpp new file mode 100644 index 00000000000..eeba229c758 --- /dev/null +++ b/c/experimental/stf/test/test_places.cpp @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +C2H_TEST("empty stf tasks", "[task]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lX, lY, lZ; + + float *X, *Y, *Z; + X = (float*) malloc(N * sizeof(float)); + Y = (float*) malloc(N * sizeof(float)); + Z = (float*) malloc(N * sizeof(float)); + + stf_logical_data(ctx, &lX, X, N * sizeof(float)); + stf_logical_data(ctx, &lY, Y, N * sizeof(float)); + stf_logical_data(ctx, &lZ, Z, N * sizeof(float)); + + stf_logical_data_set_symbol(lX, "X"); + stf_logical_data_set_symbol(lY, "Y"); + stf_logical_data_set_symbol(lZ, "Z"); + + stf_task_handle t1; + stf_task_create(ctx, &t1); + stf_task_set_symbol(t1, "T1"); + stf_task_add_dep(t1, lX, STF_RW); + stf_task_start(t1); + stf_task_end(t1); + + stf_task_handle t2; + stf_task_create(ctx, &t2); + stf_task_set_symbol(t2, "T2"); + stf_task_add_dep(t2, lX, STF_READ); + stf_task_add_dep(t2, lY, STF_RW); + stf_task_start(t2); + stf_task_end(t2); + + stf_task_handle t3; + stf_task_create(ctx, &t3); + stf_task_set_symbol(t3, "T3"); + auto e_place_dev0 = make_device_place(0); + stf_task_set_exec_place(t3, &e_place_dev0); + stf_task_add_dep(t3, lX, STF_READ); + stf_task_add_dep(t3, lZ, STF_RW); + stf_task_start(t3); + stf_task_end(t3); + + stf_task_handle t4; + stf_task_create(ctx, &t4); + stf_task_set_symbol(t4, "T4"); + stf_task_add_dep(t4, lY, STF_READ); + auto d_place_dev0 = make_device_data_place(0); + stf_task_add_dep_with_dplace(t4, lZ, STF_RW, &d_place_dev0); + stf_task_start(t4); + stf_task_end(t4); + + stf_logical_data_destroy(lX); + stf_logical_data_destroy(lY); + stf_logical_data_destroy(lZ); + + stf_ctx_finalize(ctx); + + free(X); + free(Y); + free(Z); +} diff --git a/c/experimental/stf/test/test_task.cpp b/c/experimental/stf/test/test_task.cpp new file mode 100644 
index 00000000000..80266f6b381 --- /dev/null +++ b/c/experimental/stf/test/test_task.cpp @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +C2H_TEST("empty stf tasks", "[task]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lX, lY, lZ; + + float *X, *Y, *Z; + X = (float*) malloc(N * sizeof(float)); + Y = (float*) malloc(N * sizeof(float)); + Z = (float*) malloc(N * sizeof(float)); + + stf_logical_data(ctx, &lX, X, N * sizeof(float)); + stf_logical_data(ctx, &lY, Y, N * sizeof(float)); + stf_logical_data(ctx, &lZ, Z, N * sizeof(float)); + + stf_logical_data_set_symbol(lX, "X"); + stf_logical_data_set_symbol(lY, "Y"); + stf_logical_data_set_symbol(lZ, "Z"); + + stf_task_handle t1; + stf_task_create(ctx, &t1); + stf_task_set_symbol(t1, "T1"); + stf_task_add_dep(t1, lX, STF_RW); + stf_task_start(t1); + stf_task_end(t1); + + stf_task_handle t2; + stf_task_create(ctx, &t2); + stf_task_set_symbol(t2, "T2"); + stf_task_add_dep(t2, lX, STF_READ); + stf_task_add_dep(t2, lY, STF_RW); + stf_task_start(t2); + stf_task_end(t2); + + stf_task_handle t3; + stf_task_create(ctx, &t3); + stf_task_set_symbol(t3, "T3"); + stf_task_add_dep(t3, lX, STF_READ); + stf_task_add_dep(t3, lZ, STF_RW); + stf_task_start(t3); + stf_task_end(t3); + + stf_task_handle t4; + stf_task_create(ctx, &t4); + stf_task_set_symbol(t4, "T4"); + stf_task_add_dep(t4, lY, STF_READ); + stf_task_add_dep(t4, lZ, STF_RW); + stf_task_start(t4); + stf_task_end(t4); + + stf_logical_data_destroy(lX); + stf_logical_data_destroy(lY); + stf_logical_data_destroy(lZ); + + stf_ctx_finalize(ctx); + + free(X); + free(Y); + free(Z); +} diff --git a/c/experimental/stf/test/test_token.cpp b/c/experimental/stf/test/test_token.cpp new file mode 100644 index 00000000000..ccd7f0a9e2c --- /dev/null +++ b/c/experimental/stf/test/test_token.cpp @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +C2H_TEST("stf token", "[token]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lX, lY, lZ; + + float *X, *Y, *Z; + X = (float*) malloc(N * sizeof(float)); + Y = (float*) malloc(N * sizeof(float)); + Z = (float*) malloc(N * sizeof(float)); + + stf_token(ctx, &lX); + stf_token(ctx, &lY); + stf_token(ctx, &lZ); + + stf_logical_data_set_symbol(lX, "X"); + stf_logical_data_set_symbol(lY, "Y"); + stf_logical_data_set_symbol(lZ, "Z"); + + stf_task_handle t1; + stf_task_create(ctx, &t1); + stf_task_set_symbol(t1, "T1"); + stf_task_add_dep(t1, lX, STF_RW); + stf_task_start(t1); + stf_task_end(t1); + + stf_task_handle t2; + stf_task_create(ctx, &t2); + stf_task_set_symbol(t2, "T2"); + stf_task_add_dep(t2, lX, STF_READ); + stf_task_add_dep(t2, lY, STF_RW); + stf_task_start(t2); + stf_task_end(t2); + + stf_task_handle t3; + stf_task_create(ctx, &t3); + stf_task_set_symbol(t3, "T3"); + stf_task_add_dep(t3, lX, STF_READ); + stf_task_add_dep(t3, lZ, STF_RW); + stf_task_start(t3); + stf_task_end(t3); + + stf_task_handle t4; + stf_task_create(ctx, &t4); + stf_task_set_symbol(t4, "T4"); + stf_task_add_dep(t4, lY, STF_READ); + stf_task_add_dep(t4, lZ, STF_RW); + stf_task_start(t4); + stf_task_end(t4); + + stf_logical_data_destroy(lX); + stf_logical_data_destroy(lY); + stf_logical_data_destroy(lZ); + + stf_ctx_finalize(ctx); + + free(X); + free(Y); + free(Z); +} From 522b630d7e652060970cc9efce3bbe22145985ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 15:25:39 +0200 Subject: [PATCH 140/221] remove dead code --- c/experimental/stf/CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 11599edaec7..40f3fafffd7 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -46,7 +46,3 @@ target_include_directories(cccl.c.experimental.stf PRIVATE "src") if (CCCL_C_Parallel_ENABLE_TESTING) add_subdirectory(test) endif() - -# if (CCCL_C_Parallel_ENABLE_HEADER_TESTING) -# include(cmake/CParallelHeaderTesting.cmake) -# endif() From 43153141a20a1be102cc0c258a3566930fe229da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 15:29:28 +0200 Subject: [PATCH 141/221] do define and use CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING --- CMakePresets.json | 15 +++++++++++++-- c/experimental/stf/CMakeLists.txt | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index b39ab345fc9..63f06ffa281 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -87,7 +87,8 @@ "cudax_ENABLE_DIALECT_CPP17": true, "cudax_ENABLE_DIALECT_CPP20": true, "CCCL_C_Parallel_ENABLE_TESTING": true, - "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true + "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true, + "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true } }, { @@ -270,7 +271,17 @@ "cacheVariables": { "CCCL_ENABLE_C": true, "CCCL_C_Parallel_ENABLE_TESTING": true, - "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true + "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true, + "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true + } + }, + { + "name": "cccl-c-stf", + "displayName": "CCCL C CUDASTF Library", + "inherits": "base", + "cacheVariables": { + "CCCL_ENABLE_C": true, + "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true } }, { diff --git 
a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 40f3fafffd7..58e78427727 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -43,6 +43,6 @@ target_compile_options(cccl.c.experimental.stf PRIVATE $<$ Date: Tue, 2 Sep 2025 15:36:47 +0200 Subject: [PATCH 142/221] Add CUDASTF C lib to tests --- ci/build_cccl_c_stf.sh | 15 +++++++++++++++ ci/matrix.yaml | 5 +++++ ci/test_cccl_c_stf.sh | 13 +++++++++++++ 3 files changed, 33 insertions(+) create mode 100755 ci/build_cccl_c_stf.sh create mode 100755 ci/test_cccl_c_stf.sh diff --git a/ci/build_cccl_c_stf.sh b/ci/build_cccl_c_stf.sh new file mode 100755 index 00000000000..9fcc8559287 --- /dev/null +++ b/ci/build_cccl_c_stf.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -euo pipefail + +source "$(dirname "${BASH_SOURCE[0]}")/build_common.sh" + +print_environment_details + +PRESET="cccl-c-stf" + +CMAKE_OPTIONS="" + +configure_and_build_preset "CCCL C CUDASTF Library" "$PRESET" "$CMAKE_OPTIONS" + +print_time_summary diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 3f003633eda..14529c9fcfd 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -54,6 +54,7 @@ workflows: - {jobs: ['test'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'} # Python and c/parallel jobs: - {jobs: ['test'], project: ['cccl_c_parallel'], gpu: ['rtx2080', 'l4', 'h100']} + - {jobs: ['test'], project: ['cccl_c_stf'], gpu: ['rtx2080', 'l4', 'h100']} # TODO Just need this line once cccl.parallel tests pass on 12.5 and 12.6: # - {jobs: ['test'], project: 'python', ctk: ['12.5', 'curr'], py_version: ['3.10', '3.13'], gpu: 'l4'} # These two can be removed once the above is working: @@ -115,6 +116,7 @@ workflows: - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang'], gpu: 'rtx2080'} # Python and c/parallel jobs: - {jobs: ['test'], project: ['cccl_c_parallel'], gpu: ['rtx2080', 'l4', 'h100']} + - {jobs: ['test'], project: ['cccl_c_stf'], gpu: ['rtx2080', 'l4', 'h100']} # TODO Just need this line once cccl.parallel tests pass on 12.5 and 12.6: # - {jobs: ['test'], project: 'python', ctk: ['12.5', '12.6', '12.8', '12.9'], py_version: ['3.10', '3.11', '3.12', '3.13'], gpu: 'l4'} # These two can be removed once the above is working. 
@@ -328,6 +330,9 @@ projects: cccl_c_parallel: name: 'CCCL C Parallel' stds: [20] + cccl_c_stf: + name: 'CCCL C CUDASTF' + stds: [20] # testing -> Runner with GPU is in a nv-gh-runners testing pool gpus: diff --git a/ci/test_cccl_c_stf.sh b/ci/test_cccl_c_stf.sh new file mode 100755 index 00000000000..090e341292a --- /dev/null +++ b/ci/test_cccl_c_stf.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +source "$(dirname "${BASH_SOURCE[0]}")/build_common.sh" + +print_environment_details + +./build_cccl_c_stf.sh "$@" + +PRESET="cccl-c-stf" + +test_preset "CCCL C CUDASTF Library" ${PRESET} + +print_time_summary From c87cdaa6414c1d7fb27e692553c79f8d51ffe4e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 15:50:29 +0200 Subject: [PATCH 143/221] Add missing headers --- .../stf/include/cccl/c/experimental/stf/stf.h | 16 ++++++++++++++++ c/experimental/stf/src/stf.cu | 10 ++++++++++ 2 files changed, 26 insertions(+) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 6f2f903e6c8..9ab82e1b213 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -1,3 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#ifndef CCCL_C_EXPERIMENTAL +# error "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this notice." +#endif // !CCCL_C_EXPERIMENTAL + #include #include #include diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 0a92d86b677..e300d427105 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+// +//===----------------------------------------------------------------------===// + #include // #include #include From 02a9eb6c5f50166f7816a3be03e069d9c1dd315d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 15:53:38 +0200 Subject: [PATCH 144/221] use snake_case --- .../stf/include/cccl/c/experimental/stf/stf.h | 14 +++++++------- c/experimental/stf/src/stf.cu | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 9ab82e1b213..7768c132feb 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -207,9 +207,9 @@ void stf_cuda_kernel_start(stf_cuda_kernel_handle k); void stf_cuda_kernel_add_desc_cufunc( stf_cuda_kernel_handle k, CUfunction cufunc, - dim3 gridDim_, - dim3 blockDim_, - size_t sharedMem_, + dim3 grid_dim_, + dim3 block_dim_, + size_t shared_mem_, int arg_cnt, const void** args); @@ -218,9 +218,9 @@ void stf_cuda_kernel_add_desc_cufunc( static inline void stf_cuda_kernel_add_desc( stf_cuda_kernel_handle k, const void* func, - dim3 gridDim_, - dim3 blockDim_, - size_t sharedMem_, + dim3 grid_dim_, + dim3 block_dim_, + size_t shared_mem_, int arg_cnt, const void** args) { @@ -228,7 +228,7 @@ static inline void stf_cuda_kernel_add_desc( [[maybe_unused]] cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); assert(res == cudaSuccess); - stf_cuda_kernel_add_desc_cufunc(k, cufunc, gridDim_, blockDim_, sharedMem_, arg_cnt, args); + stf_cuda_kernel_add_desc_cufunc(k, cufunc, grid_dim_, block_dim_, shared_mem_, arg_cnt, args); } void* stf_cuda_kernel_get_arg(stf_cuda_kernel_handle k, int index); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index e300d427105..6531f0d6289 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -274,14 +274,14 @@ void stf_cuda_kernel_start(stf_cuda_kernel_handle k) void stf_cuda_kernel_add_desc_cufunc( stf_cuda_kernel_handle k, CUfunction cufunc, - dim3 gridDim_, - dim3 blockDim_, - size_t sharedMem_, + dim3 grid_dim_, + dim3 block_dim_, + size_t shared_mem_, int arg_cnt, const void** args) { cuda_kernel_desc desc; - desc.configure_raw(cufunc, gridDim_, blockDim_, sharedMem_, arg_cnt, args); + desc.configure_raw(cufunc, grid_dim_, block_dim_, shared_mem_, arg_cnt, args); k->k.add_kernel_desc(mv(desc)); } From 232133b9600258881ea6e2eaf88e311470260fe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 16:26:46 +0200 Subject: [PATCH 145/221] Do define CCCL_C_EXPERIMENTAL=1 --- c/experimental/stf/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 58e78427727..85f9bdb4c34 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -29,6 +29,7 @@ endif() find_package(CUDAToolkit REQUIRED) set_target_properties(cccl.c.experimental.stf PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) +target_compile_definitions(cccl.c.experimental.stf PUBLIC CCCL_C_EXPERIMENTAL=1) target_link_libraries(cccl.c.experimental.stf PRIVATE CUDA::cudart_static CUDA::nvrtc From b60eb6b1006cc93c8c519271fa85adf5fbfc5e5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 16:49:36 +0200 Subject: [PATCH 146/221] Do not do redundant tests --- CMakePresets.json | 4 +++- 1 file changed, 3 
insertions(+), 1 deletion(-) diff --git a/CMakePresets.json b/CMakePresets.json index 63f06ffa281..876b02acc6d 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -272,7 +272,7 @@ "CCCL_ENABLE_C": true, "CCCL_C_Parallel_ENABLE_TESTING": true, "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true, - "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true + "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": false } }, { @@ -281,6 +281,8 @@ "inherits": "base", "cacheVariables": { "CCCL_ENABLE_C": true, + "CCCL_C_Parallel_ENABLE_TESTING": false, + "CCCL_C_Parallel_ENABLE_HEADER_TESTING": false, "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true } }, From c4c99f01474ac6e46466438257d1d5c29cfa27c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 16:54:46 +0200 Subject: [PATCH 147/221] Add a project to ci/inspect_changes.sh --- ci/inspect_changes.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/inspect_changes.sh b/ci/inspect_changes.sh index 4ccf4a6fc70..7c7dfab4580 100755 --- a/ci/inspect_changes.sh +++ b/ci/inspect_changes.sh @@ -63,6 +63,7 @@ declare -A dependencies=( [stdpar]="cccl libcudacxx cub thrust" [python]="cccl libcudacxx cub cccl_c_parallel" [cccl_c_parallel]="cccl libcudacxx cub thrust c2h" + [cccl_c_stf]="cccl libcudacxx cudax c2h" [c2h]="cccl libcudacxx cub thrust" [nvbench_helper]="cccl libcudacxx cub thrust" ) @@ -77,6 +78,7 @@ declare -A project_names=( [stdpar]="stdpar" [python]="python" [cccl_c_parallel]="CCCL C Parallel Library" + [cccl_c_stf]="CCCL C CUDASTF Library" [c2h]="Catch2Helper" [nvbench_helper]="NVBench Helper" ) From 2f5925b166962cf7236ad2ee789cf5b738d7cd50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 16:59:50 +0200 Subject: [PATCH 148/221] missing changes in previous commit --- ci/inspect_changes.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/inspect_changes.sh b/ci/inspect_changes.sh index 7c7dfab4580..f48b82270bd 100755 --- a/ci/inspect_changes.sh +++ b/ci/inspect_changes.sh @@ -47,6 +47,7 @@ subprojects=( stdpar python cccl_c_parallel + cccl_c_stf c2h nvbench_helper ) @@ -92,6 +93,7 @@ declare -A project_names=( declare -A project_dirs=( [packaging]='("examples" "test/cmake")' [cccl_c_parallel]='("c/parallel")' + [cccl_c_stf]='("c/experimental/stf")' [stdpar]='("test/stdpar")' ) From 3417075f27c17d76b1378713d08ab8d55dffd282 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 17:11:31 +0200 Subject: [PATCH 149/221] add presets --- CMakePresets.json | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CMakePresets.json b/CMakePresets.json index 876b02acc6d..1fe5025ba4d 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -439,6 +439,10 @@ "name": "cccl-c-parallel", "configurePreset": "cccl-c-parallel" }, + { + "name": "cccl-c-stf", + "configurePreset": "cccl-c-stf" + }, { "name": "packaging", "configurePreset": "packaging" @@ -732,6 +736,11 @@ "configurePreset": "cccl-c-parallel", "inherits": "base" }, + { + "name": "cccl-c-stf", + "configurePreset": "cccl-c-stf", + "inherits": "base" + }, { "name": "packaging", "configurePreset": "packaging", From 8c05034efad0460969f701bd92aa1b13a9072b7c Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Tue, 2 Sep 2025 15:32:04 +0000 Subject: [PATCH 150/221] Add override matrix --- ci/matrix.yaml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 14529c9fcfd..d1ed0fb08b1 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -8,6 +8,11 @@ 
workflows: # - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'clang16']} # override: + # Python and c/parallel jobs: + - {jobs: ['test'], project: ['cccl_c_parallel'], gpu: ['l4']} + - {jobs: ['test'], project: ['cccl_c_stf'], gpu: ['l4']} + - {jobs: ['test_py_headers', 'test_py_coop', 'test_py_examples'], ctk: ['12.5', 'curr'], project: 'python', py_version: ['3.10', '3.13'], gpu: 'l4'} + - {jobs: ['test_py_par'], ctk: ['12.8', 'curr'], project: 'python', py_version: ['3.10', '3.13'], gpu: 'l4'} pull_request: # Old CTK/compiler @@ -54,7 +59,7 @@ workflows: - {jobs: ['test'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'} # Python and c/parallel jobs: - {jobs: ['test'], project: ['cccl_c_parallel'], gpu: ['rtx2080', 'l4', 'h100']} - - {jobs: ['test'], project: ['cccl_c_stf'], gpu: ['rtx2080', 'l4', 'h100']} + - {jobs: ['test'], project: ['cccl_c_stf'], gpu: ['rtx2080', 'l4', 'h100']} # TODO Just need this line once cccl.parallel tests pass on 12.5 and 12.6: # - {jobs: ['test'], project: 'python', ctk: ['12.5', 'curr'], py_version: ['3.10', '3.13'], gpu: 'l4'} # These two can be removed once the above is working: @@ -116,7 +121,7 @@ workflows: - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang'], gpu: 'rtx2080'} # Python and c/parallel jobs: - {jobs: ['test'], project: ['cccl_c_parallel'], gpu: ['rtx2080', 'l4', 'h100']} - - {jobs: ['test'], project: ['cccl_c_stf'], gpu: ['rtx2080', 'l4', 'h100']} + - {jobs: ['test'], project: ['cccl_c_stf'], gpu: ['rtx2080', 'l4', 'h100']} # TODO Just need this line once cccl.parallel tests pass on 12.5 and 12.6: # - {jobs: ['test'], project: 'python', ctk: ['12.5', '12.6', '12.8', '12.9'], py_version: ['3.10', '3.11', '3.12', '3.13'], gpu: 'l4'} # These two can be removed once the above is working. 
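The cccl-c-stf presets and the ci/build_cccl_c_stf.sh / ci/test_cccl_c_stf.sh scripts introduced above are all driven through CMake's preset workflow; a minimal local equivalent — only a sketch, assuming the configure, build and test presets named cccl-c-stf from the CMakePresets.json hunks above and a CUDA toolkit on the path — looks like:

  cmake --preset cccl-c-stf           # configure with CCCL_ENABLE_C and CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING
  cmake --build --preset cccl-c-stf   # build cccl.c.experimental.stf and the c/experimental/stf tests
  ctest --preset cccl-c-stf           # run the test executables registered via add_test

build_cccl_c_stf.sh performs the configure-and-build step through the shared build_common.sh helpers, and test_cccl_c_stf.sh builds first and then runs the matching test preset.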
From 20faa8ff744f9320871e32cc70d711b04766ffd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 3 Sep 2025 08:34:20 +0200 Subject: [PATCH 151/221] Properly define structs with a typedef and remove superfluous struct keywords --- .../stf/include/cccl/c/experimental/stf/stf.h | 80 ++++++++----------- c/experimental/stf/src/stf.cu | 10 +-- 2 files changed, 39 insertions(+), 51 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 7768c132feb..357baff4ed8 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -31,15 +31,15 @@ typedef enum stf_access_mode STF_RW = STF_READ | STF_WRITE } stf_access_mode; -struct stf_exec_place_device +typedef struct stf_exec_place_device { int dev_id; -}; +} stf_exec_place_device; -struct stf_exec_place_host +typedef struct stf_exec_place_host { char dummy; /* dummy to keep it standard C which does not allow empty structs */ -}; +} stf_exec_place_host; typedef enum stf_exec_place_kind { @@ -47,56 +47,51 @@ typedef enum stf_exec_place_kind STF_EXEC_PLACE_HOST } stf_exec_place_kind; -struct stf_exec_place +typedef struct stf_exec_place { enum stf_exec_place_kind kind; union { - struct stf_exec_place_device device; - struct stf_exec_place_host host; + stf_exec_place_device device; + stf_exec_place_host host; } u; -}; +} stf_exec_place; -static inline struct stf_exec_place make_device_place(int dev_id) +static inline stf_exec_place make_device_place(int dev_id) { - struct stf_exec_place p; + stf_exec_place p; p.kind = STF_EXEC_PLACE_DEVICE; p.u.device.dev_id = dev_id; return p; } -static inline struct stf_exec_place make_host_place() +static inline stf_exec_place make_host_place() { - struct stf_exec_place p; + stf_exec_place p; p.kind = STF_EXEC_PLACE_HOST; p.u.host.dummy = 0; /* to avoid uninitialized memory warnings */ return p; } -typedef struct stf_exec_place_device stf_exec_place_device; -typedef struct stf_exec_place_host stf_exec_place_host; -typedef union stf_exec_place_u stf_exec_place_u; -typedef struct stf_exec_place stf_exec_place; - -struct stf_data_place_device +typedef struct stf_data_place_device { int dev_id; -}; +} stf_data_place_device; -struct stf_data_place_host +typedef struct stf_data_place_host { char dummy; /* dummy to keep it standard C which does not allow empty structs */ -}; +} stf_data_place_host; -struct stf_data_place_managed +typedef struct stf_data_place_managed { char dummy; /* dummy to keep it standard C which does not allow empty structs */ -}; +} stf_data_place_managed; -struct stf_data_place_affine +typedef struct stf_data_place_affine { char dummy; /* dummy to keep it standard C which does not allow empty structs */ -}; +} stf_data_place_affine; typedef enum stf_data_place_kind { @@ -106,21 +101,21 @@ typedef enum stf_data_place_kind STF_DATA_PLACE_AFFINE } stf_data_place_kind; -struct stf_data_place +typedef struct stf_data_place { enum stf_data_place_kind kind; union { - struct stf_data_place_device device; - struct stf_data_place_host host; - struct stf_data_place_managed managed; - struct stf_data_place_affine affine; + stf_data_place_device device; + stf_data_place_host host; + stf_data_place_managed managed; + stf_data_place_affine affine; } u; -}; +} stf_data_place; -static inline struct stf_data_place make_device_data_place(int dev_id) +static inline stf_data_place make_device_data_place(int dev_id) { - struct 
stf_data_place p; + stf_data_place p; p.kind = STF_DATA_PLACE_DEVICE; p.u.device.dev_id = dev_id; return p; @@ -128,7 +123,7 @@ static inline struct stf_data_place make_device_data_place(int dev_id) static inline struct stf_data_place make_host_data_place() { - struct stf_data_place p; + stf_data_place p; p.kind = STF_DATA_PLACE_HOST; p.u.host.dummy = 0; /* to avoid uninitialized memory warnings */ return p; @@ -136,7 +131,7 @@ static inline struct stf_data_place make_host_data_place() static inline struct stf_data_place make_managed_data_place() { - struct stf_data_place p; + stf_data_place p; p.kind = STF_DATA_PLACE_MANAGED; p.u.managed.dummy = 0; /* to avoid uninitialized memory warnings */ return p; @@ -144,19 +139,12 @@ static inline struct stf_data_place make_managed_data_place() static inline struct stf_data_place make_affine_data_place() { - struct stf_data_place p; + stf_data_place p; p.kind = STF_DATA_PLACE_AFFINE; p.u.affine.dummy = 0; /* to avoid uninitialized memory warnings */ return p; } -typedef struct stf_data_place_device stf_data_place_device; -typedef struct stf_data_place_host stf_data_place_host; -typedef struct stf_data_place_managed stf_data_place_managed; -typedef struct stf_data_place_affine stf_data_place_affine; -typedef union stf_data_place_u stf_data_place_u; -typedef struct stf_data_place stf_data_place; - typedef struct stf_ctx_handle_t* stf_ctx_handle; void stf_ctx_create(stf_ctx_handle* ctx); @@ -184,11 +172,11 @@ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld); typedef struct stf_task_handle_t* stf_task_handle; void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t); -void stf_task_set_exec_place(stf_task_handle t, struct stf_exec_place* exec_p); +void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p); void stf_task_set_symbol(stf_task_handle t, const char* symbol); void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m); void stf_task_add_dep_with_dplace( - stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, struct stf_data_place* data_p); + stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, stf_data_place* data_p); void stf_task_start(stf_task_handle t); void stf_task_end(stf_task_handle t); CUstream stf_task_get_custream(stf_task_handle t); @@ -199,7 +187,7 @@ void stf_task_enable_capture(stf_task_handle t); typedef struct stf_cuda_kernel_handle_t* stf_cuda_kernel_handle; void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k); -void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, struct stf_exec_place* exec_p); +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p); void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol); void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); void stf_cuda_kernel_start(stf_cuda_kernel_handle k); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 6531f0d6289..60a24710829 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -99,7 +99,7 @@ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) } /* Convert the C-API stf_exec_place to a C++ exec_place object */ -exec_place to_exec_place(struct stf_exec_place* exec_p) +exec_place to_exec_place(stf_exec_place* exec_p) { if (exec_p->kind == STF_EXEC_PLACE_HOST) { @@ -111,7 +111,7 @@ exec_place to_exec_place(struct stf_exec_place* exec_p) } /* Convert the C-API stf_data_place to a C++ 
data_place object */ -data_place to_data_place(struct stf_data_place* data_p) +data_place to_data_place(stf_data_place* data_p) { assert(data_p); @@ -142,7 +142,7 @@ void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) *t = new stf_task_handle_t{ctx->ctx.task()}; } -void stf_task_set_exec_place(stf_task_handle t, struct stf_exec_place* exec_p) +void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p) { assert(t); t->t.set_exec_place(to_exec_place(exec_p)); @@ -165,7 +165,7 @@ void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_ } void stf_task_add_dep_with_dplace( - stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, struct stf_data_place* data_p) + stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, stf_data_place* data_p) { assert(t); assert(ld); @@ -243,7 +243,7 @@ void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k) *k = new stf_cuda_kernel_handle_t{ctx->ctx.cuda_kernel()}; } -void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, struct stf_exec_place* exec_p) +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p) { assert(k); k->k.set_exec_place(to_exec_place(exec_p)); From 8c5e760326a6c2731f4c5b8b34ef0e42c63d37db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 3 Sep 2025 09:29:55 +0200 Subject: [PATCH 152/221] fix previous merge --- ci/matrix.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 3d9c82c5ce2..c03ffe5e9cc 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -11,8 +11,6 @@ workflows: # Python and c/parallel jobs: - {jobs: ['test'], project: ['cccl_c_parallel'], gpu: ['l4']} - {jobs: ['test'], project: ['cccl_c_stf'], gpu: ['l4']} - - {jobs: ['test_py_headers', 'test_py_coop', 'test_py_examples'], ctk: ['12.5', 'curr'], project: 'python', py_version: ['3.10', '3.13'], gpu: 'l4'} - - {jobs: ['test_py_par'], ctk: ['12.8', 'curr'], project: 'python', py_version: ['3.10', '3.13'], gpu: 'l4'} pull_request: # Old CTK: Oldest/newest supported host compilers: From 78dc197d8bd9f284f0827433b6024459946691c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 3 Sep 2025 10:18:00 +0200 Subject: [PATCH 153/221] Change tensor_arguments to return an element instead of a tuple of one element, and use this feature in examples. 
Also add the equivalent for NUMBA and use it in examples --- .../cccl/experimental/stf/_stf_bindings_impl.pyx | 8 ++++++++ python/cuda_cccl/tests/stf/test_fdtd_pytorch.py | 6 +++--- python/cuda_cccl/tests/stf/test_fhe.py | 16 +++++----------- python/cuda_cccl/tests/stf/test_fhe_decorator.py | 2 +- python/cuda_cccl/tests/stf/test_numba.py | 12 +++++------- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 3b4ba4db39b..ceca943e155 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -441,6 +441,12 @@ cdef class task: raise RuntimeError("numba support is not available") from e return cai_to_numba(cai) + def numba_arguments(self): + arg_cnt=len(self._lds_args) + if arg_cnt == 1: + return self.get_arg_numba(0) + return tuple(self.get_arg_numba(i) for i in range(arg_cnt)) + def get_arg_as_tensor(self, index): cai = self.get_arg_cai(index) try: @@ -451,6 +457,8 @@ cdef class task: def tensor_arguments(self): arg_cnt=len(self._lds_args) + if arg_cnt == 1: + return self.get_arg_as_tensor(0) return tuple(self.get_arg_as_tensor(i) for i in range(arg_cnt)) # ---- context‑manager helpers ------------------------------- diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index ccac389d6a6..3d1d6b0d2df 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -49,7 +49,7 @@ def init_field(ctx, ld, value): ctx.task(ld.write()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), ): - field = t.get_arg_as_tensor(0) + field = t.tensor_arguments() if value == 0: field.zero_() else: @@ -173,7 +173,7 @@ def source(t: float, x: float, y: float, z: float) -> float: ctx.task(lez.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), ): - ez = t.get_arg_as_tensor(0) + ez = t.tensor_arguments() ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) # ------------------------- @@ -222,7 +222,7 @@ def source(t: float, x: float, y: float, z: float) -> float: ctx.task(lez.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), ): - ez = t.get_arg_as_tensor(0) + ez = t.tensor_arguments() print(f"{n}\t{ez[cx, cy, cz].item():.6e}") show_slice(ez, plane="xy") pass diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index 9ec86af51c8..e2d38308341 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -37,7 +37,7 @@ def print_values(self): cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) ) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - hvalues = t.get_arg_numba(0) + hvalues = t.numba_arguments() print([v for v in hvalues]) @@ -85,8 +85,7 @@ def __invert__(self): with ctx.task(self.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - da = t.get_arg_numba(0) - dresult = t.get_arg_numba(1) + da, dresult = t.numba_arguments() not_kernel[32, 16, nb_stream](da, dresult) return result @@ -100,9 +99,7 @@ def __or__(self, other): with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - da = t.get_arg_numba(0) - db = t.get_arg_numba(1) - dresult = t.get_arg_numba(2) + da, db, dresult 
= t.numba_arguments() or_kernel[32, 16, nb_stream](da, db, dresult) return result @@ -117,9 +114,7 @@ def __and__(self, other): with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) nb_stream.synchronize() - da = t.get_arg_numba(0) - db = t.get_arg_numba(1) - dresult = t.get_arg_numba(2) + da, db, dresult = t.numba_arguments() and_kernel[32, 16, nb_stream](da, db, dresult) return result @@ -133,8 +128,7 @@ def decrypt(self): with ctx.task(self.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - da = t.get_arg_numba(0) - dresult = t.get_arg_numba(1) + da, dresult = t.numba_arguments() # reverse the toy XOR "encryption" xor_kernel[32, 16, nb_stream](da, dresult, 0x42) diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index bb369b6f250..8adbf5454ed 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -37,7 +37,7 @@ def print_values(self): cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) ) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - hvalues = t.get_arg_numba(0) + hvalues = t.numba_arguments() print([v for v in hvalues]) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 35fb749c68c..a992d26d7d2 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -42,7 +42,7 @@ def test_numba_graph(): lX = ctx.logical_data(X) with ctx.task(rw(lX)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - dX = t.get_arg_numba(0) + dX = t.numba_arguments() scale[32, 64, nb_stream](2.0, dX) ctx.finalize() @@ -61,7 +61,7 @@ def test_numba(): with ctx.task(rw(lX)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - dX = t.get_arg_numba(0) + dX = t.numba_arguments() # dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) scale[32, 64, nb_stream](2.0, dX) @@ -74,14 +74,12 @@ def test_numba(): with ctx.task(read(lX), rw(lZ)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - dX = t.get_arg_numba(0) - dZ = t.get_arg_numba(1) + dX, dZ = t.numba_arguments() axpy[32, 64, nb_stream](2.0, dX, dZ) with ctx.task(read(lY), rw(lZ)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - dY = t.get_arg_numba(0) - dZ = t.get_arg_numba(1) + dY, dZ = t.numba_arguments() axpy[32, 64, nb_stream](2.0, dY, dZ) ctx.finalize() @@ -217,7 +215,7 @@ def test_numba_places(): with ctx.task(lX.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - dX = t.get_arg_numba(0) + dX = t.numba_arguments() scale[32, 64, nb_stream](2.0, dX) with ctx.task(lX.read(), lY.rw()) as t: From 2eb2ace7025cc3ca6142ce664168c5de0b2c340e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 3 Sep 2025 16:23:26 +0200 Subject: [PATCH 154/221] Remove intermediate structures and use opaque pointers instead --- .../stf/include/cccl/c/experimental/stf/stf.h | 8 +- c/experimental/stf/src/stf.cu | 157 +++++++++++------- 2 files changed, 100 insertions(+), 65 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 357baff4ed8..58f6f3c8492 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -145,7 +145,7 @@ static inline struct stf_data_place make_affine_data_place() return p; } 
-typedef struct stf_ctx_handle_t* stf_ctx_handle; +typedef void* stf_ctx_handle; void stf_ctx_create(stf_ctx_handle* ctx); // TODO stf_ctx_create_with_flags and an enum instead ? @@ -157,7 +157,7 @@ void stf_ctx_finalize(stf_ctx_handle ctx); cudaStream_t stf_fence(stf_ctx_handle ctx); -typedef struct stf_logical_data_handle_t* stf_logical_data_handle; +typedef void* stf_logical_data_handle; void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); @@ -169,7 +169,7 @@ void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld); -typedef struct stf_task_handle_t* stf_task_handle; +typedef void* stf_task_handle; void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t); void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p); @@ -184,7 +184,7 @@ void* stf_task_get(stf_task_handle t, int submitted_index); void stf_task_destroy(stf_task_handle t); void stf_task_enable_capture(stf_task_handle t); -typedef struct stf_cuda_kernel_handle_t* stf_cuda_kernel_handle; +typedef void* stf_cuda_kernel_handle; void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k); void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 60a24710829..a2303a1f5a0 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -16,69 +16,60 @@ using namespace cuda::experimental::stf; extern "C" { -struct stf_ctx_handle_t -{ - context ctx; -}; - -struct stf_logical_data_handle_t -{ - // XXX should we always store a logical_data> instead ? 
- logical_data_untyped ld; -}; - -struct stf_task_handle_t -{ - context::unified_task<> t; -}; - void stf_ctx_create(stf_ctx_handle* ctx) { assert(ctx); - *ctx = new stf_ctx_handle_t{context{}}; + *ctx = new context{}; } void stf_ctx_create_graph(stf_ctx_handle* ctx) { assert(ctx); - *ctx = new stf_ctx_handle_t{context{graph_ctx()}}; + *ctx = new context{graph_ctx()}; } void stf_ctx_finalize(stf_ctx_handle ctx) { - ctx->ctx.finalize(); assert(ctx); - delete ctx; + auto* context_ptr = static_cast(ctx); + context_ptr->finalize(); + delete context_ptr; } cudaStream_t stf_fence(stf_ctx_handle ctx) { assert(ctx); - return ctx->ctx.fence(); + auto* context_ptr = static_cast(ctx); + return context_ptr->fence(); } void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) { - assert(ld); assert(ctx); + assert(ld); - // Create a slice logical data - auto ld_typed = ctx->ctx.logical_data(make_slice((char*) addr, sz)); + auto* context_ptr = static_cast(ctx); + auto ld_typed = context_ptr->logical_data(make_slice((char*) addr, sz)); - // Stored in its untyped version - *ld = new stf_logical_data_handle_t{ld_typed}; + // Store the logical_data_untyped directly as opaque pointer + *ld = new logical_data_untyped{ld_typed}; } void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) { assert(ld); - ld->ld.set_symbol(symbol); + assert(symbol); + + auto* ld_ptr = static_cast(ld); + ld_ptr->set_symbol(symbol); } void stf_logical_data_destroy(stf_logical_data_handle ld) { assert(ld); - delete ld; + + auto* ld_ptr = static_cast(ld); + delete ld_ptr; } void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle* to) @@ -86,8 +77,9 @@ void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_ assert(ctx); assert(to); - auto ld_typed = ctx->ctx.logical_data(shape_of>(length)); - *to = new stf_logical_data_handle_t{ld_typed}; + auto* context_ptr = static_cast(ctx); + auto ld_typed = context_ptr->logical_data(shape_of>(length)); + *to = new logical_data_untyped{ld_typed}; } void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) @@ -95,7 +87,8 @@ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) assert(ctx); assert(ld); - *ld = new stf_logical_data_handle_t{ctx->ctx.token()}; + auto* context_ptr = static_cast(ctx); + *ld = new logical_data_untyped{context_ptr->token()}; } /* Convert the C-API stf_exec_place to a C++ exec_place object */ @@ -136,16 +129,20 @@ data_place to_data_place(stf_data_place* data_p) void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) { - assert(t); assert(ctx); + assert(t); - *t = new stf_task_handle_t{ctx->ctx.task()}; + auto* context_ptr = static_cast(ctx); + *t = new context::unified_task<>{context_ptr->task()}; } void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p) { assert(t); - t->t.set_exec_place(to_exec_place(exec_p)); + assert(exec_p); + + auto* task_ptr = static_cast*>(t); + task_ptr->set_exec_place(to_exec_place(exec_p)); } void stf_task_set_symbol(stf_task_handle t, const char* symbol) @@ -153,7 +150,8 @@ void stf_task_set_symbol(stf_task_handle t, const char* symbol) assert(t); assert(symbol); - t->t.set_symbol(symbol); + auto* task_ptr = static_cast*>(t); + task_ptr->set_symbol(symbol); } void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m) @@ -161,7 +159,9 @@ void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_ assert(t); assert(ld); - 
t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m))); + auto* task_ptr = static_cast*>(t); + auto* ld_ptr = static_cast(ld); + task_ptr->add_deps(task_dep_untyped(*ld_ptr, access_mode(m))); } void stf_task_add_dep_with_dplace( @@ -171,44 +171,58 @@ void stf_task_add_dep_with_dplace( assert(ld); assert(data_p); - t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m), to_data_place(data_p))); + auto* task_ptr = static_cast*>(t); + auto* ld_ptr = static_cast(ld); + task_ptr->add_deps(task_dep_untyped(*ld_ptr, access_mode(m), to_data_place(data_p))); } void* stf_task_get(stf_task_handle t, int index) { assert(t); - auto s = t->t.template get>(index); + + auto* task_ptr = static_cast*>(t); + auto s = task_ptr->template get>(index); return (void*) s.data_handle(); } void stf_task_start(stf_task_handle t) { assert(t); - t->t.start(); + + auto* task_ptr = static_cast*>(t); + task_ptr->start(); } void stf_task_end(stf_task_handle t) { assert(t); - t->t.end(); + + auto* task_ptr = static_cast*>(t); + task_ptr->end(); } void stf_task_enable_capture(stf_task_handle t) { assert(t); - t->t.enable_capture(); + + auto* task_ptr = static_cast*>(t); + task_ptr->enable_capture(); } CUstream stf_task_get_custream(stf_task_handle t) { assert(t); - return (CUstream) t->t.get_stream(); + + auto* task_ptr = static_cast*>(t); + return (CUstream) task_ptr->get_stream(); } void stf_task_destroy(stf_task_handle t) { assert(t); - delete t; + + auto* task_ptr = static_cast*>(t); + delete task_ptr; } /** @@ -228,25 +242,24 @@ void stf_task_destroy(stf_task_handle t) * */ -struct stf_cuda_kernel_handle_t -{ - // return type of ctx.cuda_kernel() - using kernel_type = decltype(::std::declval().cuda_kernel()); - kernel_type k; -}; - void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k) { - assert(k); assert(ctx); + assert(k); - *k = new stf_cuda_kernel_handle_t{ctx->ctx.cuda_kernel()}; + auto* context_ptr = static_cast(ctx); + using kernel_type = decltype(context_ptr->cuda_kernel()); + *k = new kernel_type{context_ptr->cuda_kernel()}; } void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p) { assert(k); - k->k.set_exec_place(to_exec_place(exec_p)); + assert(exec_p); + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + kernel_ptr->set_exec_place(to_exec_place(exec_p)); } void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol) @@ -254,7 +267,9 @@ void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol) assert(k); assert(symbol); - k->k.set_symbol(symbol); + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + kernel_ptr->set_symbol(symbol); } void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m) @@ -262,13 +277,19 @@ void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle l assert(k); assert(ld); - k->k.add_deps(task_dep_untyped(ld->ld, access_mode(m))); + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + auto* ld_ptr = static_cast(ld); + kernel_ptr->add_deps(task_dep_untyped(*ld_ptr, access_mode(m))); } void stf_cuda_kernel_start(stf_cuda_kernel_handle k) { assert(k); - k->k.start(); + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + kernel_ptr->start(); } void stf_cuda_kernel_add_desc_cufunc( @@ -280,28 +301,42 @@ void stf_cuda_kernel_add_desc_cufunc( int 
arg_cnt, const void** args) { + assert(k); + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + cuda_kernel_desc desc; desc.configure_raw(cufunc, grid_dim_, block_dim_, shared_mem_, arg_cnt, args); - - k->k.add_kernel_desc(mv(desc)); + kernel_ptr->add_kernel_desc(mv(desc)); } void* stf_cuda_kernel_get_arg(stf_cuda_kernel_handle k, int index) { - auto s = k->k.template get>(index); + assert(k); + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + auto s = kernel_ptr->template get>(index); return (void*) s.data_handle(); } void stf_cuda_kernel_end(stf_cuda_kernel_handle k) { assert(k); - k->k.end(); + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + kernel_ptr->end(); } void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t) { assert(t); - delete t; + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(t); + delete kernel_ptr; } } // extern "C" From 655706726aa8e1b396b0792a6b20fd59c84f65b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 3 Sep 2025 16:50:22 +0200 Subject: [PATCH 155/221] Automatically generated documentation --- .../stf/include/cccl/c/experimental/stf/stf.h | 974 +++++++++++++++++- 1 file changed, 936 insertions(+), 38 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 58f6f3c8492..7e012830408 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -8,6 +8,61 @@ // //===----------------------------------------------------------------------===// +//! \file stf.h +//! \brief CUDA STF (Sequential Task Flow) C Interface +//! +//! \details +//! This header provides a C interface to the CUDA STF C++ library, enabling +//! task-based parallel programming with automatic data movement and dependency management. +//! +//! The Sequential Task Flow programming model involves defining logical data and +//! submitting tasks that operate on this data. STF automatically deduces dependencies +//! between tasks and orchestrates both computation and data movement to ensure +//! efficient execution with maximum concurrency. +//! +//! \par Key Concepts: +//! - **Logical Data**: Abstract handles for data that may exist in multiple locations +//! - **Tasks**: Operations that consume and produce logical data with specified access modes +//! - **Dependencies**: Automatically inferred from data access patterns (RAW, WAR, WAW) +//! - **Execution Places**: Specify where tasks run (CPU, specific GPU devices) +//! - **Data Places**: Specify where data should be located in memory hierarchy +//! +//! \par Basic Usage Pattern: +//! \code +//! // 1. Create STF context +//! stf_ctx_handle ctx; +//! stf_ctx_create(&ctx); +//! +//! // 2. Create logical data from arrays +//! float X[1024], Y[1024]; +//! stf_logical_data_handle lX, lY; +//! stf_logical_data(ctx, &lX, X, sizeof(X)); +//! stf_logical_data(ctx, &lY, Y, sizeof(Y)); +//! +//! // 3. Create and configure task +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! stf_task_add_dep(task, lX, STF_READ); // X is read-only +//! stf_task_add_dep(task, lY, STF_RW); // Y is read-write +//! +//! // 4. Execute task +//! stf_task_start(task); +//! CUstream stream = stf_task_get_custream(task); +//! float* x_ptr = (float*)stf_task_get(task, 0); +//! 
float* y_ptr = (float*)stf_task_get(task, 1); +//! // ... launch CUDA operations using stream ... +//! stf_task_end(task); +//! +//! // 5. Cleanup +//! stf_ctx_finalize(ctx); +//! stf_task_destroy(task); +//! stf_logical_data_destroy(lX); +//! stf_logical_data_destroy(lY); +//! \endcode +//! +//! \warning This API is experimental and subject to change. +//! Define CCCL_C_EXPERIMENTAL to acknowledge this. + #pragma once #ifndef CCCL_C_EXPERIMENTAL @@ -18,45 +73,76 @@ #include #include -// TODO use CCCL_C_EXTERN_C_BEGIN/CCCL_C_EXTERN_C_END #ifdef __cplusplus extern "C" { #endif +//! \defgroup AccessMode Data Access Modes +//! \brief Specifies how tasks access logical data +//! \{ + +//! \brief Data access mode for task dependencies +//! +//! Specifies how a task will access logical data, which determines +//! synchronization requirements and concurrency opportunities. typedef enum stf_access_mode { - STF_NONE = 0, - STF_READ = 1 << 0, - STF_WRITE = 1 << 1, - STF_RW = STF_READ | STF_WRITE + STF_NONE = 0, //!< No access (invalid) + STF_READ = 1 << 0, //!< Read-only access - allows concurrent readers + STF_WRITE = 1 << 1, //!< Write-only access - requires exclusive access + STF_RW = STF_READ | STF_WRITE //!< Read-write access - requires exclusive access } stf_access_mode; +//! \} + +//! \defgroup ExecPlace Execution Places +//! \brief Specify where tasks should execute +//! \{ + +//! \brief Device execution place configuration typedef struct stf_exec_place_device { - int dev_id; + int dev_id; //!< CUDA device ID (0-based) } stf_exec_place_device; +//! \brief Host execution place configuration typedef struct stf_exec_place_host { - char dummy; /* dummy to keep it standard C which does not allow empty structs */ + char dummy; //!< Dummy field for standard C compatibility } stf_exec_place_host; +//! \brief Execution place type discriminator typedef enum stf_exec_place_kind { - STF_EXEC_PLACE_DEVICE, - STF_EXEC_PLACE_HOST + STF_EXEC_PLACE_DEVICE, //!< Task executes on CUDA device + STF_EXEC_PLACE_HOST //!< Task executes on host (CPU) } stf_exec_place_kind; +//! \brief Execution place specification +//! +//! Tagged union specifying where a task should execute. +//! Use helper functions make_device_place() and make_host_place() to create. typedef struct stf_exec_place { - enum stf_exec_place_kind kind; + enum stf_exec_place_kind kind; //!< Type of execution place union { - stf_exec_place_device device; - stf_exec_place_host host; - } u; + stf_exec_place_device device; //!< Device configuration (when kind == STF_EXEC_PLACE_DEVICE) + stf_exec_place_host host; //!< Host configuration (when kind == STF_EXEC_PLACE_HOST) + } u; //!< Configuration union } stf_exec_place; +//! \brief Create execution place for CUDA device +//! +//! \param dev_id CUDA device index (0-based) +//! \return Execution place configured for specified device +//! +//! \par Example: +//! \code +//! // Execute task on device 1 +//! stf_exec_place place = make_device_place(1); +//! stf_task_set_exec_place(task, &place); +//! \endcode static inline stf_exec_place make_device_place(int dev_id) { stf_exec_place p; @@ -65,6 +151,16 @@ static inline stf_exec_place make_device_place(int dev_id) return p; } +//! \brief Create execution place for host (CPU) +//! +//! \return Execution place configured for host execution +//! +//! \par Example: +//! \code +//! // Execute task on host +//! stf_exec_place place = make_host_place(); +//! stf_task_set_exec_place(task, &place); +//! 
\endcode static inline stf_exec_place make_host_place() { stf_exec_place p; @@ -73,46 +169,74 @@ static inline stf_exec_place make_host_place() return p; } +//! \} + +//! \defgroup DataPlace Data Places +//! \brief Specify where logical data should be located +//! \{ + +//! \brief Device data place configuration typedef struct stf_data_place_device { - int dev_id; + int dev_id; //!< CUDA device ID for data placement } stf_data_place_device; +//! \brief Host data place configuration typedef struct stf_data_place_host { - char dummy; /* dummy to keep it standard C which does not allow empty structs */ + char dummy; //!< Dummy field for standard C compatibility } stf_data_place_host; +//! \brief Managed memory data place configuration typedef struct stf_data_place_managed { - char dummy; /* dummy to keep it standard C which does not allow empty structs */ + char dummy; //!< Dummy field for standard C compatibility } stf_data_place_managed; +//! \brief Affine data place configuration +//! +//! Affine placement means data follows the execution location automatically. typedef struct stf_data_place_affine { - char dummy; /* dummy to keep it standard C which does not allow empty structs */ + char dummy; //!< Dummy field for standard C compatibility } stf_data_place_affine; +//! \brief Data place type discriminator typedef enum stf_data_place_kind { - STF_DATA_PLACE_DEVICE, - STF_DATA_PLACE_HOST, - STF_DATA_PLACE_MANAGED, - STF_DATA_PLACE_AFFINE + STF_DATA_PLACE_DEVICE, //!< Data on specific device memory + STF_DATA_PLACE_HOST, //!< Data on host (CPU) memory + STF_DATA_PLACE_MANAGED, //!< Data in CUDA managed (unified) memory + STF_DATA_PLACE_AFFINE //!< Data follows execution place (default) } stf_data_place_kind; +//! \brief Data placement specification +//! +//! Tagged union specifying where logical data should be located. +//! Use helper functions to create (make_device_data_place(), etc.). typedef struct stf_data_place { - enum stf_data_place_kind kind; + enum stf_data_place_kind kind; //!< Type of data placement union { - stf_data_place_device device; - stf_data_place_host host; - stf_data_place_managed managed; - stf_data_place_affine affine; - } u; + stf_data_place_device device; //!< Device placement configuration + stf_data_place_host host; //!< Host placement configuration + stf_data_place_managed managed; //!< Managed memory configuration + stf_data_place_affine affine; //!< Affine placement configuration + } u; //!< Configuration union } stf_data_place; +//! \brief Create data place for specific CUDA device +//! +//! \param dev_id CUDA device index (0-based) +//! \return Data place configured for device memory +//! +//! \par Example: +//! \code +//! // Force data to device 1 even if task runs elsewhere +//! stf_data_place dplace = make_device_data_place(1); +//! stf_task_add_dep_with_dplace(task, data, STF_READ, &dplace); +//! \endcode static inline stf_data_place make_device_data_place(int dev_id) { stf_data_place p; @@ -121,6 +245,16 @@ static inline stf_data_place make_device_data_place(int dev_id) return p; } +//! \brief Create data place for host memory +//! +//! \return Data place configured for host (CPU) memory +//! +//! \par Example: +//! \code +//! // Keep data on host even for device tasks (sparse access) +//! stf_data_place dplace = make_host_data_place(); +//! stf_task_add_dep_with_dplace(task, data, STF_READ, &dplace); +//! 
\endcode static inline struct stf_data_place make_host_data_place() { stf_data_place p; @@ -129,6 +263,18 @@ static inline struct stf_data_place make_host_data_place() return p; } +//! +//! \brief Create data place for CUDA managed memory +//! +//! \return Data place configured for managed (unified) memory +//! +//! \par Example: +//! \code +//! // Use managed memory for flexible access patterns +//! stf_data_place dplace = make_managed_data_place(); +//! stf_task_add_dep_with_dplace(task, data, STF_RW, &dplace); +//! \endcode + static inline struct stf_data_place make_managed_data_place() { stf_data_place p; @@ -137,6 +283,18 @@ static inline struct stf_data_place make_managed_data_place() return p; } +//! +//! \brief Create affine data place (follows execution location) +//! +//! \return Data place configured for affine placement (default behavior) +//! +//! \par Example: +//! \code +//! // Explicitly specify default behavior +//! stf_data_place dplace = make_affine_data_place(); +//! stf_task_add_dep_with_dplace(task, data, STF_RW, &dplace); +//! \endcode + static inline struct stf_data_place make_affine_data_place() { stf_data_place p; @@ -145,53 +303,708 @@ static inline struct stf_data_place make_affine_data_place() return p; } +//! \} + +//! \defgroup Handles Opaque Handles +//! \brief Opaque handle types for STF objects +//! \{ + +//! +//! \brief Opaque handle for STF context +//! +//! Context stores the state of the STF library and serves as entry point for all API calls. +//! Must be created with stf_ctx_create() or stf_ctx_create_graph() and destroyed with stf_ctx_finalize(). + typedef void* stf_ctx_handle; +//! +//! \brief Opaque handle for logical data +//! +//! Represents abstract data that may exist in multiple memory locations. +//! Created with stf_logical_data() or stf_logical_data_empty() and destroyed with stf_logical_data_destroy(). + +typedef void* stf_logical_data_handle; + +//! +//! \brief Opaque handle for task +//! +//! Represents a computational task that operates on logical data. +//! Created with stf_task_create() and destroyed with stf_task_destroy(). + +typedef void* stf_task_handle; + +//! +//! \brief Opaque handle for CUDA kernel task +//! +//! Specialized task optimized for CUDA kernel execution. +//! Created with stf_cuda_kernel_create() and destroyed with stf_cuda_kernel_destroy(). + +typedef void* stf_cuda_kernel_handle; + +//! \} + +//! \defgroup Context Context Management +//! \brief Create, configure, and finalize STF contexts +//! \{ + +//! +//! \brief Create STF context with stream backend +//! +//! Creates a new STF context using the default stream-based backend. +//! Tasks are executed eagerly using CUDA streams and events. +//! +//! \param[out] ctx Pointer to receive context handle +//! +//! \pre ctx must not be NULL +//! \post *ctx contains valid context handle that must be finalized with stf_ctx_finalize() +//! +//! \par Example: +//! \code +//! stf_ctx_handle ctx; +//! stf_ctx_create(&ctx); +//! // ... use context ... +//! stf_ctx_finalize(ctx); +//! \endcode +//! +//! \see stf_ctx_create_graph(), stf_ctx_finalize() + void stf_ctx_create(stf_ctx_handle* ctx); -// TODO stf_ctx_create_with_flags and an enum instead ? + +//! +//! \brief Create STF context with graph backend +//! +//! Creates a new STF context using the CUDA graph backend. +//! Tasks are captured into CUDA graphs and launched when needed, +//! potentially providing better performance for repeated patterns. +//! +//! \param[out] ctx Pointer to receive context handle +//! +//! 
\pre ctx must not be NULL +//! \post *ctx contains valid context handle that must be finalized with stf_ctx_finalize() +//! +//! \note Graph backend has restrictions on stream synchronization within tasks +//! +//! \par Example: +//! \code +//! stf_ctx_handle ctx; +//! stf_ctx_create_graph(&ctx); +//! // ... use context ... +//! stf_ctx_finalize(ctx); +//! \endcode +//! +//! \see stf_ctx_create(), stf_ctx_finalize() + void stf_ctx_create_graph(stf_ctx_handle* ctx); + +//! +//! \brief Finalize STF context +//! +//! Waits for all pending operations to complete, performs write-back +//! of modified data to host, and releases all associated resources. +//! +//! \param ctx Context handle to finalize +//! +//! \pre ctx must be valid context handle +//! \post All pending operations completed, resources released, ctx becomes invalid +//! +//! \note This function blocks until all asynchronous operations complete +//! +//! \par Example: +//! \code +//! stf_ctx_handle ctx; +//! stf_ctx_create(&ctx); +//! // ... submit tasks ... +//! stf_ctx_finalize(ctx); // Blocks until completion +//! \endcode +//! +//! \see stf_ctx_create(), stf_ctx_create_graph(), stf_fence() + void stf_ctx_finalize(stf_ctx_handle ctx); -// TODO stf_ctx_set_mode() + define enum with GRAPH, STREAM, ... -// TODO stf_ctx_is_graph() +//! +//! \brief Get synchronization fence for context +//! +//! Returns a CUDA stream that will be signaled when all pending +//! operations in the context complete. Provides non-blocking +//! alternative to stf_ctx_finalize() for synchronization queries. +//! +//! \param ctx Context handle +//! \return CUDA stream for synchronization +//! +//! \pre ctx must be valid context handle +//! +//! \par Example: +//! \code +//! stf_ctx_handle ctx; +//! stf_ctx_create(&ctx); +//! // ... submit tasks ... +//! +//! cudaStream_t fence = stf_fence(ctx); +//! cudaStreamSynchronize(fence); // Wait for completion +//! stf_ctx_finalize(ctx); +//! \endcode +//! +//! \see stf_ctx_finalize() cudaStream_t stf_fence(stf_ctx_handle ctx); -typedef void* stf_logical_data_handle; +//! \} + +//! \defgroup LogicalData Logical Data Management +//! \brief Create and manage abstract data handles +//! \{ + +//! +//! \brief Create logical data from existing memory buffer +//! +//! Creates logical data handle from an existing host memory buffer. +//! STF takes ownership of data management during task execution. +//! +//! \param ctx Context handle +//! \param[out] ld Pointer to receive logical data handle +//! \param addr Pointer to existing data buffer +//! \param sz Size of data in bytes +//! +//! \pre ctx must be valid context handle +//! \pre ld must not be NULL +//! \pre addr must not be NULL +//! \pre sz must be greater than 0 +//! \post *ld contains valid logical data handle +//! +//! \note Original data pointer should not be accessed during task execution +//! \note Data will be written back when logical data is destroyed or context finalized +//! +//! \par Example: +//! \code +//! float data[1024]; +//! stf_logical_data_handle ld; +//! stf_logical_data(ctx, &ld, data, sizeof(data)); +//! // ... use in tasks ... +//! stf_logical_data_destroy(ld); +//! \endcode +//! +//! \see stf_logical_data_empty(), stf_logical_data_destroy() void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); + +//! +//! \brief Set symbolic name for logical data +//! +//! Associates a human-readable name with logical data for debugging +//! and task graph visualization. +//! +//! \param ld Logical data handle +//! 
\param symbol Null-terminated string name +//! +//! \pre ld must be valid logical data handle +//! \pre symbol must not be NULL +//! +//! \note Symbol appears in DOT graph output when CUDASTF_DOT_FILE is set +//! +//! \par Example: +//! \code +//! stf_logical_data_handle ld; +//! stf_logical_data(ctx, &ld, data, size); +//! stf_logical_data_set_symbol(ld, "input_matrix"); +//! \endcode +//! +//! \see stf_task_set_symbol() + void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); + +//! +//! \brief Destroy logical data handle +//! +//! Destroys logical data handle and releases associated resources. +//! Triggers write-back to host if data was modified. +//! +//! \param ld Logical data handle to destroy +//! +//! \pre ld must be valid logical data handle +//! \post ld becomes invalid, resources released +//! +//! \note Must be called for every created logical data handle +//! +//! \par Example: +//! \code +//! stf_logical_data_handle ld; +//! stf_logical_data(ctx, &ld, data, size); +//! // ... use in tasks ... +//! stf_logical_data_destroy(ld); // Cleanup +//! \endcode +//! +//! \see stf_logical_data(), stf_logical_data_empty() + void stf_logical_data_destroy(stf_logical_data_handle ld); + +//! +//! \brief Create empty logical data (temporary) +//! +//! Creates logical data of specified size without backing host memory. +//! Useful for temporary buffers in multi-stage computations. +//! +//! \param ctx Context handle +//! \param length Size in bytes +//! \param[out] to Pointer to receive logical data handle +//! +//! \pre ctx must be valid context handle +//! \pre length must be greater than 0 +//! \pre to must not be NULL +//! \post *to contains valid logical data handle +//! +//! \note First access must be write-only (STF_WRITE) +//! \note No write-back occurs since there's no host backing +//! +//! \par Example: +//! \code +//! stf_logical_data_handle temp; +//! stf_logical_data_empty(ctx, 1024 * sizeof(float), &temp); +//! +//! // First access must be write-only +//! stf_task_add_dep(task, temp, STF_WRITE); +//! \endcode +//! +//! \see stf_logical_data(), stf_logical_data_destroy() + void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle* to); -// TODO -// void stf_logical_data_wait(stf_logical_data_handle ld); +//! +//! \brief Create synchronization token +//! +//! Creates a logical data handle for synchronization purposes only. +//! Contains no actual data but can be used to enforce execution order. +//! +//! \param ctx Context handle +//! \param[out] ld Pointer to receive token handle +//! +//! \pre ctx must be valid context handle +//! \pre ld must not be NULL +//! \post *ld contains valid token handle +//! +//! \note More efficient than using dummy data for synchronization +//! \note Can be accessed with any access mode +//! +//! \par Example: +//! \code +//! stf_logical_data_handle sync_token; +//! stf_token(ctx, &sync_token); +//! +//! // Task 1 signals completion +//! stf_task_add_dep(task1, sync_token, STF_WRITE); +//! +//! // Task 2 waits for task1 +//! stf_task_add_dep(task2, sync_token, STF_READ); +//! \endcode +//! +//! \see stf_logical_data(), stf_logical_data_destroy() void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld); -typedef void* stf_task_handle; +//! \} + +//! \defgroup TaskManagement Task Management +//! \brief Create, configure, and execute computational tasks +//! \{ + +//! +//! \brief Create new task +//! +//! Creates a new task within the specified context. Task is created +//! 
but not configured or executed. Use other stf_task_* functions +//! to configure execution place, add dependencies, and execute. +//! +//! \param ctx Context handle +//! \param[out] t Pointer to receive task handle +//! +//! \pre ctx must be valid context handle +//! \pre t must not be NULL +//! \post *t contains valid task handle +//! +//! \par Example: +//! \code +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! // ... configure task ... +//! stf_task_destroy(task); +//! \endcode +//! +//! \see stf_task_destroy(), stf_task_set_exec_place(), stf_task_add_dep() void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t); + +//! +//! \brief Set task execution place +//! +//! Specifies where the task should execute (device or host). +//! If not called, defaults to current device. +//! +//! \param t Task handle +//! \param exec_p Pointer to execution place specification +//! +//! \pre t must be valid task handle +//! \pre exec_p must not be NULL +//! \pre Must be called before stf_task_start() +//! +//! \par Example: +//! \code +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! +//! // Execute on device 1 +//! stf_exec_place place = make_device_place(1); +//! stf_task_set_exec_place(task, &place); +//! \endcode +//! +//! \see make_device_place(), make_host_place() + void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p); + +//! +//! \brief Set symbolic name for task +//! +//! Associates a human-readable name with task for debugging +//! and task graph visualization. +//! +//! \param t Task handle +//! \param symbol Null-terminated string name +//! +//! \pre t must be valid task handle +//! \pre symbol must not be NULL +//! +//! \note Symbol appears in DOT graph output when CUDASTF_DOT_FILE is set +//! +//! \par Example: +//! \code +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! stf_task_set_symbol(task, "matrix_multiply"); +//! \endcode +//! +//! \see stf_logical_data_set_symbol() + void stf_task_set_symbol(stf_task_handle t, const char* symbol); + +//! +//! \brief Add data dependency to task +//! +//! Adds a data dependency with specified access mode. Order of calls +//! determines index for stf_task_get(). Dependencies determine +//! automatic task synchronization. +//! +//! \param t Task handle +//! \param ld Logical data handle +//! \param m Access mode (STF_READ, STF_WRITE, STF_RW) +//! +//! \pre t must be valid task handle +//! \pre ld must be valid logical data handle +//! \pre m must be valid access mode +//! +//! \par Example: +//! \code +//! stf_task_add_dep(task, input_data, STF_READ); // Index 0 +//! stf_task_add_dep(task, output_data, STF_WRITE); // Index 1 +//! stf_task_add_dep(task, temp_data, STF_RW); // Index 2 +//! \endcode +//! +//! \see stf_task_add_dep_with_dplace(), stf_task_get() + void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m); + +//! +//! \brief Add data dependency with explicit data placement +//! +//! Adds data dependency with specified access mode and explicit +//! data placement. Overrides default affine placement. +//! +//! \param t Task handle +//! \param ld Logical data handle +//! \param m Access mode (STF_READ, STF_WRITE, STF_RW) +//! \param data_p Pointer to data place specification +//! +//! \pre t must be valid task handle +//! \pre ld must be valid logical data handle +//! \pre m must be valid access mode +//! \pre data_p must not be NULL +//! +//! \par Example: +//! \code +//! // Force data to device 0 even if task runs elsewhere +//! 
stf_data_place dplace = make_device_data_place(0); +//! stf_task_add_dep_with_dplace(task, ld, STF_READ, &dplace); +//! \endcode +//! +//! \see stf_task_add_dep(), make_device_data_place(), make_host_data_place() + void stf_task_add_dep_with_dplace( stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, stf_data_place* data_p); + +//! +//! \brief Begin task execution +//! +//! Starts task execution. After this call, use stf_task_get_custream() +//! and stf_task_get() to access CUDA stream and data pointers. +//! +//! \param t Task handle +//! +//! \pre t must be valid task handle +//! \pre Task dependencies must already be configured +//! \post Task is executing, stream and data available +//! +//! \par Example: +//! \code +//! // Configure task first +//! stf_task_add_dep(task, data, STF_RW); +//! +//! // Start execution +//! stf_task_start(task); +//! +//! // Now can access stream and data +//! CUstream stream = stf_task_get_custream(task); +//! float* ptr = (float*)stf_task_get(task, 0); +//! \endcode +//! +//! \see stf_task_end(), stf_task_get_custream(), stf_task_get() + void stf_task_start(stf_task_handle t); + +//! +//! \brief End task execution +//! +//! Ends task execution. Call after all CUDA operations are +//! submitted to the task stream. +//! +//! \param t Task handle +//! +//! \pre t must be valid task handle +//! \pre stf_task_start() must have been called +//! \post Task execution ended, may continue asynchronously +//! +//! \par Example: +//! \code +//! stf_task_start(task); +//! CUstream stream = stf_task_get_custream(task); +//! +//! // Launch operations +//! my_kernel<<>>(args...); +//! +//! stf_task_end(task); // Operations may still be running +//! \endcode +//! +//! \see stf_task_start() + void stf_task_end(stf_task_handle t); + +//! +//! \brief Get CUDA stream for task +//! +//! Returns CUDA stream associated with the task. All CUDA operations +//! within task must use this stream for proper synchronization. +//! +//! \param t Task handle +//! \return CUDA stream for launching operations +//! +//! \pre t must be valid task handle +//! \pre stf_task_start() must have been called +//! +//! \par Example: +//! \code +//! stf_task_start(task); +//! CUstream stream = stf_task_get_custream(task); +//! +//! // Launch kernel using this stream +//! my_kernel<<>>(args...); +//! \endcode +//! +//! \see stf_task_start(), stf_task_get() + CUstream stf_task_get_custream(stf_task_handle t); + +//! +//! \brief Get data pointer for task dependency +//! +//! Returns pointer to logical data instance for specified dependency. +//! Index corresponds to order of stf_task_add_dep() calls. +//! +//! \param t Task handle +//! \param submitted_index Dependency index (0-based) +//! \return Pointer to data (cast to appropriate type) +//! +//! \pre t must be valid task handle +//! \pre stf_task_start() must have been called +//! \pre submitted_index must be valid dependency index +//! \post Pointer valid until stf_task_end() +//! +//! \par Example: +//! \code +//! // Dependencies added in this order: +//! stf_task_add_dep(task, input, STF_READ); // Index 0 +//! stf_task_add_dep(task, output, STF_WRITE); // Index 1 +//! +//! stf_task_start(task); +//! +//! // Get data pointers +//! const float* in = (const float*)stf_task_get(task, 0); +//! float* out = (float*)stf_task_get(task, 1); +//! \endcode +//! +//! \see stf_task_add_dep(), stf_task_start() + void* stf_task_get(stf_task_handle t, int submitted_index); + +//! +//! \brief Destroy task handle +//! +//! 
Destroys task handle and releases associated resources. +//! Task should be completed before destruction. +//! +//! \param t Task handle to destroy +//! +//! \pre t must be valid task handle +//! \post t becomes invalid, resources released +//! +//! \note Must be called for every created task +//! +//! \par Example: +//! \code +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! // ... configure and execute task ... +//! stf_task_destroy(task); +//! \endcode +//! +//! \see stf_task_create() + void stf_task_destroy(stf_task_handle t); + +//! +//! \brief Enable graph capture for task (advanced) +//! +//! Enables graph capture optimization for the task. +//! Advanced feature typically not needed for basic usage. +//! +//! \param t Task handle +//! +//! \pre t must be valid task handle +//! +//! \note Used internally for CUDA graph backend optimization + void stf_task_enable_capture(stf_task_handle t); -typedef void* stf_cuda_kernel_handle; +//! \} + +//! \defgroup CUDAKernel CUDA Kernel Interface +//! \brief Optimized interface for CUDA kernel execution +//! \{ + +//! +//! \brief Create CUDA kernel task +//! +//! Creates a specialized task optimized for CUDA kernel execution. +//! More efficient than generic tasks for repeated kernel launches, +//! especially with CUDA graph backend. +//! +//! \param ctx Context handle +//! \param[out] k Pointer to receive kernel handle +//! +//! \pre ctx must be valid context handle +//! \pre k must not be NULL +//! \post *k contains valid kernel handle +//! +//! \par Example: +//! \code +//! stf_cuda_kernel_handle kernel; +//! stf_cuda_kernel_create(ctx, &kernel); +//! // ... configure kernel ... +//! stf_cuda_kernel_destroy(kernel); +//! \endcode +//! +//! \see stf_cuda_kernel_destroy(), stf_task_create() void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k); + +//! +//! \brief Set kernel execution place +//! +//! Specifies where the CUDA kernel should execute. +//! +//! \param k Kernel handle +//! \param exec_p Pointer to execution place specification +//! +//! \pre k must be valid kernel handle +//! \pre exec_p must not be NULL +//! +//! \see make_device_place(), stf_task_set_exec_place() + void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p); + +//! +//! \brief Set symbolic name for kernel +//! +//! Associates human-readable name with kernel for debugging. +//! +//! \param k Kernel handle +//! \param symbol Null-terminated string name +//! +//! \pre k must be valid kernel handle +//! \pre symbol must not be NULL +//! +//! \see stf_task_set_symbol(), stf_logical_data_set_symbol() + void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol); + +//! +//! \brief Add data dependency to kernel +//! +//! Adds data dependency with specified access mode for kernel execution. +//! +//! \param k Kernel handle +//! \param ld Logical data handle +//! \param m Access mode (STF_READ, STF_WRITE, STF_RW) +//! +//! \pre k must be valid kernel handle +//! \pre ld must be valid logical data handle +//! \pre m must be valid access mode +//! +//! \see stf_task_add_dep() + void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); + +//! +//! \brief Start kernel execution +//! +//! Begins kernel execution phase. After this, add kernel descriptions +//! with stf_cuda_kernel_add_desc(). +//! +//! \param k Kernel handle +//! +//! \pre k must be valid kernel handle +//! \pre Dependencies must already be configured +//! +//! 
\see stf_cuda_kernel_add_desc(), stf_cuda_kernel_end() + void stf_cuda_kernel_start(stf_cuda_kernel_handle k); +//! +//! \brief Add CUDA kernel launch description (driver API) +//! +//! Adds kernel launch specification using CUDA driver API function handle. +//! This is the low-level interface used internally. +//! +//! \param k Kernel handle +//! \param cufunc CUDA driver API function handle +//! \param grid_dim_ CUDA grid dimensions +//! \param block_dim_ CUDA block dimensions +//! \param shared_mem_ Shared memory size in bytes +//! \param arg_cnt Number of kernel arguments +//! \param args Array of pointers to kernel arguments +//! +//! \pre k must be valid kernel handle +//! \pre stf_cuda_kernel_start() must have been called +//! \pre cufunc must be valid CUfunction +//! \pre args must contain arg_cnt valid argument pointers +//! +//! \see stf_cuda_kernel_add_desc() + void stf_cuda_kernel_add_desc_cufunc( stf_cuda_kernel_handle k, CUfunction cufunc, @@ -201,8 +1014,46 @@ void stf_cuda_kernel_add_desc_cufunc( int arg_cnt, const void** args); -/* Convert CUDA kernel address to CUfunction because we may use them from a - * shared library where this would be invalid in the runtime API. */ +//! +//! \brief Add CUDA kernel launch description +//! +//! Adds kernel launch specification using runtime API function pointer. +//! Automatically converts to driver API internally. +//! +//! \param k Kernel handle +//! \param func Pointer to __global__ function +//! \param grid_dim_ CUDA grid dimensions +//! \param block_dim_ CUDA block dimensions +//! \param shared_mem_ Shared memory size in bytes +//! \param arg_cnt Number of kernel arguments +//! \param args Array of pointers to kernel arguments +//! +//! \pre k must be valid kernel handle +//! \pre stf_cuda_kernel_start() must have been called +//! \pre func must be valid __global__ function pointer +//! \pre args must contain arg_cnt valid argument pointers +//! +//! \note Converts function pointer to CUfunction automatically +//! +//! \par Example: +//! \code +//! // Kernel: __global__ void axpy(float alpha, float* x, float* y) +//! stf_cuda_kernel_start(kernel); +//! +//! // Prepare arguments +//! float alpha = 2.0f; +//! float* d_x = (float*)stf_cuda_kernel_get_arg(kernel, 0); +//! float* d_y = (float*)stf_cuda_kernel_get_arg(kernel, 1); +//! const void* args[] = {&alpha, &d_x, &d_y}; +//! +//! // Launch kernel +//! stf_cuda_kernel_add_desc(kernel, (void*)axpy, +//! dim3(16), dim3(128), 0, 3, args); +//! stf_cuda_kernel_end(kernel); +//! \endcode +//! +//! \see stf_cuda_kernel_add_desc_cufunc(), stf_cuda_kernel_get_arg() + static inline void stf_cuda_kernel_add_desc( stf_cuda_kernel_handle k, const void* func, @@ -219,9 +1070,56 @@ static inline void stf_cuda_kernel_add_desc( stf_cuda_kernel_add_desc_cufunc(k, cufunc, grid_dim_, block_dim_, shared_mem_, arg_cnt, args); } +//! +//! \brief Get kernel argument data pointer +//! +//! Returns pointer to logical data for use as kernel argument. +//! Index corresponds to order of stf_cuda_kernel_add_dep() calls. +//! +//! \param k Kernel handle +//! \param index Dependency index (0-based) +//! \return Pointer to data for kernel argument +//! +//! \pre k must be valid kernel handle +//! \pre stf_cuda_kernel_start() must have been called +//! \pre index must be valid dependency index +//! +//! \see stf_cuda_kernel_add_desc(), stf_task_get() + void* stf_cuda_kernel_get_arg(stf_cuda_kernel_handle k, int index); + +//! +//! \brief End kernel execution +//! +//! Ends kernel execution phase. 
Call after all kernel descriptions +//! are added with stf_cuda_kernel_add_desc(). +//! +//! \param k Kernel handle +//! +//! \pre k must be valid kernel handle +//! \pre stf_cuda_kernel_start() must have been called +//! +//! \see stf_cuda_kernel_start() + void stf_cuda_kernel_end(stf_cuda_kernel_handle k); -void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t); + +//! +//! \brief Destroy kernel handle +//! +//! Destroys kernel handle and releases associated resources. +//! +//! \param k Kernel handle to destroy +//! +//! \pre k must be valid kernel handle +//! \post k becomes invalid, resources released +//! +//! \note Must be called for every created kernel +//! +//! \see stf_cuda_kernel_create() + +void stf_cuda_kernel_destroy(stf_cuda_kernel_handle k); + +//! \} #ifdef __cplusplus } From 60266ff4f299a915eb565ec11287ab4a06fbb99d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 3 Sep 2025 16:59:22 +0200 Subject: [PATCH 156/221] Better implementation of the help to convert C places to the C++ API, and define invalid values --- c/experimental/stf/src/stf.cu | 44 +++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index a2303a1f5a0..14a03de7f5a 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -94,13 +94,20 @@ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) /* Convert the C-API stf_exec_place to a C++ exec_place object */ exec_place to_exec_place(stf_exec_place* exec_p) { - if (exec_p->kind == STF_EXEC_PLACE_HOST) + assert(exec_p); + + switch (exec_p->kind) { - return exec_place::host(); - } + case STF_EXEC_PLACE_HOST: + return exec_place::host(); - assert(exec_p->kind == STF_EXEC_PLACE_DEVICE); - return exec_place::device(exec_p->u.device.dev_id); + case STF_EXEC_PLACE_DEVICE: + return exec_place::device(exec_p->u.device.dev_id); + + default: + assert(false && "Invalid execution place kind"); + return exec_place{}; // invalid exec_place + } } /* Convert the C-API stf_data_place to a C++ data_place object */ @@ -108,23 +115,24 @@ data_place to_data_place(stf_data_place* data_p) { assert(data_p); - if (data_p->kind == STF_DATA_PLACE_HOST) + switch (data_p->kind) { - return data_place::host(); - } + case STF_DATA_PLACE_HOST: + return data_place::host(); - if (data_p->kind == STF_DATA_PLACE_MANAGED) - { - return data_place::managed(); - } + case STF_DATA_PLACE_MANAGED: + return data_place::managed(); - if (data_p->kind == STF_DATA_PLACE_AFFINE) - { - return data_place::affine(); - } + case STF_DATA_PLACE_AFFINE: + return data_place::affine(); + + case STF_DATA_PLACE_DEVICE: + return data_place::device(data_p->u.device.dev_id); - assert(data_p->kind == STF_DATA_PLACE_DEVICE); - return data_place::device(data_p->u.device.dev_id); + default: + assert(false && "Invalid data place kind"); + return data_place::invalid(); // invalid data_place + } } void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) From 59f198304c5125765b9b0e097b54bb13aa027180 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 3 Sep 2025 17:04:15 +0200 Subject: [PATCH 157/221] Tell where to find cudax, and remove unnecessary libs --- c/experimental/stf/CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 85f9bdb4c34..b44b0cbbcca 100644 --- a/c/experimental/stf/CMakeLists.txt +++ 
b/c/experimental/stf/CMakeLists.txt @@ -27,13 +27,16 @@ if (CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY) ) endif() +find_package(cudax REQUIRED CONFIG + NO_DEFAULT_PATH # Only check the explicit path in HINTS: + HINTS "${CCCL_SOURCE_DIR}/lib/cmake/cudax/" +) + find_package(CUDAToolkit REQUIRED) set_target_properties(cccl.c.experimental.stf PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) target_compile_definitions(cccl.c.experimental.stf PUBLIC CCCL_C_EXPERIMENTAL=1) target_link_libraries(cccl.c.experimental.stf PRIVATE CUDA::cudart_static - CUDA::nvrtc - CUDA::nvJitLink CUDA::cuda_driver CCCL::cudax ) From 97dd6f7057597d22cccf3fd92d2107b218268e61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 3 Sep 2025 21:40:26 +0200 Subject: [PATCH 158/221] CCCL_ENABLE_C enables c/parallel, CCCL_ENABLE_C_EXPERIMENTAL_STF enables c/experimental/stf/ --- CMakeLists.txt | 7 +++++++ CMakePresets.json | 6 +++++- c/CMakeLists.txt | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0830d733b32..034191f0d5c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,6 +40,7 @@ option(CCCL_ENABLE_THRUST "Enable the Thrust developer build." OFF) option(CCCL_ENABLE_TESTING "Enable CUDA C++ Core Library tests." OFF) option(CCCL_ENABLE_EXAMPLES "Enable CUDA C++ Core Library examples." OFF) option(CCCL_ENABLE_C "Enable CUDA C Core Library." OFF) +option(CCCL_ENABLE_C_EXPERIMENTAL_STF "Enable CUDA C CUDASTF Library." OFF) if ("NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") set(CCCL_ENABLE_BENCHMARKS OFF) @@ -86,6 +87,12 @@ if (CCCL_ENABLE_C) add_subdirectory(c) endif() +if (CCCL_ENABLE_C_EXPERIMENTAL_STF) + add_subdirectory(c/experimental/stf) +endif() + + + if (CCCL_ENABLE_TESTING) add_subdirectory(test) endif() diff --git a/CMakePresets.json b/CMakePresets.json index 3b86d9c813d..376f90d4f12 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -22,6 +22,7 @@ "CCCL_ENABLE_TESTING": false, "CCCL_ENABLE_EXAMPLES": false, "CCCL_ENABLE_C": false, + "CCCL_ENABLE_C_EXPERIMENTAL_STF": false, "libcudacxx_ENABLE_INSTALL_RULES": true, "CUB_ENABLE_INSTALL_RULES": true, "Thrust_ENABLE_INSTALL_RULES": true, @@ -64,6 +65,7 @@ "CCCL_ENABLE_EXAMPLES": true, "CCCL_ENABLE_BENCHMARKS": true, "CCCL_ENABLE_C": true, + "CCCL_ENABLE_C_EXPERIMENTAL_STF": true, "CCCL_IGNORE_DEPRECATED_CPP_DIALECT": true, "LIBCUDACXX_ENABLE_LIBCUDACXX_TESTS": true, "CUB_ENABLE_TESTING": true, @@ -272,6 +274,7 @@ "inherits": "base", "cacheVariables": { "CCCL_ENABLE_C": true, + "CCCL_ENABLE_C_EXPERIMENTAL_STF": false, "CCCL_C_Parallel_ENABLE_TESTING": true, "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true, "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": false @@ -282,7 +285,8 @@ "displayName": "CCCL C CUDASTF Library", "inherits": "base", "cacheVariables": { - "CCCL_ENABLE_C": true, + "CCCL_ENABLE_C": false, + "CCCL_ENABLE_C_EXPERIMENTAL_STF": true, "CCCL_C_Parallel_ENABLE_TESTING": false, "CCCL_C_Parallel_ENABLE_HEADER_TESTING": false, "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index 364494da7a0..af1cc1a4234 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -1,2 +1,2 @@ add_subdirectory(parallel) -add_subdirectory(experimental/stf/) +# add_subdirectory(experimental/stf/) From 1610f0b20af2f11512af4dc8b8de9512f5b528a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 3 Sep 2025 22:03:56 +0200 Subject: [PATCH 159/221] Remove unnecessary definitions --- CMakePresets.json | 5 +---- 
c/experimental/stf/test/CMakeLists.txt | 7 ------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index 376f90d4f12..a675873c21d 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -276,8 +276,7 @@ "CCCL_ENABLE_C": true, "CCCL_ENABLE_C_EXPERIMENTAL_STF": false, "CCCL_C_Parallel_ENABLE_TESTING": true, - "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true, - "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": false + "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true } }, { @@ -287,8 +286,6 @@ "cacheVariables": { "CCCL_ENABLE_C": false, "CCCL_ENABLE_C_EXPERIMENTAL_STF": true, - "CCCL_C_Parallel_ENABLE_TESTING": false, - "CCCL_C_Parallel_ENABLE_HEADER_TESTING": false, "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true } }, diff --git a/c/experimental/stf/test/CMakeLists.txt b/c/experimental/stf/test/CMakeLists.txt index f5613253a81..5776747de79 100644 --- a/c/experimental/stf/test/CMakeLists.txt +++ b/c/experimental/stf/test/CMakeLists.txt @@ -18,13 +18,6 @@ function(cccl_c_experimental_stf_add_test target_name_var source) CCCL::cudax ) - target_compile_definitions(${target_name} PRIVATE - TEST_CUB_PATH="-I${CCCL_SOURCE_DIR}/cub" - TEST_THRUST_PATH="-I${CCCL_SOURCE_DIR}/thrust" - TEST_LIBCUDACXX_PATH="-I${CCCL_SOURCE_DIR}/libcudacxx/include" - TEST_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}" - ) - add_test(NAME ${target_name} COMMAND ${target_name}) endfunction() From 90a8d20661d3b2ba0acc2855a33b1c1ef054a791 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 5 Sep 2025 21:37:55 +0200 Subject: [PATCH 160/221] use more consistent option names --- CMakeLists.txt | 8 ++------ CMakePresets.json | 10 +++++----- c/CMakeLists.txt | 9 +++++++-- python/cuda_cccl/CMakeLists.txt | 2 +- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 034191f0d5c..5467357fabb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,7 @@ option(CCCL_ENABLE_CUB "Enable the CUB developer build." OFF) option(CCCL_ENABLE_THRUST "Enable the Thrust developer build." OFF) option(CCCL_ENABLE_TESTING "Enable CUDA C++ Core Library tests." OFF) option(CCCL_ENABLE_EXAMPLES "Enable CUDA C++ Core Library examples." OFF) -option(CCCL_ENABLE_C "Enable CUDA C Core Library." OFF) +option(CCCL_ENABLE_C_PARALLEL "Enable CUDA C Parallel Library." OFF) option(CCCL_ENABLE_C_EXPERIMENTAL_STF "Enable CUDA C CUDASTF Library." 
OFF) if ("NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") @@ -83,14 +83,10 @@ if (CCCL_ENABLE_UNSTABLE) add_subdirectory(cudax) endif() -if (CCCL_ENABLE_C) +if (CCCL_ENABLE_C_PARALLEL OR CCCL_ENABLE_C_EXPERIMENTAL_STF) add_subdirectory(c) endif() -if (CCCL_ENABLE_C_EXPERIMENTAL_STF) - add_subdirectory(c/experimental/stf) -endif() - if (CCCL_ENABLE_TESTING) diff --git a/CMakePresets.json b/CMakePresets.json index a675873c21d..e4e012c60d2 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -21,7 +21,7 @@ "CCCL_ENABLE_CUDAX": false, "CCCL_ENABLE_TESTING": false, "CCCL_ENABLE_EXAMPLES": false, - "CCCL_ENABLE_C": false, + "CCCL_ENABLE_C_PARALLEL": false, "CCCL_ENABLE_C_EXPERIMENTAL_STF": false, "libcudacxx_ENABLE_INSTALL_RULES": true, "CUB_ENABLE_INSTALL_RULES": true, @@ -64,7 +64,7 @@ "CCCL_ENABLE_TESTING": true, "CCCL_ENABLE_EXAMPLES": true, "CCCL_ENABLE_BENCHMARKS": true, - "CCCL_ENABLE_C": true, + "CCCL_ENABLE_C_PARALLEL": true, "CCCL_ENABLE_C_EXPERIMENTAL_STF": true, "CCCL_IGNORE_DEPRECATED_CPP_DIALECT": true, "LIBCUDACXX_ENABLE_LIBCUDACXX_TESTS": true, @@ -273,7 +273,7 @@ "displayName": "CCCL C Parallel Library", "inherits": "base", "cacheVariables": { - "CCCL_ENABLE_C": true, + "CCCL_ENABLE_C_PARALLEL": true, "CCCL_ENABLE_C_EXPERIMENTAL_STF": false, "CCCL_C_Parallel_ENABLE_TESTING": true, "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true @@ -284,7 +284,7 @@ "displayName": "CCCL C CUDASTF Library", "inherits": "base", "cacheVariables": { - "CCCL_ENABLE_C": false, + "CCCL_ENABLE_C_PARALLEL": false, "CCCL_ENABLE_C_EXPERIMENTAL_STF": true, "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true } @@ -310,7 +310,7 @@ "CCCL_ENABLE_THRUST": false, "CCCL_ENABLE_LIBCUDACXX": false, "CCCL_ENABLE_CUDAX": false, - "CCCL_ENABLE_C": false, + "CCCL_ENABLE_C_PARALLEL": false, "CCCL_ENABLE_TESTING": false, "CCCL_ENABLE_EXAMPLES": false, "CUB_ENABLE_EXAMPLES": false, diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index af1cc1a4234..f0a1826d519 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -1,2 +1,7 @@ -add_subdirectory(parallel) -# add_subdirectory(experimental/stf/) +if (CCCL_ENABLE_C_PARALLEL) + add_subdirectory(parallel) +endif() + +if (CCCL_ENABLE_C_EXPERIMENTAL_STF) + add_subdirectory(experimental/stf) +endif() diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 5ed8aaa9e46..f241d948d22 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -25,7 +25,7 @@ include(${_cccl_root}/cmake/CCCLGetDependencies.cmake) cccl_build_compiler_targets() # Build and install C++ library first -set(CCCL_ENABLE_C ON) +set(CCCL_ENABLE_C_PARALLEL ON) set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) add_subdirectory(${_cccl_root} _parent_cccl) From ac667ca25962f001d2735d2117f9993ca5dfe388 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 09:46:39 +0200 Subject: [PATCH 161/221] Do not use [[maybe_unused]] for the C lib header because this is only available from C23 --- c/experimental/stf/include/cccl/c/experimental/stf/stf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 7e012830408..d4f44cdbf96 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -1064,8 +1064,9 @@ static inline void stf_cuda_kernel_add_desc( const void** args) { CUfunction cufunc; - [[maybe_unused]] cudaError_t 
res = cudaGetFuncBySymbol(&cufunc, func); + cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); assert(res == cudaSuccess); + (void) res; /* suppress unused variable warning in release builds */ stf_cuda_kernel_add_desc_cufunc(k, cufunc, grid_dim_, block_dim_, shared_mem_, arg_cnt, args); } From 5bf62b365d92506b0ffc41f254615578b87d07b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 10:01:45 +0200 Subject: [PATCH 162/221] Return an error code in stf_cuda_kernel_add_desc rather than use assertions --- .../stf/include/cccl/c/experimental/stf/stf.h | 17 ++++++++++------- c/experimental/stf/test/test_cuda_kernel.cu | 3 ++- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index d4f44cdbf96..2c5515aaa0f 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -69,7 +69,6 @@ # error "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this notice." #endif // !CCCL_C_EXPERIMENTAL -#include #include #include @@ -1028,6 +1027,8 @@ void stf_cuda_kernel_add_desc_cufunc( //! \param arg_cnt Number of kernel arguments //! \param args Array of pointers to kernel arguments //! +//! \return cudaSuccess on success, or appropriate cudaError_t on failure +//! //! \pre k must be valid kernel handle //! \pre stf_cuda_kernel_start() must have been called //! \pre func must be valid __global__ function pointer @@ -1046,15 +1047,15 @@ void stf_cuda_kernel_add_desc_cufunc( //! float* d_y = (float*)stf_cuda_kernel_get_arg(kernel, 1); //! const void* args[] = {&alpha, &d_x, &d_y}; //! -//! // Launch kernel -//! stf_cuda_kernel_add_desc(kernel, (void*)axpy, -//! dim3(16), dim3(128), 0, 3, args); +//! // Launch kernel (caller must handle return values != cudaSuccess) +//! cudaError_t err = stf_cuda_kernel_add_desc(kernel, (void*)axpy, +//! dim3(16), dim3(128), 0, 3, args); //! stf_cuda_kernel_end(kernel); //! \endcode //! //! \see stf_cuda_kernel_add_desc_cufunc(), stf_cuda_kernel_get_arg() -static inline void stf_cuda_kernel_add_desc( +static inline cudaError_t stf_cuda_kernel_add_desc( stf_cuda_kernel_handle k, const void* func, dim3 grid_dim_, @@ -1065,10 +1066,12 @@ static inline void stf_cuda_kernel_add_desc( { CUfunction cufunc; cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); - assert(res == cudaSuccess); - (void) res; /* suppress unused variable warning in release builds */ + if (res != cudaSuccess) { + return res; + } stf_cuda_kernel_add_desc_cufunc(k, cufunc, grid_dim_, block_dim_, shared_mem_, arg_cnt, args); + return cudaSuccess; } //! 
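With this change, stf_cuda_kernel_add_desc() reports a failure of cudaGetFuncBySymbol() to the caller instead of asserting, so callers should check the returned cudaError_t. A minimal caller-side sketch (illustrative only, not part of the patch): it assumes a kernel handle `k` whose two dependencies were already declared, that stf_cuda_kernel_start(k) has been called, that <stdio.h> is available for fprintf, and that `axpy(float, float*, float*)` is the __global__ kernel from the header's example.

  float alpha = 2.0f;
  float* d_x = (float*) stf_cuda_kernel_get_arg(k, 0);
  float* d_y = (float*) stf_cuda_kernel_get_arg(k, 1);
  const void* args[] = {&alpha, &d_x, &d_y};

  /* Grid/block sizes follow the header's example; adjust to the real problem size. */
  cudaError_t err = stf_cuda_kernel_add_desc(k, (void*) axpy, dim3(16), dim3(128), 0, 3, args);
  if (err != cudaSuccess)
  {
    fprintf(stderr, "stf_cuda_kernel_add_desc failed: %s\n", cudaGetErrorString(err));
    return err; /* propagate the error rather than asserting */
  }
  stf_cuda_kernel_end(k);

The test below exercises the same path and simply requires err == cudaSuccess.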
diff --git a/c/experimental/stf/test/test_cuda_kernel.cu b/c/experimental/stf/test/test_cuda_kernel.cu index b5ba66b0f3a..12dc178d061 100644 --- a/c/experimental/stf/test/test_cuda_kernel.cu +++ b/c/experimental/stf/test/test_cuda_kernel.cu @@ -70,7 +70,8 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") double* dX = (double*) stf_cuda_kernel_get_arg(k, 0); double* dY = (double*) stf_cuda_kernel_get_arg(k, 1); const void* args[4] = {&N, &alpha, &dX, &dY}; - stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); + cudaError_t err = stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); + REQUIRE(err == cudaSuccess); stf_cuda_kernel_end(k); stf_cuda_kernel_destroy(k); From c0a54f1ab5d19eda56db4bbcc9d82decd586f3a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 10:02:16 +0200 Subject: [PATCH 163/221] clang-format --- c/experimental/stf/include/cccl/c/experimental/stf/stf.h | 3 ++- c/experimental/stf/test/test_cuda_kernel.cu | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 2c5515aaa0f..903b71cd878 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -1066,7 +1066,8 @@ static inline cudaError_t stf_cuda_kernel_add_desc( { CUfunction cufunc; cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); - if (res != cudaSuccess) { + if (res != cudaSuccess) + { return res; } diff --git a/c/experimental/stf/test/test_cuda_kernel.cu b/c/experimental/stf/test/test_cuda_kernel.cu index 12dc178d061..05c0e7e8620 100644 --- a/c/experimental/stf/test/test_cuda_kernel.cu +++ b/c/experimental/stf/test/test_cuda_kernel.cu @@ -70,7 +70,7 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") double* dX = (double*) stf_cuda_kernel_get_arg(k, 0); double* dY = (double*) stf_cuda_kernel_get_arg(k, 1); const void* args[4] = {&N, &alpha, &dX, &dY}; - cudaError_t err = stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); + cudaError_t err = stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); REQUIRE(err == cudaSuccess); stf_cuda_kernel_end(k); stf_cuda_kernel_destroy(k); From af43da5de5187f5373f20172f2fb19c787c2cead Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:03:03 +0200 Subject: [PATCH 164/221] Merge stf_c_lib: Update c/ directory with complete C library implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace c/ directory contents with complete stf_c_lib version - Update stf.cu (297→350 lines) with proper struct definitions and cleaner implementation - Fix Python linting issues (E711, E402, F841) in STF test files - Ensure pre-commit hooks pass Now stf_c_api has both complete C library and Python bindings. 
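As an illustrative usage sketch (not part of this commit), the merged header keeps the typedef'd place structs, so C callers can name stf_exec_place and stf_data_place directly and combine an execution place with an explicit data place on one task; `task` and `ld` below are assumed to come from stf_task_create() and stf_logical_data() as in the header's usage pattern.

  stf_exec_place where = make_device_place(0);  /* run the task on device 0 */
  stf_data_place keep = make_host_data_place(); /* but keep this operand in host memory */

  stf_task_set_exec_place(task, &where);
  stf_task_add_dep_with_dplace(task, ld, STF_READ, &keep);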
--- c/CMakeLists.txt | 9 +- c/experimental/stf/CMakeLists.txt | 14 +- .../stf/include/cccl/c/experimental/stf/stf.h | 1083 +++++++++++++++-- c/experimental/stf/src/stf.cu | 229 ++-- c/experimental/stf/test/CMakeLists.txt | 7 - c/experimental/stf/test/test_cuda_kernel.cu | 3 +- .../cuda/cccl/experimental/stf/decorator.py | 2 +- python/cuda_cccl/tests/stf/test_decorator.py | 2 +- python/cuda_cccl/tests/stf/test_fhe.py | 3 +- .../cuda_cccl/tests/stf/test_fhe_decorator.py | 3 +- python/cuda_cccl/tests/stf/test_numba.py | 2 +- python/cuda_cccl/tests/stf/test_pytorch.py | 6 +- .../tests/stf/test_stencil_decorator.py | 2 +- 13 files changed, 1160 insertions(+), 205 deletions(-) diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index 364494da7a0..f0a1826d519 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -1,2 +1,7 @@ -add_subdirectory(parallel) -add_subdirectory(experimental/stf/) +if (CCCL_ENABLE_C_PARALLEL) + add_subdirectory(parallel) +endif() + +if (CCCL_ENABLE_C_EXPERIMENTAL_STF) + add_subdirectory(experimental/stf) +endif() diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 11599edaec7..b44b0cbbcca 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -27,12 +27,16 @@ if (CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY) ) endif() +find_package(cudax REQUIRED CONFIG + NO_DEFAULT_PATH # Only check the explicit path in HINTS: + HINTS "${CCCL_SOURCE_DIR}/lib/cmake/cudax/" +) + find_package(CUDAToolkit REQUIRED) set_target_properties(cccl.c.experimental.stf PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) +target_compile_definitions(cccl.c.experimental.stf PUBLIC CCCL_C_EXPERIMENTAL=1) target_link_libraries(cccl.c.experimental.stf PRIVATE CUDA::cudart_static - CUDA::nvrtc - CUDA::nvJitLink CUDA::cuda_driver CCCL::cudax ) @@ -43,10 +47,6 @@ target_compile_options(cccl.c.experimental.stf PRIVATE $<$ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +//! \file stf.h +//! \brief CUDA STF (Sequential Task Flow) C Interface +//! +//! \details +//! This header provides a C interface to the CUDA STF C++ library, enabling +//! task-based parallel programming with automatic data movement and dependency management. +//! +//! The Sequential Task Flow programming model involves defining logical data and +//! submitting tasks that operate on this data. STF automatically deduces dependencies +//! between tasks and orchestrates both computation and data movement to ensure +//! efficient execution with maximum concurrency. +//! +//! \par Key Concepts: +//! - **Logical Data**: Abstract handles for data that may exist in multiple locations +//! - **Tasks**: Operations that consume and produce logical data with specified access modes +//! - **Dependencies**: Automatically inferred from data access patterns (RAW, WAR, WAW) +//! - **Execution Places**: Specify where tasks run (CPU, specific GPU devices) +//! - **Data Places**: Specify where data should be located in memory hierarchy +//! +//! \par Basic Usage Pattern: +//! \code +//! // 1. Create STF context +//! stf_ctx_handle ctx; +//! stf_ctx_create(&ctx); +//! 
+//! // 2. Create logical data from arrays +//! float X[1024], Y[1024]; +//! stf_logical_data_handle lX, lY; +//! stf_logical_data(ctx, &lX, X, sizeof(X)); +//! stf_logical_data(ctx, &lY, Y, sizeof(Y)); +//! +//! // 3. Create and configure task +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! stf_task_add_dep(task, lX, STF_READ); // X is read-only +//! stf_task_add_dep(task, lY, STF_RW); // Y is read-write +//! +//! // 4. Execute task +//! stf_task_start(task); +//! CUstream stream = stf_task_get_custream(task); +//! float* x_ptr = (float*)stf_task_get(task, 0); +//! float* y_ptr = (float*)stf_task_get(task, 1); +//! // ... launch CUDA operations using stream ... +//! stf_task_end(task); +//! +//! // 5. Cleanup +//! stf_ctx_finalize(ctx); +//! stf_task_destroy(task); +//! stf_logical_data_destroy(lX); +//! stf_logical_data_destroy(lY); +//! \endcode +//! +//! \warning This API is experimental and subject to change. +//! Define CCCL_C_EXPERIMENTAL to acknowledge this. + +#pragma once + +#ifndef CCCL_C_EXPERIMENTAL +# error "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this notice." +#endif // !CCCL_C_EXPERIMENTAL + #include #include -// TODO use CCCL_C_EXTERN_C_BEGIN/CCCL_C_EXTERN_C_END #ifdef __cplusplus extern "C" { #endif +//! \defgroup AccessMode Data Access Modes +//! \brief Specifies how tasks access logical data +//! \{ + +//! \brief Data access mode for task dependencies +//! +//! Specifies how a task will access logical data, which determines +//! synchronization requirements and concurrency opportunities. typedef enum stf_access_mode { - STF_NONE = 0, - STF_READ = 1 << 0, - STF_WRITE = 1 << 1, - STF_RW = STF_READ | STF_WRITE + STF_NONE = 0, //!< No access (invalid) + STF_READ = 1 << 0, //!< Read-only access - allows concurrent readers + STF_WRITE = 1 << 1, //!< Write-only access - requires exclusive access + STF_RW = STF_READ | STF_WRITE //!< Read-write access - requires exclusive access } stf_access_mode; -struct stf_exec_place_device +//! \} + +//! \defgroup ExecPlace Execution Places +//! \brief Specify where tasks should execute +//! \{ + +//! \brief Device execution place configuration +typedef struct stf_exec_place_device { - int dev_id; -}; + int dev_id; //!< CUDA device ID (0-based) +} stf_exec_place_device; -struct stf_exec_place_host +//! \brief Host execution place configuration +typedef struct stf_exec_place_host { - char dummy; /* dummy to keep it standard C which does not allow empty structs */ -}; + char dummy; //!< Dummy field for standard C compatibility +} stf_exec_place_host; +//! \brief Execution place type discriminator typedef enum stf_exec_place_kind { - STF_EXEC_PLACE_DEVICE, - STF_EXEC_PLACE_HOST + STF_EXEC_PLACE_DEVICE, //!< Task executes on CUDA device + STF_EXEC_PLACE_HOST //!< Task executes on host (CPU) } stf_exec_place_kind; -struct stf_exec_place +//! \brief Execution place specification +//! +//! Tagged union specifying where a task should execute. +//! Use helper functions make_device_place() and make_host_place() to create. 
+typedef struct stf_exec_place { - enum stf_exec_place_kind kind; + enum stf_exec_place_kind kind; //!< Type of execution place union { - struct stf_exec_place_device device; - struct stf_exec_place_host host; - } u; -}; + stf_exec_place_device device; //!< Device configuration (when kind == STF_EXEC_PLACE_DEVICE) + stf_exec_place_host host; //!< Host configuration (when kind == STF_EXEC_PLACE_HOST) + } u; //!< Configuration union +} stf_exec_place; -static inline struct stf_exec_place make_device_place(int dev_id) +//! \brief Create execution place for CUDA device +//! +//! \param dev_id CUDA device index (0-based) +//! \return Execution place configured for specified device +//! +//! \par Example: +//! \code +//! // Execute task on device 1 +//! stf_exec_place place = make_device_place(1); +//! stf_task_set_exec_place(task, &place); +//! \endcode +static inline stf_exec_place make_device_place(int dev_id) { - struct stf_exec_place p; + stf_exec_place p; p.kind = STF_EXEC_PLACE_DEVICE; p.u.device.dev_id = dev_id; return p; } -static inline struct stf_exec_place make_host_place() +//! \brief Create execution place for host (CPU) +//! +//! \return Execution place configured for host execution +//! +//! \par Example: +//! \code +//! // Execute task on host +//! stf_exec_place place = make_host_place(); +//! stf_task_set_exec_place(task, &place); +//! \endcode +static inline stf_exec_place make_host_place() { - struct stf_exec_place p; + stf_exec_place p; p.kind = STF_EXEC_PLACE_HOST; p.u.host.dummy = 0; /* to avoid uninitialized memory warnings */ return p; } -typedef struct stf_exec_place_device stf_exec_place_device; -typedef struct stf_exec_place_host stf_exec_place_host; -typedef union stf_exec_place_u stf_exec_place_u; -typedef struct stf_exec_place stf_exec_place; +//! \} + +//! \defgroup DataPlace Data Places +//! \brief Specify where logical data should be located +//! \{ -struct stf_data_place_device +//! \brief Device data place configuration +typedef struct stf_data_place_device { - int dev_id; -}; + int dev_id; //!< CUDA device ID for data placement +} stf_data_place_device; -struct stf_data_place_host +//! \brief Host data place configuration +typedef struct stf_data_place_host { - char dummy; /* dummy to keep it standard C which does not allow empty structs */ -}; + char dummy; //!< Dummy field for standard C compatibility +} stf_data_place_host; -struct stf_data_place_managed +//! \brief Managed memory data place configuration +typedef struct stf_data_place_managed { - char dummy; /* dummy to keep it standard C which does not allow empty structs */ -}; + char dummy; //!< Dummy field for standard C compatibility +} stf_data_place_managed; -struct stf_data_place_affine +//! \brief Affine data place configuration +//! +//! Affine placement means data follows the execution location automatically. +typedef struct stf_data_place_affine { - char dummy; /* dummy to keep it standard C which does not allow empty structs */ -}; + char dummy; //!< Dummy field for standard C compatibility +} stf_data_place_affine; +//! \brief Data place type discriminator typedef enum stf_data_place_kind { - STF_DATA_PLACE_DEVICE, - STF_DATA_PLACE_HOST, - STF_DATA_PLACE_MANAGED, - STF_DATA_PLACE_AFFINE + STF_DATA_PLACE_DEVICE, //!< Data on specific device memory + STF_DATA_PLACE_HOST, //!< Data on host (CPU) memory + STF_DATA_PLACE_MANAGED, //!< Data in CUDA managed (unified) memory + STF_DATA_PLACE_AFFINE //!< Data follows execution place (default) } stf_data_place_kind; -struct stf_data_place +//! 
\brief Data placement specification +//! +//! Tagged union specifying where logical data should be located. +//! Use helper functions to create (make_device_data_place(), etc.). +typedef struct stf_data_place { - enum stf_data_place_kind kind; + enum stf_data_place_kind kind; //!< Type of data placement union { - struct stf_data_place_device device; - struct stf_data_place_host host; - struct stf_data_place_managed managed; - struct stf_data_place_affine affine; - } u; -}; + stf_data_place_device device; //!< Device placement configuration + stf_data_place_host host; //!< Host placement configuration + stf_data_place_managed managed; //!< Managed memory configuration + stf_data_place_affine affine; //!< Affine placement configuration + } u; //!< Configuration union +} stf_data_place; -static inline struct stf_data_place make_device_data_place(int dev_id) +//! \brief Create data place for specific CUDA device +//! +//! \param dev_id CUDA device index (0-based) +//! \return Data place configured for device memory +//! +//! \par Example: +//! \code +//! // Force data to device 1 even if task runs elsewhere +//! stf_data_place dplace = make_device_data_place(1); +//! stf_task_add_dep_with_dplace(task, data, STF_READ, &dplace); +//! \endcode +static inline stf_data_place make_device_data_place(int dev_id) { - struct stf_data_place p; + stf_data_place p; p.kind = STF_DATA_PLACE_DEVICE; p.u.device.dev_id = dev_id; return p; } +//! \brief Create data place for host memory +//! +//! \return Data place configured for host (CPU) memory +//! +//! \par Example: +//! \code +//! // Keep data on host even for device tasks (sparse access) +//! stf_data_place dplace = make_host_data_place(); +//! stf_task_add_dep_with_dplace(task, data, STF_READ, &dplace); +//! \endcode static inline struct stf_data_place make_host_data_place() { - struct stf_data_place p; + stf_data_place p; p.kind = STF_DATA_PLACE_HOST; p.u.host.dummy = 0; /* to avoid uninitialized memory warnings */ return p; } +//! +//! \brief Create data place for CUDA managed memory +//! +//! \return Data place configured for managed (unified) memory +//! +//! \par Example: +//! \code +//! // Use managed memory for flexible access patterns +//! stf_data_place dplace = make_managed_data_place(); +//! stf_task_add_dep_with_dplace(task, data, STF_RW, &dplace); +//! \endcode + static inline struct stf_data_place make_managed_data_place() { - struct stf_data_place p; + stf_data_place p; p.kind = STF_DATA_PLACE_MANAGED; p.u.managed.dummy = 0; /* to avoid uninitialized memory warnings */ return p; } +//! +//! \brief Create affine data place (follows execution location) +//! +//! \return Data place configured for affine placement (default behavior) +//! +//! \par Example: +//! \code +//! // Explicitly specify default behavior +//! stf_data_place dplace = make_affine_data_place(); +//! stf_task_add_dep_with_dplace(task, data, STF_RW, &dplace); +//! \endcode + static inline struct stf_data_place make_affine_data_place() { - struct stf_data_place p; + stf_data_place p; p.kind = STF_DATA_PLACE_AFFINE; p.u.affine.dummy = 0; /* to avoid uninitialized memory warnings */ return p; } -typedef struct stf_data_place_device stf_data_place_device; -typedef struct stf_data_place_host stf_data_place_host; -typedef struct stf_data_place_managed stf_data_place_managed; -typedef struct stf_data_place_affine stf_data_place_affine; -typedef union stf_data_place_u stf_data_place_u; -typedef struct stf_data_place stf_data_place; +//! \} + +//! \defgroup Handles Opaque Handles +//! 
\brief Opaque handle types for STF objects +//! \{ + +//! +//! \brief Opaque handle for STF context +//! +//! Context stores the state of the STF library and serves as entry point for all API calls. +//! Must be created with stf_ctx_create() or stf_ctx_create_graph() and destroyed with stf_ctx_finalize(). + +typedef void* stf_ctx_handle; + +//! +//! \brief Opaque handle for logical data +//! +//! Represents abstract data that may exist in multiple memory locations. +//! Created with stf_logical_data() or stf_logical_data_empty() and destroyed with stf_logical_data_destroy(). + +typedef void* stf_logical_data_handle; + +//! +//! \brief Opaque handle for task +//! +//! Represents a computational task that operates on logical data. +//! Created with stf_task_create() and destroyed with stf_task_destroy(). + +typedef void* stf_task_handle; + +//! +//! \brief Opaque handle for CUDA kernel task +//! +//! Specialized task optimized for CUDA kernel execution. +//! Created with stf_cuda_kernel_create() and destroyed with stf_cuda_kernel_destroy(). + +typedef void* stf_cuda_kernel_handle; -typedef struct stf_ctx_handle_t* stf_ctx_handle; +//! \} + +//! \defgroup Context Context Management +//! \brief Create, configure, and finalize STF contexts +//! \{ + +//! +//! \brief Create STF context with stream backend +//! +//! Creates a new STF context using the default stream-based backend. +//! Tasks are executed eagerly using CUDA streams and events. +//! +//! \param[out] ctx Pointer to receive context handle +//! +//! \pre ctx must not be NULL +//! \post *ctx contains valid context handle that must be finalized with stf_ctx_finalize() +//! +//! \par Example: +//! \code +//! stf_ctx_handle ctx; +//! stf_ctx_create(&ctx); +//! // ... use context ... +//! stf_ctx_finalize(ctx); +//! \endcode +//! +//! \see stf_ctx_create_graph(), stf_ctx_finalize() void stf_ctx_create(stf_ctx_handle* ctx); -// TODO stf_ctx_create_with_flags and an enum instead ? + +//! +//! \brief Create STF context with graph backend +//! +//! Creates a new STF context using the CUDA graph backend. +//! Tasks are captured into CUDA graphs and launched when needed, +//! potentially providing better performance for repeated patterns. +//! +//! \param[out] ctx Pointer to receive context handle +//! +//! \pre ctx must not be NULL +//! \post *ctx contains valid context handle that must be finalized with stf_ctx_finalize() +//! +//! \note Graph backend has restrictions on stream synchronization within tasks +//! +//! \par Example: +//! \code +//! stf_ctx_handle ctx; +//! stf_ctx_create_graph(&ctx); +//! // ... use context ... +//! stf_ctx_finalize(ctx); +//! \endcode +//! +//! \see stf_ctx_create(), stf_ctx_finalize() + void stf_ctx_create_graph(stf_ctx_handle* ctx); + +//! +//! \brief Finalize STF context +//! +//! Waits for all pending operations to complete, performs write-back +//! of modified data to host, and releases all associated resources. +//! +//! \param ctx Context handle to finalize +//! +//! \pre ctx must be valid context handle +//! \post All pending operations completed, resources released, ctx becomes invalid +//! +//! \note This function blocks until all asynchronous operations complete +//! +//! \par Example: +//! \code +//! stf_ctx_handle ctx; +//! stf_ctx_create(&ctx); +//! // ... submit tasks ... +//! stf_ctx_finalize(ctx); // Blocks until completion +//! \endcode +//! +//! 
\see stf_ctx_create(), stf_ctx_create_graph(), stf_fence() + void stf_ctx_finalize(stf_ctx_handle ctx); -// TODO stf_ctx_set_mode() + define enum with GRAPH, STREAM, ... -// TODO stf_ctx_is_graph() +//! +//! \brief Get synchronization fence for context +//! +//! Returns a CUDA stream that will be signaled when all pending +//! operations in the context complete. Provides non-blocking +//! alternative to stf_ctx_finalize() for synchronization queries. +//! +//! \param ctx Context handle +//! \return CUDA stream for synchronization +//! +//! \pre ctx must be valid context handle +//! +//! \par Example: +//! \code +//! stf_ctx_handle ctx; +//! stf_ctx_create(&ctx); +//! // ... submit tasks ... +//! +//! cudaStream_t fence = stf_fence(ctx); +//! cudaStreamSynchronize(fence); // Wait for completion +//! stf_ctx_finalize(ctx); +//! \endcode +//! +//! \see stf_ctx_finalize() cudaStream_t stf_fence(stf_ctx_handle ctx); -typedef struct stf_logical_data_handle_t* stf_logical_data_handle; +//! \} + +//! \defgroup LogicalData Logical Data Management +//! \brief Create and manage abstract data handles +//! \{ + +//! +//! \brief Create logical data from existing memory buffer +//! +//! Creates logical data handle from an existing host memory buffer. +//! STF takes ownership of data management during task execution. +//! +//! \param ctx Context handle +//! \param[out] ld Pointer to receive logical data handle +//! \param addr Pointer to existing data buffer +//! \param sz Size of data in bytes +//! +//! \pre ctx must be valid context handle +//! \pre ld must not be NULL +//! \pre addr must not be NULL +//! \pre sz must be greater than 0 +//! \post *ld contains valid logical data handle +//! +//! \note Original data pointer should not be accessed during task execution +//! \note Data will be written back when logical data is destroyed or context finalized +//! +//! \par Example: +//! \code +//! float data[1024]; +//! stf_logical_data_handle ld; +//! stf_logical_data(ctx, &ld, data, sizeof(data)); +//! // ... use in tasks ... +//! stf_logical_data_destroy(ld); +//! \endcode +//! +//! \see stf_logical_data_empty(), stf_logical_data_destroy() void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); + +//! +//! \brief Set symbolic name for logical data +//! +//! Associates a human-readable name with logical data for debugging +//! and task graph visualization. +//! +//! \param ld Logical data handle +//! \param symbol Null-terminated string name +//! +//! \pre ld must be valid logical data handle +//! \pre symbol must not be NULL +//! +//! \note Symbol appears in DOT graph output when CUDASTF_DOT_FILE is set +//! +//! \par Example: +//! \code +//! stf_logical_data_handle ld; +//! stf_logical_data(ctx, &ld, data, size); +//! stf_logical_data_set_symbol(ld, "input_matrix"); +//! \endcode +//! +//! \see stf_task_set_symbol() + void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); + +//! +//! \brief Destroy logical data handle +//! +//! Destroys logical data handle and releases associated resources. +//! Triggers write-back to host if data was modified. +//! +//! \param ld Logical data handle to destroy +//! +//! \pre ld must be valid logical data handle +//! \post ld becomes invalid, resources released +//! +//! \note Must be called for every created logical data handle +//! +//! \par Example: +//! \code +//! stf_logical_data_handle ld; +//! stf_logical_data(ctx, &ld, data, size); +//! // ... use in tasks ... +//! 
stf_logical_data_destroy(ld); // Cleanup +//! \endcode +//! +//! \see stf_logical_data(), stf_logical_data_empty() + void stf_logical_data_destroy(stf_logical_data_handle ld); + +//! +//! \brief Create empty logical data (temporary) +//! +//! Creates logical data of specified size without backing host memory. +//! Useful for temporary buffers in multi-stage computations. +//! +//! \param ctx Context handle +//! \param length Size in bytes +//! \param[out] to Pointer to receive logical data handle +//! +//! \pre ctx must be valid context handle +//! \pre length must be greater than 0 +//! \pre to must not be NULL +//! \post *to contains valid logical data handle +//! +//! \note First access must be write-only (STF_WRITE) +//! \note No write-back occurs since there's no host backing +//! +//! \par Example: +//! \code +//! stf_logical_data_handle temp; +//! stf_logical_data_empty(ctx, 1024 * sizeof(float), &temp); +//! +//! // First access must be write-only +//! stf_task_add_dep(task, temp, STF_WRITE); +//! \endcode +//! +//! \see stf_logical_data(), stf_logical_data_destroy() + void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle* to); -// TODO -// void stf_logical_data_wait(stf_logical_data_handle ld); +//! +//! \brief Create synchronization token +//! +//! Creates a logical data handle for synchronization purposes only. +//! Contains no actual data but can be used to enforce execution order. +//! +//! \param ctx Context handle +//! \param[out] ld Pointer to receive token handle +//! +//! \pre ctx must be valid context handle +//! \pre ld must not be NULL +//! \post *ld contains valid token handle +//! +//! \note More efficient than using dummy data for synchronization +//! \note Can be accessed with any access mode +//! +//! \par Example: +//! \code +//! stf_logical_data_handle sync_token; +//! stf_token(ctx, &sync_token); +//! +//! // Task 1 signals completion +//! stf_task_add_dep(task1, sync_token, STF_WRITE); +//! +//! // Task 2 waits for task1 +//! stf_task_add_dep(task2, sync_token, STF_READ); +//! \endcode +//! +//! \see stf_logical_data(), stf_logical_data_destroy() void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld); -typedef struct stf_task_handle_t* stf_task_handle; +//! \} + +//! \defgroup TaskManagement Task Management +//! \brief Create, configure, and execute computational tasks +//! \{ + +//! +//! \brief Create new task +//! +//! Creates a new task within the specified context. Task is created +//! but not configured or executed. Use other stf_task_* functions +//! to configure execution place, add dependencies, and execute. +//! +//! \param ctx Context handle +//! \param[out] t Pointer to receive task handle +//! +//! \pre ctx must be valid context handle +//! \pre t must not be NULL +//! \post *t contains valid task handle +//! +//! \par Example: +//! \code +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! // ... configure task ... +//! stf_task_destroy(task); +//! \endcode +//! +//! \see stf_task_destroy(), stf_task_set_exec_place(), stf_task_add_dep() void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t); -void stf_task_set_exec_place(stf_task_handle t, struct stf_exec_place* exec_p); + +//! +//! \brief Set task execution place +//! +//! Specifies where the task should execute (device or host). +//! If not called, defaults to current device. +//! +//! \param t Task handle +//! \param exec_p Pointer to execution place specification +//! +//! \pre t must be valid task handle +//! 
\pre exec_p must not be NULL +//! \pre Must be called before stf_task_start() +//! +//! \par Example: +//! \code +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! +//! // Execute on device 1 +//! stf_exec_place place = make_device_place(1); +//! stf_task_set_exec_place(task, &place); +//! \endcode +//! +//! \see make_device_place(), make_host_place() + +void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p); + +//! +//! \brief Set symbolic name for task +//! +//! Associates a human-readable name with task for debugging +//! and task graph visualization. +//! +//! \param t Task handle +//! \param symbol Null-terminated string name +//! +//! \pre t must be valid task handle +//! \pre symbol must not be NULL +//! +//! \note Symbol appears in DOT graph output when CUDASTF_DOT_FILE is set +//! +//! \par Example: +//! \code +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! stf_task_set_symbol(task, "matrix_multiply"); +//! \endcode +//! +//! \see stf_logical_data_set_symbol() + void stf_task_set_symbol(stf_task_handle t, const char* symbol); + +//! +//! \brief Add data dependency to task +//! +//! Adds a data dependency with specified access mode. Order of calls +//! determines index for stf_task_get(). Dependencies determine +//! automatic task synchronization. +//! +//! \param t Task handle +//! \param ld Logical data handle +//! \param m Access mode (STF_READ, STF_WRITE, STF_RW) +//! +//! \pre t must be valid task handle +//! \pre ld must be valid logical data handle +//! \pre m must be valid access mode +//! +//! \par Example: +//! \code +//! stf_task_add_dep(task, input_data, STF_READ); // Index 0 +//! stf_task_add_dep(task, output_data, STF_WRITE); // Index 1 +//! stf_task_add_dep(task, temp_data, STF_RW); // Index 2 +//! \endcode +//! +//! \see stf_task_add_dep_with_dplace(), stf_task_get() + void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m); + +//! +//! \brief Add data dependency with explicit data placement +//! +//! Adds data dependency with specified access mode and explicit +//! data placement. Overrides default affine placement. +//! +//! \param t Task handle +//! \param ld Logical data handle +//! \param m Access mode (STF_READ, STF_WRITE, STF_RW) +//! \param data_p Pointer to data place specification +//! +//! \pre t must be valid task handle +//! \pre ld must be valid logical data handle +//! \pre m must be valid access mode +//! \pre data_p must not be NULL +//! +//! \par Example: +//! \code +//! // Force data to device 0 even if task runs elsewhere +//! stf_data_place dplace = make_device_data_place(0); +//! stf_task_add_dep_with_dplace(task, ld, STF_READ, &dplace); +//! \endcode +//! +//! \see stf_task_add_dep(), make_device_data_place(), make_host_data_place() + void stf_task_add_dep_with_dplace( - stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, struct stf_data_place* data_p); + stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, stf_data_place* data_p); + +//! +//! \brief Begin task execution +//! +//! Starts task execution. After this call, use stf_task_get_custream() +//! and stf_task_get() to access CUDA stream and data pointers. +//! +//! \param t Task handle +//! +//! \pre t must be valid task handle +//! \pre Task dependencies must already be configured +//! \post Task is executing, stream and data available +//! +//! \par Example: +//! \code +//! // Configure task first +//! stf_task_add_dep(task, data, STF_RW); +//! +//! // Start execution +//! 
stf_task_start(task); +//! +//! // Now can access stream and data +//! CUstream stream = stf_task_get_custream(task); +//! float* ptr = (float*)stf_task_get(task, 0); +//! \endcode +//! +//! \see stf_task_end(), stf_task_get_custream(), stf_task_get() + void stf_task_start(stf_task_handle t); + +//! +//! \brief End task execution +//! +//! Ends task execution. Call after all CUDA operations are +//! submitted to the task stream. +//! +//! \param t Task handle +//! +//! \pre t must be valid task handle +//! \pre stf_task_start() must have been called +//! \post Task execution ended, may continue asynchronously +//! +//! \par Example: +//! \code +//! stf_task_start(task); +//! CUstream stream = stf_task_get_custream(task); +//! +//! // Launch operations +//! my_kernel<<>>(args...); +//! +//! stf_task_end(task); // Operations may still be running +//! \endcode +//! +//! \see stf_task_start() + void stf_task_end(stf_task_handle t); + +//! +//! \brief Get CUDA stream for task +//! +//! Returns CUDA stream associated with the task. All CUDA operations +//! within task must use this stream for proper synchronization. +//! +//! \param t Task handle +//! \return CUDA stream for launching operations +//! +//! \pre t must be valid task handle +//! \pre stf_task_start() must have been called +//! +//! \par Example: +//! \code +//! stf_task_start(task); +//! CUstream stream = stf_task_get_custream(task); +//! +//! // Launch kernel using this stream +//! my_kernel<<>>(args...); +//! \endcode +//! +//! \see stf_task_start(), stf_task_get() + CUstream stf_task_get_custream(stf_task_handle t); + +//! +//! \brief Get data pointer for task dependency +//! +//! Returns pointer to logical data instance for specified dependency. +//! Index corresponds to order of stf_task_add_dep() calls. +//! +//! \param t Task handle +//! \param submitted_index Dependency index (0-based) +//! \return Pointer to data (cast to appropriate type) +//! +//! \pre t must be valid task handle +//! \pre stf_task_start() must have been called +//! \pre submitted_index must be valid dependency index +//! \post Pointer valid until stf_task_end() +//! +//! \par Example: +//! \code +//! // Dependencies added in this order: +//! stf_task_add_dep(task, input, STF_READ); // Index 0 +//! stf_task_add_dep(task, output, STF_WRITE); // Index 1 +//! +//! stf_task_start(task); +//! +//! // Get data pointers +//! const float* in = (const float*)stf_task_get(task, 0); +//! float* out = (float*)stf_task_get(task, 1); +//! \endcode +//! +//! \see stf_task_add_dep(), stf_task_start() + void* stf_task_get(stf_task_handle t, int submitted_index); + +//! +//! \brief Destroy task handle +//! +//! Destroys task handle and releases associated resources. +//! Task should be completed before destruction. +//! +//! \param t Task handle to destroy +//! +//! \pre t must be valid task handle +//! \post t becomes invalid, resources released +//! +//! \note Must be called for every created task +//! +//! \par Example: +//! \code +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! // ... configure and execute task ... +//! stf_task_destroy(task); +//! \endcode +//! +//! \see stf_task_create() + void stf_task_destroy(stf_task_handle t); + +//! +//! \brief Enable graph capture for task (advanced) +//! +//! Enables graph capture optimization for the task. +//! Advanced feature typically not needed for basic usage. +//! +//! \param t Task handle +//! +//! \pre t must be valid task handle +//! +//! 
\note Used internally for CUDA graph backend optimization + void stf_task_enable_capture(stf_task_handle t); -typedef struct stf_cuda_kernel_handle_t* stf_cuda_kernel_handle; +//! \} + +//! \defgroup CUDAKernel CUDA Kernel Interface +//! \brief Optimized interface for CUDA kernel execution +//! \{ + +//! +//! \brief Create CUDA kernel task +//! +//! Creates a specialized task optimized for CUDA kernel execution. +//! More efficient than generic tasks for repeated kernel launches, +//! especially with CUDA graph backend. +//! +//! \param ctx Context handle +//! \param[out] k Pointer to receive kernel handle +//! +//! \pre ctx must be valid context handle +//! \pre k must not be NULL +//! \post *k contains valid kernel handle +//! +//! \par Example: +//! \code +//! stf_cuda_kernel_handle kernel; +//! stf_cuda_kernel_create(ctx, &kernel); +//! // ... configure kernel ... +//! stf_cuda_kernel_destroy(kernel); +//! \endcode +//! +//! \see stf_cuda_kernel_destroy(), stf_task_create() void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k); -void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, struct stf_exec_place* exec_p); + +//! +//! \brief Set kernel execution place +//! +//! Specifies where the CUDA kernel should execute. +//! +//! \param k Kernel handle +//! \param exec_p Pointer to execution place specification +//! +//! \pre k must be valid kernel handle +//! \pre exec_p must not be NULL +//! +//! \see make_device_place(), stf_task_set_exec_place() + +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p); + +//! +//! \brief Set symbolic name for kernel +//! +//! Associates human-readable name with kernel for debugging. +//! +//! \param k Kernel handle +//! \param symbol Null-terminated string name +//! +//! \pre k must be valid kernel handle +//! \pre symbol must not be NULL +//! +//! \see stf_task_set_symbol(), stf_logical_data_set_symbol() + void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol); + +//! +//! \brief Add data dependency to kernel +//! +//! Adds data dependency with specified access mode for kernel execution. +//! +//! \param k Kernel handle +//! \param ld Logical data handle +//! \param m Access mode (STF_READ, STF_WRITE, STF_RW) +//! +//! \pre k must be valid kernel handle +//! \pre ld must be valid logical data handle +//! \pre m must be valid access mode +//! +//! \see stf_task_add_dep() + void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); + +//! +//! \brief Start kernel execution +//! +//! Begins kernel execution phase. After this, add kernel descriptions +//! with stf_cuda_kernel_add_desc(). +//! +//! \param k Kernel handle +//! +//! \pre k must be valid kernel handle +//! \pre Dependencies must already be configured +//! +//! \see stf_cuda_kernel_add_desc(), stf_cuda_kernel_end() + void stf_cuda_kernel_start(stf_cuda_kernel_handle k); +//! +//! \brief Add CUDA kernel launch description (driver API) +//! +//! Adds kernel launch specification using CUDA driver API function handle. +//! This is the low-level interface used internally. +//! +//! \param k Kernel handle +//! \param cufunc CUDA driver API function handle +//! \param grid_dim_ CUDA grid dimensions +//! \param block_dim_ CUDA block dimensions +//! \param shared_mem_ Shared memory size in bytes +//! \param arg_cnt Number of kernel arguments +//! \param args Array of pointers to kernel arguments +//! +//! \pre k must be valid kernel handle +//! 
\pre stf_cuda_kernel_start() must have been called +//! \pre cufunc must be valid CUfunction +//! \pre args must contain arg_cnt valid argument pointers +//! +//! \see stf_cuda_kernel_add_desc() + void stf_cuda_kernel_add_desc_cufunc( stf_cuda_kernel_handle k, CUfunction cufunc, - dim3 gridDim_, - dim3 blockDim_, - size_t sharedMem_, + dim3 grid_dim_, + dim3 block_dim_, + size_t shared_mem_, int arg_cnt, const void** args); -/* Convert CUDA kernel address to CUfunction because we may use them from a - * shared library where this would be invalid in the runtime API. */ -static inline void stf_cuda_kernel_add_desc( +//! +//! \brief Add CUDA kernel launch description +//! +//! Adds kernel launch specification using runtime API function pointer. +//! Automatically converts to driver API internally. +//! +//! \param k Kernel handle +//! \param func Pointer to __global__ function +//! \param grid_dim_ CUDA grid dimensions +//! \param block_dim_ CUDA block dimensions +//! \param shared_mem_ Shared memory size in bytes +//! \param arg_cnt Number of kernel arguments +//! \param args Array of pointers to kernel arguments +//! +//! \return cudaSuccess on success, or appropriate cudaError_t on failure +//! +//! \pre k must be valid kernel handle +//! \pre stf_cuda_kernel_start() must have been called +//! \pre func must be valid __global__ function pointer +//! \pre args must contain arg_cnt valid argument pointers +//! +//! \note Converts function pointer to CUfunction automatically +//! +//! \par Example: +//! \code +//! // Kernel: __global__ void axpy(float alpha, float* x, float* y) +//! stf_cuda_kernel_start(kernel); +//! +//! // Prepare arguments +//! float alpha = 2.0f; +//! float* d_x = (float*)stf_cuda_kernel_get_arg(kernel, 0); +//! float* d_y = (float*)stf_cuda_kernel_get_arg(kernel, 1); +//! const void* args[] = {&alpha, &d_x, &d_y}; +//! +//! // Launch kernel (caller must handle return values != cudaSuccess) +//! cudaError_t err = stf_cuda_kernel_add_desc(kernel, (void*)axpy, +//! dim3(16), dim3(128), 0, 3, args); +//! stf_cuda_kernel_end(kernel); +//! \endcode +//! +//! \see stf_cuda_kernel_add_desc_cufunc(), stf_cuda_kernel_get_arg() + +static inline cudaError_t stf_cuda_kernel_add_desc( stf_cuda_kernel_handle k, const void* func, - dim3 gridDim_, - dim3 blockDim_, - size_t sharedMem_, + dim3 grid_dim_, + dim3 block_dim_, + size_t shared_mem_, int arg_cnt, const void** args) { CUfunction cufunc; - [[maybe_unused]] cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); - assert(res == cudaSuccess); + cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); + if (res != cudaSuccess) + { + return res; + } - stf_cuda_kernel_add_desc_cufunc(k, cufunc, gridDim_, blockDim_, sharedMem_, arg_cnt, args); + stf_cuda_kernel_add_desc_cufunc(k, cufunc, grid_dim_, block_dim_, shared_mem_, arg_cnt, args); + return cudaSuccess; } +//! +//! \brief Get kernel argument data pointer +//! +//! Returns pointer to logical data for use as kernel argument. +//! Index corresponds to order of stf_cuda_kernel_add_dep() calls. +//! +//! \param k Kernel handle +//! \param index Dependency index (0-based) +//! \return Pointer to data for kernel argument +//! +//! \pre k must be valid kernel handle +//! \pre stf_cuda_kernel_start() must have been called +//! \pre index must be valid dependency index +//! +//! \see stf_cuda_kernel_add_desc(), stf_task_get() + void* stf_cuda_kernel_get_arg(stf_cuda_kernel_handle k, int index); + +//! +//! \brief End kernel execution +//! +//! Ends kernel execution phase. 
Call after all kernel descriptions +//! are added with stf_cuda_kernel_add_desc(). +//! +//! \param k Kernel handle +//! +//! \pre k must be valid kernel handle +//! \pre stf_cuda_kernel_start() must have been called +//! +//! \see stf_cuda_kernel_start() + void stf_cuda_kernel_end(stf_cuda_kernel_handle k); -void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t); + +//! +//! \brief Destroy kernel handle +//! +//! Destroys kernel handle and releases associated resources. +//! +//! \param k Kernel handle to destroy +//! +//! \pre k must be valid kernel handle +//! \post k becomes invalid, resources released +//! +//! \note Must be called for every created kernel +//! +//! \see stf_cuda_kernel_create() + +void stf_cuda_kernel_destroy(stf_cuda_kernel_handle k); + +//! \} #ifdef __cplusplus } diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 0a92d86b677..14a03de7f5a 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + #include // #include #include @@ -6,69 +16,60 @@ using namespace cuda::experimental::stf; extern "C" { -struct stf_ctx_handle_t -{ - context ctx; -}; - -struct stf_logical_data_handle_t -{ - // XXX should we always store a logical_data> instead ? - logical_data_untyped ld; -}; - -struct stf_task_handle_t -{ - context::unified_task<> t; -}; - void stf_ctx_create(stf_ctx_handle* ctx) { assert(ctx); - *ctx = new stf_ctx_handle_t{context{}}; + *ctx = new context{}; } void stf_ctx_create_graph(stf_ctx_handle* ctx) { assert(ctx); - *ctx = new stf_ctx_handle_t{context{graph_ctx()}}; + *ctx = new context{graph_ctx()}; } void stf_ctx_finalize(stf_ctx_handle ctx) { - ctx->ctx.finalize(); assert(ctx); - delete ctx; + auto* context_ptr = static_cast(ctx); + context_ptr->finalize(); + delete context_ptr; } cudaStream_t stf_fence(stf_ctx_handle ctx) { assert(ctx); - return ctx->ctx.fence(); + auto* context_ptr = static_cast(ctx); + return context_ptr->fence(); } void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) { - assert(ld); assert(ctx); + assert(ld); - // Create a slice logical data - auto ld_typed = ctx->ctx.logical_data(make_slice((char*) addr, sz)); + auto* context_ptr = static_cast(ctx); + auto ld_typed = context_ptr->logical_data(make_slice((char*) addr, sz)); - // Stored in its untyped version - *ld = new stf_logical_data_handle_t{ld_typed}; + // Store the logical_data_untyped directly as opaque pointer + *ld = new logical_data_untyped{ld_typed}; } void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) { assert(ld); - ld->ld.set_symbol(symbol); + assert(symbol); + + auto* ld_ptr = static_cast(ld); + ld_ptr->set_symbol(symbol); } void stf_logical_data_destroy(stf_logical_data_handle ld) { assert(ld); - delete ld; + + auto* ld_ptr = static_cast(ld); + delete ld_ptr; } void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle* to) @@ -76,8 +77,9 @@ void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_ 
assert(ctx); assert(to); - auto ld_typed = ctx->ctx.logical_data(shape_of>(length)); - *to = new stf_logical_data_handle_t{ld_typed}; + auto* context_ptr = static_cast(ctx); + auto ld_typed = context_ptr->logical_data(shape_of>(length)); + *to = new logical_data_untyped{ld_typed}; } void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) @@ -85,57 +87,70 @@ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) assert(ctx); assert(ld); - *ld = new stf_logical_data_handle_t{ctx->ctx.token()}; + auto* context_ptr = static_cast(ctx); + *ld = new logical_data_untyped{context_ptr->token()}; } /* Convert the C-API stf_exec_place to a C++ exec_place object */ -exec_place to_exec_place(struct stf_exec_place* exec_p) +exec_place to_exec_place(stf_exec_place* exec_p) { - if (exec_p->kind == STF_EXEC_PLACE_HOST) + assert(exec_p); + + switch (exec_p->kind) { - return exec_place::host(); - } + case STF_EXEC_PLACE_HOST: + return exec_place::host(); - assert(exec_p->kind == STF_EXEC_PLACE_DEVICE); - return exec_place::device(exec_p->u.device.dev_id); + case STF_EXEC_PLACE_DEVICE: + return exec_place::device(exec_p->u.device.dev_id); + + default: + assert(false && "Invalid execution place kind"); + return exec_place{}; // invalid exec_place + } } /* Convert the C-API stf_data_place to a C++ data_place object */ -data_place to_data_place(struct stf_data_place* data_p) +data_place to_data_place(stf_data_place* data_p) { assert(data_p); - if (data_p->kind == STF_DATA_PLACE_HOST) + switch (data_p->kind) { - return data_place::host(); - } + case STF_DATA_PLACE_HOST: + return data_place::host(); - if (data_p->kind == STF_DATA_PLACE_MANAGED) - { - return data_place::managed(); - } + case STF_DATA_PLACE_MANAGED: + return data_place::managed(); - if (data_p->kind == STF_DATA_PLACE_AFFINE) - { - return data_place::affine(); - } + case STF_DATA_PLACE_AFFINE: + return data_place::affine(); - assert(data_p->kind == STF_DATA_PLACE_DEVICE); - return data_place::device(data_p->u.device.dev_id); + case STF_DATA_PLACE_DEVICE: + return data_place::device(data_p->u.device.dev_id); + + default: + assert(false && "Invalid data place kind"); + return data_place::invalid(); // invalid data_place + } } void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) { - assert(t); assert(ctx); + assert(t); - *t = new stf_task_handle_t{ctx->ctx.task()}; + auto* context_ptr = static_cast(ctx); + *t = new context::unified_task<>{context_ptr->task()}; } -void stf_task_set_exec_place(stf_task_handle t, struct stf_exec_place* exec_p) +void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p) { assert(t); - t->t.set_exec_place(to_exec_place(exec_p)); + assert(exec_p); + + auto* task_ptr = static_cast*>(t); + task_ptr->set_exec_place(to_exec_place(exec_p)); } void stf_task_set_symbol(stf_task_handle t, const char* symbol) @@ -143,7 +158,8 @@ void stf_task_set_symbol(stf_task_handle t, const char* symbol) assert(t); assert(symbol); - t->t.set_symbol(symbol); + auto* task_ptr = static_cast*>(t); + task_ptr->set_symbol(symbol); } void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m) @@ -151,54 +167,70 @@ void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_ assert(t); assert(ld); - t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m))); + auto* task_ptr = static_cast*>(t); + auto* ld_ptr = static_cast(ld); + task_ptr->add_deps(task_dep_untyped(*ld_ptr, access_mode(m))); } void stf_task_add_dep_with_dplace( - stf_task_handle t, stf_logical_data_handle 
ld, stf_access_mode m, struct stf_data_place* data_p) + stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, stf_data_place* data_p) { assert(t); assert(ld); assert(data_p); - t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m), to_data_place(data_p))); + auto* task_ptr = static_cast*>(t); + auto* ld_ptr = static_cast(ld); + task_ptr->add_deps(task_dep_untyped(*ld_ptr, access_mode(m), to_data_place(data_p))); } void* stf_task_get(stf_task_handle t, int index) { assert(t); - auto s = t->t.template get>(index); + + auto* task_ptr = static_cast*>(t); + auto s = task_ptr->template get>(index); return (void*) s.data_handle(); } void stf_task_start(stf_task_handle t) { assert(t); - t->t.start(); + + auto* task_ptr = static_cast*>(t); + task_ptr->start(); } void stf_task_end(stf_task_handle t) { assert(t); - t->t.end(); + + auto* task_ptr = static_cast*>(t); + task_ptr->end(); } void stf_task_enable_capture(stf_task_handle t) { assert(t); - t->t.enable_capture(); + + auto* task_ptr = static_cast*>(t); + task_ptr->enable_capture(); } CUstream stf_task_get_custream(stf_task_handle t) { assert(t); - return (CUstream) t->t.get_stream(); + + auto* task_ptr = static_cast*>(t); + return (CUstream) task_ptr->get_stream(); } void stf_task_destroy(stf_task_handle t) { assert(t); - delete t; + + auto* task_ptr = static_cast*>(t); + delete task_ptr; } /** @@ -218,25 +250,24 @@ void stf_task_destroy(stf_task_handle t) * */ -struct stf_cuda_kernel_handle_t -{ - // return type of ctx.cuda_kernel() - using kernel_type = decltype(::std::declval().cuda_kernel()); - kernel_type k; -}; - void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k) { - assert(k); assert(ctx); + assert(k); - *k = new stf_cuda_kernel_handle_t{ctx->ctx.cuda_kernel()}; + auto* context_ptr = static_cast(ctx); + using kernel_type = decltype(context_ptr->cuda_kernel()); + *k = new kernel_type{context_ptr->cuda_kernel()}; } -void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, struct stf_exec_place* exec_p) +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p) { assert(k); - k->k.set_exec_place(to_exec_place(exec_p)); + assert(exec_p); + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + kernel_ptr->set_exec_place(to_exec_place(exec_p)); } void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol) @@ -244,7 +275,9 @@ void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol) assert(k); assert(symbol); - k->k.set_symbol(symbol); + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + kernel_ptr->set_symbol(symbol); } void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m) @@ -252,46 +285,66 @@ void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle l assert(k); assert(ld); - k->k.add_deps(task_dep_untyped(ld->ld, access_mode(m))); + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + auto* ld_ptr = static_cast(ld); + kernel_ptr->add_deps(task_dep_untyped(*ld_ptr, access_mode(m))); } void stf_cuda_kernel_start(stf_cuda_kernel_handle k) { assert(k); - k->k.start(); + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + kernel_ptr->start(); } void stf_cuda_kernel_add_desc_cufunc( stf_cuda_kernel_handle k, CUfunction cufunc, - dim3 gridDim_, - dim3 blockDim_, - size_t 
sharedMem_, + dim3 grid_dim_, + dim3 block_dim_, + size_t shared_mem_, int arg_cnt, const void** args) { - cuda_kernel_desc desc; - desc.configure_raw(cufunc, gridDim_, blockDim_, sharedMem_, arg_cnt, args); + assert(k); - k->k.add_kernel_desc(mv(desc)); + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + + cuda_kernel_desc desc; + desc.configure_raw(cufunc, grid_dim_, block_dim_, shared_mem_, arg_cnt, args); + kernel_ptr->add_kernel_desc(mv(desc)); } void* stf_cuda_kernel_get_arg(stf_cuda_kernel_handle k, int index) { - auto s = k->k.template get>(index); + assert(k); + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + auto s = kernel_ptr->template get>(index); return (void*) s.data_handle(); } void stf_cuda_kernel_end(stf_cuda_kernel_handle k) { assert(k); - k->k.end(); + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + kernel_ptr->end(); } void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t) { assert(t); - delete t; + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(t); + delete kernel_ptr; } } // extern "C" diff --git a/c/experimental/stf/test/CMakeLists.txt b/c/experimental/stf/test/CMakeLists.txt index f5613253a81..5776747de79 100644 --- a/c/experimental/stf/test/CMakeLists.txt +++ b/c/experimental/stf/test/CMakeLists.txt @@ -18,13 +18,6 @@ function(cccl_c_experimental_stf_add_test target_name_var source) CCCL::cudax ) - target_compile_definitions(${target_name} PRIVATE - TEST_CUB_PATH="-I${CCCL_SOURCE_DIR}/cub" - TEST_THRUST_PATH="-I${CCCL_SOURCE_DIR}/thrust" - TEST_LIBCUDACXX_PATH="-I${CCCL_SOURCE_DIR}/libcudacxx/include" - TEST_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}" - ) - add_test(NAME ${target_name} COMMAND ${target_name}) endfunction() diff --git a/c/experimental/stf/test/test_cuda_kernel.cu b/c/experimental/stf/test/test_cuda_kernel.cu index b5ba66b0f3a..05c0e7e8620 100644 --- a/c/experimental/stf/test/test_cuda_kernel.cu +++ b/c/experimental/stf/test/test_cuda_kernel.cu @@ -70,7 +70,8 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") double* dX = (double*) stf_cuda_kernel_get_arg(k, 0); double* dY = (double*) stf_cuda_kernel_get_arg(k, 1); const void* args[4] = {&N, &alpha, &dX, &dY}; - stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); + cudaError_t err = stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); + REQUIRE(err == cudaSuccess); stf_cuda_kernel_end(k); stf_cuda_kernel_destroy(k); diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index c7179d2a6fc..9f3e3925afb 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -59,7 +59,7 @@ def __call__(self, *args, **kwargs): for i, a in enumerate(args): # print(f"got one arg {a} is dep ? 
{isinstance(a, dep)}") if isinstance(a, dep): - if ctx == None: + if ctx is None: ld = a.get_ld() # This context will be used in the __call__ method itself # so we can create a temporary object from the handle diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 49605ced878..58fbb5e0f78 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -6,7 +6,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf +import cuda.cccl.experimental.stf as cudastf # noqa: E402 @cudastf.jit diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index e2d38308341..acb2e2ec3cb 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -10,7 +10,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf +import cuda.cccl.experimental.stf as cudastf # noqa: E402 class Plaintext: @@ -36,7 +36,6 @@ def print_values(self): with ctx.task( cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) ) as t: - nb_stream = cuda.external_stream(t.stream_ptr()) hvalues = t.numba_arguments() print([v for v in hvalues]) diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index 8adbf5454ed..ad23b57a4c4 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -10,7 +10,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf +import cuda.cccl.experimental.stf as cudastf # noqa: E402 class Plaintext: @@ -36,7 +36,6 @@ def print_values(self): with ctx.task( cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) ) as t: - nb_stream = cuda.external_stream(t.stream_ptr()) hvalues = t.numba_arguments() print([v for v in hvalues]) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index a992d26d7d2..7b9050c9694 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -11,7 +11,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -from cuda.cccl.experimental.stf._stf_bindings import ( +from cuda.cccl.experimental.stf._stf_bindings import ( # noqa: E402 context, data_place, exec_place, diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index 8c1349b89e5..b604558695e 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -12,7 +12,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -from cuda.cccl.experimental.stf._stf_bindings import ( +from cuda.cccl.experimental.stf._stf_bindings import ( # noqa: E402 context, rw, ) @@ -47,14 +47,12 @@ def test_pytorch(): torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), ): tX, tY = t.tensor_arguments() - tZ = tX * 4 + 1 with ( ctx.task(lY.read(), lZ.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), ): - tX, tZ = t.tensor_arguments() - tZ = tY * 2 - 3 + tX, _ = t.tensor_arguments() ctx.finalize() diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index 
8e52a72f00a..b1e4de75213 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -5,7 +5,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf +import cuda.cccl.experimental.stf as cudastf # noqa: E402 @cudastf.jit From c00c915f85c894e9216693c322afa8ca032f6d5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:05:29 +0200 Subject: [PATCH 165/221] Revert Python linting changes Keep only the C library updates from stf_c_lib merge. Revert all Python file modifications to maintain original code. --- python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py | 2 +- python/cuda_cccl/tests/stf/test_decorator.py | 2 +- python/cuda_cccl/tests/stf/test_fhe.py | 3 ++- python/cuda_cccl/tests/stf/test_fhe_decorator.py | 3 ++- python/cuda_cccl/tests/stf/test_numba.py | 2 +- python/cuda_cccl/tests/stf/test_pytorch.py | 6 ++++-- python/cuda_cccl/tests/stf/test_stencil_decorator.py | 2 +- 7 files changed, 12 insertions(+), 8 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index 9f3e3925afb..c7179d2a6fc 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -59,7 +59,7 @@ def __call__(self, *args, **kwargs): for i, a in enumerate(args): # print(f"got one arg {a} is dep ? {isinstance(a, dep)}") if isinstance(a, dep): - if ctx is None: + if ctx == None: ld = a.get_ld() # This context will be used in the __call__ method itself # so we can create a temporary object from the handle diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 58fbb5e0f78..49605ced878 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -6,7 +6,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf # noqa: E402 +import cuda.cccl.experimental.stf as cudastf @cudastf.jit diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index acb2e2ec3cb..e2d38308341 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -10,7 +10,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf # noqa: E402 +import cuda.cccl.experimental.stf as cudastf class Plaintext: @@ -36,6 +36,7 @@ def print_values(self): with ctx.task( cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) ) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) hvalues = t.numba_arguments() print([v for v in hvalues]) diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index ad23b57a4c4..8adbf5454ed 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -10,7 +10,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf # noqa: E402 +import cuda.cccl.experimental.stf as cudastf class Plaintext: @@ -36,6 +36,7 @@ def print_values(self): with ctx.task( cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) ) as t: + nb_stream = 
cuda.external_stream(t.stream_ptr()) hvalues = t.numba_arguments() print([v for v in hvalues]) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 7b9050c9694..a992d26d7d2 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -11,7 +11,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -from cuda.cccl.experimental.stf._stf_bindings import ( # noqa: E402 +from cuda.cccl.experimental.stf._stf_bindings import ( context, data_place, exec_place, diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index b604558695e..8c1349b89e5 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -12,7 +12,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -from cuda.cccl.experimental.stf._stf_bindings import ( # noqa: E402 +from cuda.cccl.experimental.stf._stf_bindings import ( context, rw, ) @@ -47,12 +47,14 @@ def test_pytorch(): torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), ): tX, tY = t.tensor_arguments() + tZ = tX * 4 + 1 with ( ctx.task(lY.read(), lZ.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), ): - tX, _ = t.tensor_arguments() + tX, tZ = t.tensor_arguments() + tZ = tY * 2 - 3 ctx.finalize() diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index b1e4de75213..8e52a72f00a 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -5,7 +5,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf # noqa: E402 +import cuda.cccl.experimental.stf as cudastf @cudastf.jit From cdd0d85d489222e2cc90bd4239bbbf532a3f2f27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:07:52 +0200 Subject: [PATCH 166/221] Fix Python CMakeLists.txt: Update C library feature flags - Replace CCCL_ENABLE_C with CCCL_ENABLE_C_PARALLEL and CCCL_ENABLE_C_EXPERIMENTAL_STF - Align with updated C library configuration from stf_c_lib - Fixes CMake target resolution for cccl.c.parallel and cccl.c.experimental.stf --- python/cuda_cccl/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 0b3f99edc54..31e32d659c4 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -25,7 +25,8 @@ include(${_cccl_root}/cmake/CCCLGetDependencies.cmake) cccl_build_compiler_targets() # Build and install C++ library first -set(CCCL_ENABLE_C ON) +set(CCCL_ENABLE_C_PARALLEL ON) +set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) set(CCCL_ENABLE_UNSTABLE ON) set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) From afda29fb445faac008e4f7b30ad3b8228e5ec9c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:09:29 +0200 Subject: [PATCH 167/221] Fix Python build: Add missing CCCL_ENABLE_C master flag The c/ directory is only included if CCCL_ENABLE_C is ON. Added master flag alongside specific C library feature flags. 
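The reasoning in this message hinges on the top-level gate: at this point in the series the root CMakeLists.txt only descends into c/ when the master flag is set, roughly as in this simplified sketch (see the later hunks for the exact form):

# Simplified sketch of the root-level gate at this point in the series
if (CCCL_ENABLE_C)
  add_subdirectory(c)   # c/ then checks CCCL_ENABLE_C_PARALLEL / CCCL_ENABLE_C_EXPERIMENTAL_STF
endif()

The later patches in this range drop the master flag again once the root gate is reworked to test the two specific flags directly.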
--- python/cuda_cccl/CMakeLists.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 31e32d659c4..1daf9197976 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -25,9 +25,10 @@ include(${_cccl_root}/cmake/CCCLGetDependencies.cmake) cccl_build_compiler_targets() # Build and install C++ library first -set(CCCL_ENABLE_C_PARALLEL ON) -set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) -set(CCCL_ENABLE_UNSTABLE ON) +set(CCCL_ENABLE_C ON) # Master flag to enable c/ directory +set(CCCL_ENABLE_C_PARALLEL ON) # Enable C parallel library +set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) # Enable C experimental STF library +set(CCCL_ENABLE_UNSTABLE ON) # Enable unstable features set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) add_subdirectory(${_cccl_root} _parent_cccl) From 4f1f079a3830472c568ee335d1fa3e970c7f1ff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:10:07 +0200 Subject: [PATCH 168/221] Complete STF C library configuration: Enable all C library features and tests - Added CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING for STF tests - Added CCCL_C_PARALLEL_ENABLE_TESTING for parallel tests - Full C library support now enabled for Python builds --- python/cuda_cccl/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 1daf9197976..14f643eee74 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -27,7 +27,9 @@ cccl_build_compiler_targets() # Build and install C++ library first set(CCCL_ENABLE_C ON) # Master flag to enable c/ directory set(CCCL_ENABLE_C_PARALLEL ON) # Enable C parallel library -set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) # Enable C experimental STF library +set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) # Enable C experimental STF library +set(CCCL_C_PARALLEL_ENABLE_TESTING ON) # Enable parallel C tests +set(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING ON) # Enable STF C tests set(CCCL_ENABLE_UNSTABLE ON) # Enable unstable features set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) From ccfc41d35c079e1a2ee20ffa9557ed03e546510f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:12:30 +0200 Subject: [PATCH 169/221] Remove obsolete CCCL_ENABLE_C flag CCCL_ENABLE_C was replaced by specific flags in stf_c_lib: - c/ directory is now included if CCCL_ENABLE_C_PARALLEL OR CCCL_ENABLE_C_EXPERIMENTAL_STF - No master CCCL_ENABLE_C flag needed anymore --- python/cuda_cccl/CMakeLists.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 14f643eee74..1481430c112 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -25,9 +25,8 @@ include(${_cccl_root}/cmake/CCCLGetDependencies.cmake) cccl_build_compiler_targets() # Build and install C++ library first -set(CCCL_ENABLE_C ON) # Master flag to enable c/ directory -set(CCCL_ENABLE_C_PARALLEL ON) # Enable C parallel library -set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) # Enable C experimental STF library +set(CCCL_ENABLE_C_PARALLEL ON) # Enable C parallel library (triggers c/ directory) +set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) # Enable C 
experimental STF library (triggers c/ directory) set(CCCL_C_PARALLEL_ENABLE_TESTING ON) # Enable parallel C tests set(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING ON) # Enable STF C tests set(CCCL_ENABLE_UNSTABLE ON) # Enable unstable features From e4b8277af23d5c2df700f41faa455cda01f5dd12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:13:52 +0200 Subject: [PATCH 170/221] Update CMake configuration to match stf_c_lib structure - Replace CCCL_ENABLE_C with specific CCCL_ENABLE_C_PARALLEL and CCCL_ENABLE_C_EXPERIMENTAL_STF flags - Update c/ directory inclusion condition to use OR logic with specific flags - Update CMakePresets.json to match new flag structure - Remove obsolete cccl-c-stf preset (merged functionality) --- CMakeLists.txt | 5 +++-- CMakePresets.json | 33 ++++++++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0830d733b32..fe53b0daead 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,8 @@ option(CCCL_ENABLE_CUB "Enable the CUB developer build." OFF) option(CCCL_ENABLE_THRUST "Enable the Thrust developer build." OFF) option(CCCL_ENABLE_TESTING "Enable CUDA C++ Core Library tests." OFF) option(CCCL_ENABLE_EXAMPLES "Enable CUDA C++ Core Library examples." OFF) -option(CCCL_ENABLE_C "Enable CUDA C Core Library." OFF) +option(CCCL_ENABLE_C_PARALLEL "Enable CUDA C Parallel Library." OFF) +option(CCCL_ENABLE_C_EXPERIMENTAL_STF "Enable CUDA C CUDASTF Library." OFF) if ("NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") set(CCCL_ENABLE_BENCHMARKS OFF) @@ -82,7 +83,7 @@ if (CCCL_ENABLE_UNSTABLE) add_subdirectory(cudax) endif() -if (CCCL_ENABLE_C) +if (CCCL_ENABLE_C_PARALLEL OR CCCL_ENABLE_C_EXPERIMENTAL_STF) add_subdirectory(c) endif() diff --git a/CMakePresets.json b/CMakePresets.json index 537e7ebc88a..e4e012c60d2 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -21,7 +21,8 @@ "CCCL_ENABLE_CUDAX": false, "CCCL_ENABLE_TESTING": false, "CCCL_ENABLE_EXAMPLES": false, - "CCCL_ENABLE_C": false, + "CCCL_ENABLE_C_PARALLEL": false, + "CCCL_ENABLE_C_EXPERIMENTAL_STF": false, "libcudacxx_ENABLE_INSTALL_RULES": true, "CUB_ENABLE_INSTALL_RULES": true, "Thrust_ENABLE_INSTALL_RULES": true, @@ -63,7 +64,8 @@ "CCCL_ENABLE_TESTING": true, "CCCL_ENABLE_EXAMPLES": true, "CCCL_ENABLE_BENCHMARKS": true, - "CCCL_ENABLE_C": true, + "CCCL_ENABLE_C_PARALLEL": true, + "CCCL_ENABLE_C_EXPERIMENTAL_STF": true, "CCCL_IGNORE_DEPRECATED_CPP_DIALECT": true, "LIBCUDACXX_ENABLE_LIBCUDACXX_TESTS": true, "CUB_ENABLE_TESTING": true, @@ -88,7 +90,8 @@ "cudax_ENABLE_DIALECT_CPP17": true, "cudax_ENABLE_DIALECT_CPP20": true, "CCCL_C_Parallel_ENABLE_TESTING": true, - "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true + "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true, + "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true } }, { @@ -270,11 +273,22 @@ "displayName": "CCCL C Parallel Library", "inherits": "base", "cacheVariables": { - "CCCL_ENABLE_C": true, + "CCCL_ENABLE_C_PARALLEL": true, + "CCCL_ENABLE_C_EXPERIMENTAL_STF": false, "CCCL_C_Parallel_ENABLE_TESTING": true, "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true } }, + { + "name": "cccl-c-stf", + "displayName": "CCCL C CUDASTF Library", + "inherits": "base", + "cacheVariables": { + "CCCL_ENABLE_C_PARALLEL": false, + "CCCL_ENABLE_C_EXPERIMENTAL_STF": true, + "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true + } + }, { "name": "packaging", "displayName": "CCCL Packaging Tests/Examples", @@ -296,7 +310,7 @@ "CCCL_ENABLE_THRUST": false, "CCCL_ENABLE_LIBCUDACXX": 
false, "CCCL_ENABLE_CUDAX": false, - "CCCL_ENABLE_C": false, + "CCCL_ENABLE_C_PARALLEL": false, "CCCL_ENABLE_TESTING": false, "CCCL_ENABLE_EXAMPLES": false, "CUB_ENABLE_EXAMPLES": false, @@ -428,6 +442,10 @@ "name": "cccl-c-parallel", "configurePreset": "cccl-c-parallel" }, + { + "name": "cccl-c-stf", + "configurePreset": "cccl-c-stf" + }, { "name": "packaging", "configurePreset": "packaging" @@ -721,6 +739,11 @@ "configurePreset": "cccl-c-parallel", "inherits": "base" }, + { + "name": "cccl-c-stf", + "configurePreset": "cccl-c-stf", + "inherits": "base" + }, { "name": "packaging", "configurePreset": "packaging", From 6931fa8ba61773630920e8ea7f872a88b15ce7bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:16:37 +0200 Subject: [PATCH 171/221] Optimize Python build: Remove unnecessary C parallel library - Set CCCL_ENABLE_C_PARALLEL OFF (not needed for STF Python bindings) - Set testing flags OFF (testing belongs in CI, not Python build) - Remove parallel library installation and binding sections - Keep only STF C library and STF Python bindings - Cleaner, faster Python build focused on STF functionality --- python/cuda_cccl/CMakeLists.txt | 46 ++++----------------------------- 1 file changed, 5 insertions(+), 41 deletions(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 1481430c112..5db487e7eb2 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -25,12 +25,11 @@ include(${_cccl_root}/cmake/CCCLGetDependencies.cmake) cccl_build_compiler_targets() # Build and install C++ library first -set(CCCL_ENABLE_C_PARALLEL ON) # Enable C parallel library (triggers c/ directory) +set(CCCL_ENABLE_C_PARALLEL OFF) # Not needed for STF Python bindings set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) # Enable C experimental STF library (triggers c/ directory) -set(CCCL_C_PARALLEL_ENABLE_TESTING ON) # Enable parallel C tests -set(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING ON) # Enable STF C tests +set(CCCL_C_PARALLEL_ENABLE_TESTING OFF) # Testing belongs in CI, not Python build +set(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING OFF) # Testing belongs in CI, not Python build set(CCCL_ENABLE_UNSTABLE ON) # Enable unstable features -set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) add_subdirectory(${_cccl_root} _parent_cccl) @@ -60,16 +59,7 @@ install( DESTINATION ${_dest_incl_dir} ) -# ensure the destination directory exists -file(MAKE_DIRECTORY "cuda/cccl/parallel/experimental/${CUDA_VERSION_DIR}/cccl") - -# Install version-specific binaries -install( - TARGETS cccl.c.parallel - DESTINATION cuda/cccl/parallel/experimental/${CUDA_VERSION_DIR}/cccl -) - - +# ensure the destination directory exists file(MAKE_DIRECTORY "cuda/cccl/experimental/stf/${CUDA_VERSION_DIR}/cccl") install( @@ -112,33 +102,7 @@ endif() set(CYTHON_FLAGS "-3 -M -t -w \"${cuda_cccl_SOURCE_DIR}\"") string(REGEX REPLACE " " ";" CYTHON_FLAGS_LIST "${CYTHON_FLAGS}") -message(STATUS "Using Cython ${CYTHON_VERSION}") -set(pyx_source_file "${cuda_cccl_SOURCE_DIR}/cuda/cccl/parallel/experimental/_bindings_impl.pyx") - -set(_generated_extension_src "${cuda_cccl_BINARY_DIR}/_bindings_impl.c") -set(_depfile "${cuda_cccl_BINARY_DIR}/_bindings_impl.c.dep") - -# Custom Cython compilation command for version-specific target -add_custom_command( - OUTPUT "${_generated_extension_src}" - COMMAND "${Python3_EXECUTABLE}" -m cython - ARGS ${CYTHON_FLAGS_LIST} 
"${pyx_source_file}" --output-file "${_generated_extension_src}" - DEPENDS "${pyx_source_file}" - DEPFILE "${_depfile}" - COMMENT "Cythonizing ${pyx_source_file} for CUDA ${CUDA_VERSION_MAJOR}" -) - -set_source_files_properties("${_generated_extension_src}" PROPERTIES GENERATED TRUE) -add_custom_target(cythonize_bindings_impl ALL - DEPENDS "${_generated_extension_src}" -) - -Python3_add_library(_bindings_impl MODULE WITH_SOABI "${_generated_extension_src}") -add_dependencies(_bindings_impl cythonize_bindings_impl) -target_link_libraries(_bindings_impl PRIVATE cccl.c.parallel CUDA::cuda_driver) -set_target_properties(_bindings_impl PROPERTIES INSTALL_RPATH "$ORIGIN/cccl") - -install(TARGETS _bindings_impl DESTINATION cuda/cccl/parallel/experimental/${CUDA_VERSION_DIR}) +# Only building STF bindings - parallel bindings not needed message(STATUS "STF Using Cython ${CYTHON_VERSION}") set(stf_pyx_source_file "${cuda_cccl_SOURCE_DIR}/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx") From a1a113959ea7ccc47bf35a6f9a7c70a072555cc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:20:06 +0200 Subject: [PATCH 172/221] clang-format --- python/cuda_cccl/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 5db487e7eb2..0b34563303e 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -59,7 +59,7 @@ install( DESTINATION ${_dest_incl_dir} ) -# ensure the destination directory exists +# ensure the destination directory exists file(MAKE_DIRECTORY "cuda/cccl/experimental/stf/${CUDA_VERSION_DIR}/cccl") install( From ecd9f4e586b94ee6327bb5ea3318f96a7d13bc1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:52:20 +0200 Subject: [PATCH 173/221] fix pytorch example --- python/cuda_cccl/tests/stf/test_pytorch.py | 50 +++++++++++++++++++--- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index 8c1349b89e5..3e2d03ac650 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -12,7 +12,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -from cuda.cccl.experimental.stf._stf_bindings import ( +from cuda.cccl.experimental.stf._stf_bindings import ( # noqa: E402 context, rw, ) @@ -33,31 +33,67 @@ def test_pytorch(): torch_stream = torch.cuda.ExternalStream(t.stream_ptr()) with torch.cuda.stream(torch_stream): tX = t.tensor_arguments() - tX = tX * 2 + tX[:] = tX * 2 # In-place multiplication with ctx.task(lX.read(), lY.write()) as t: torch_stream = torch.cuda.ExternalStream(t.stream_ptr()) with torch.cuda.stream(torch_stream): tX = t.get_arg_as_tensor(0) tY = t.get_arg_as_tensor(1) - tY = tX * 2 + tY[:] = tX * 2 # Copy result into tY tensor with ( ctx.task(lX.read(), lZ.write()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), ): - tX, tY = t.tensor_arguments() - tZ = tX * 4 + 1 + tX, tZ = t.tensor_arguments() # Get tX and tZ tensors + tZ[:] = tX * 4 + 1 # Copy result into tZ tensor with ( ctx.task(lY.read(), lZ.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), ): - tX, tZ = t.tensor_arguments() - tZ = tY * 2 - 3 + tY, tZ = t.tensor_arguments() # Get tY and tZ tensors + tZ[:] = tY * 2 - 3 # Copy result into tZ tensor ctx.finalize() + # Verify results on host after finalize + 
print("Verifying results...") + + # Expected values: + # X: 1.0 -> 2.0 (multiplied by 2) + # Y: 1.0 -> 4.0 (X * 2 = 2.0 * 2 = 4.0) + # Z: 1.0 -> 9.0 (X * 4 + 1 = 2.0 * 4 + 1 = 9.0) -> 5.0 (Y * 2 - 3 = 4.0 * 2 - 3 = 5.0) + + expected_X = 2.0 + expected_Y = 4.0 + expected_Z = 5.0 + + # Check a few values to verify correctness + assert np.allclose(X[:10], expected_X), ( + f"X mismatch: got {X[:10]}, expected {expected_X}" + ) + assert np.allclose(Y[:10], expected_Y), ( + f"Y mismatch: got {Y[:10]}, expected {expected_Y}" + ) + assert np.allclose(Z[:10], expected_Z), ( + f"Z mismatch: got {Z[:10]}, expected {expected_Z}" + ) + + # Check entire arrays + assert np.all(X == expected_X), ( + f"X array not uniform: min={X.min()}, max={X.max()}, expected={expected_X}" + ) + assert np.all(Y == expected_Y), ( + f"Y array not uniform: min={Y.min()}, max={Y.max()}, expected={expected_Y}" + ) + assert np.all(Z == expected_Z), ( + f"Z array not uniform: min={Z.min()}, max={Z.max()}, expected={expected_Z}" + ) + + print(f"✅ All checks passed! X={X[0]}, Y={Y[0]}, Z={Z[0]}") + if __name__ == "__main__": print("Running CUDASTF examples...") From 4b2ae75316789ab8e4af8f6f4ece9061d588fc7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:56:48 +0200 Subject: [PATCH 174/221] use ascii symbols --- python/cuda_cccl/tests/stf/test_numba.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index a992d26d7d2..5e9c83153a0 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -88,19 +88,19 @@ def test_numba(): @cuda.jit def laplacian_5pt_kernel(u_in, u_out, dx, dy): """ - Compute a 5‑point Laplacian on u_in and write the result to u_out. + Compute a 5-point Laplacian on u_in and write the result to u_out. - Grid‑stride 2‑D kernel. Assumes C‑contiguous (row‑major) inputs. + Grid-stride 2-D kernel. Assumes C-contiguous (row-major) inputs. Boundary cells are copied unchanged. 
""" coef_x = 1.0 / (dx * dx) coef_y = 1.0 / (dy * dy) - i, j = cuda.grid(2) # i ↔ row (x‑index), j ↔ col (y‑index) + i, j = cuda.grid(2) # i <-> row (x-index), j <-> col (y-index) nx, ny = u_in.shape if i >= nx or j >= ny: - return # out‑of‑bounds threads do nothing + return # out-of-bounds threads do nothing if 0 < i < nx - 1 and 0 < j < ny - 1: u_out[i, j] = (u_in[i - 1, j] - 2.0 * u_in[i, j] + u_in[i + 1, j]) * coef_x + ( From dcb3d39735749c9c0aeab142754e27c04e803f27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 10 Sep 2025 07:13:29 +0200 Subject: [PATCH 175/221] Cleanup some changes in the infra from a previous merge --- CMakeLists.txt | 2 -- CMakePresets.json | 1 + ci/matrix.yaml | 3 --- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5467357fabb..fe53b0daead 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,8 +87,6 @@ if (CCCL_ENABLE_C_PARALLEL OR CCCL_ENABLE_C_EXPERIMENTAL_STF) add_subdirectory(c) endif() - - if (CCCL_ENABLE_TESTING) add_subdirectory(test) endif() diff --git a/CMakePresets.json b/CMakePresets.json index 7872b32d57e..475d6852bb3 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -309,6 +309,7 @@ "CCCL_ENABLE_LIBCUDACXX": false, "CCCL_ENABLE_CUDAX": false, "CCCL_ENABLE_C_PARALLEL": false, + "CCCL_ENABLE_C_EXPERIMENTAL_STF": false, "CCCL_ENABLE_TESTING": false, "CCCL_ENABLE_EXAMPLES": false, "CUB_ENABLE_EXAMPLES": false, diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 5a2d9715998..dc9f6fb0115 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -8,9 +8,6 @@ workflows: # - {jobs: ['test'], project: 'thrust', std: 17, ctk: '12.X', cxx: ['gcc12', 'clang16']} # override: - # Python and c/parallel jobs: - - {jobs: ['test'], project: ['cccl_c_parallel'], gpu: ['l4']} - - {jobs: ['test'], project: ['cccl_c_stf'], gpu: ['l4']} pull_request: # Old CTK: Oldest/newest supported host compilers: From 1284eb2d16a1550cab8c32104475806f0b26af8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 10 Sep 2025 08:43:28 +0200 Subject: [PATCH 176/221] Implement logical_data_empty logical_data_zeros, and logical_data_full --- .../experimental/stf/_adapters/numba_utils.py | 84 +++++++++ .../experimental/stf/_stf_bindings_impl.pyx | 163 +++++++++++++++++- .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 54 ++---- 3 files changed, 261 insertions(+), 40 deletions(-) create mode 100644 python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py new file mode 100644 index 00000000000..cfa8b27a041 --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py @@ -0,0 +1,84 @@ +""" +Utilities for NUMBA-based STF operations. +""" +import numba +from numba import cuda +import numpy as np + + +def init_logical_data(ctx, ld, value, data_place=None, exec_place=None): + """ + Initialize a logical data with a constant value using CuPy's optimized fill. 
+ + Parameters + ---------- + ctx : context + STF context + ld : logical_data + Logical data to initialize + value : scalar + Value to fill the array with + data_place : data_place, optional + Data place for the initialization task + exec_place : exec_place, optional + Execution place for the fill operation + """ + # Create write dependency with optional data place + dep_arg = ld.write(data_place) if data_place else ld.write() + + # Create task arguments - include exec_place if provided + task_args = [] + if exec_place is not None: + task_args.append(exec_place) + task_args.append(dep_arg) + + with ctx.task(*task_args) as t: + # Get the array as a numba device array + nb_stream = cuda.external_stream(t.stream_ptr()) + array = t.numba_arguments() + + try: + # Use CuPy's optimized operations (much faster than custom kernels) + import cupy as cp + with cp.cuda.Stream(nb_stream): + cp_view = cp.asarray(array) + if value == 0 or value == 0.0: + # Use CuPy's potentially optimized zero operation + cp_view.fill(0) # CuPy may have special optimizations for zero + else: + # Use generic fill for non-zero values + cp_view.fill(value) + except ImportError: + # Fallback to simple kernel if CuPy not available + _fill_with_simple_kernel(array, value, nb_stream) + + +@cuda.jit +def _fill_kernel_fallback(array, value): + """Fallback 1D kernel when CuPy is not available.""" + idx = cuda.grid(1) + if idx < array.size: + array.flat[idx] = value + + +@cuda.jit +def _zero_kernel_fallback(array): + """Optimized fallback kernel for zero-filling when CuPy is not available.""" + idx = cuda.grid(1) + if idx < array.size: + array.flat[idx] = 0 + + +def _fill_with_simple_kernel(array, value, stream): + """Fallback method using simple NUMBA kernel when CuPy unavailable.""" + total_size = array.size + threads_per_block = 256 + blocks_per_grid = (total_size + threads_per_block - 1) // threads_per_block + + if value == 0 or value == 0.0: + # Use the specialized zero kernel for potentially better performance + _zero_kernel_fallback[blocks_per_grid, threads_per_block, stream](array) + else: + # Use generic fill kernel for non-zero values + typed_value = array.dtype.type(value) + _fill_kernel_fallback[blocks_per_grid, threads_per_block, stream](array, typed_value) \ No newline at end of file diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index ceca943e155..084ea6476bd 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -523,9 +523,170 @@ cdef class context: """ return logical_data(self, buf) - def logical_data_by_shape(self, shape, dtype): + + def logical_data_empty(self, shape, dtype=None): + """ + Create logical data with uninitialized values. + + Equivalent to numpy.empty() but for STF logical data. + + Parameters + ---------- + shape : tuple + Shape of the array + dtype : numpy.dtype, optional + Data type. Defaults to np.float64. 
+ + Returns + ------- + logical_data + New logical data with uninitialized values + + Examples + -------- + >>> # Create uninitialized array (fast but contains garbage) + >>> ld = ctx.logical_data_empty((100, 100), dtype=np.float32) + + >>> # Fast allocation without initialization + >>> ld = ctx.logical_data_empty((50, 50, 50)) + """ + if dtype is None: + dtype = np.float64 return logical_data.init_by_shape(self, shape, dtype) + def logical_data_full(self, shape, fill_value, dtype=None, where=None, exec_place=None): + """ + Create logical data initialized with a constant value. + + Similar to numpy.full(), this creates a new logical data with the given + shape and fills it with fill_value. + + Parameters + ---------- + shape : tuple + Shape of the array + fill_value : scalar + Value to fill the array with + dtype : numpy.dtype, optional + Data type. If None, infer from fill_value. + where : data_place, optional + Data placement for initialization. Defaults to current device. + exec_place : exec_place, optional + Execution place for the fill operation. Defaults to current device. + Note: exec_place.host() is not yet supported. + + Returns + ------- + logical_data + New logical data initialized with fill_value + + Examples + -------- + >>> # Create array filled with epsilon0 on current device + >>> ld = ctx.logical_data_full((100, 100), 8.85e-12, dtype=np.float64) + + >>> # Create array on host memory + >>> ld = ctx.logical_data_full((50, 50), 1.0, where=data_place.host()) + + >>> # Create array on specific device, execute on device 1 + >>> ld = ctx.logical_data_full((200, 200), 0.0, where=data_place.device(0), + ... exec_place=exec_place.device(1)) + """ + # Infer dtype from fill_value if not provided + if dtype is None: + dtype = np.array(fill_value).dtype + else: + dtype = np.dtype(dtype) + + # Validate exec_place - host execution not yet supported + if exec_place is not None: + if hasattr(exec_place, 'kind') and exec_place.kind == "host": + raise NotImplementedError( + "exec_place.host() is not yet supported for logical_data_full. " + "Use exec_place.device() or omit exec_place parameter." + ) + + # Create empty logical data + ld = self.logical_data_empty(shape, dtype) + + # Initialize with the specified value using NUMBA + # The numba code already handles None properly by calling ld.write() without data place + try: + from cuda.cccl.experimental.stf._adapters.numba_utils import init_logical_data + init_logical_data(self, ld, fill_value, where, exec_place) + except ImportError as e: + raise RuntimeError("NUMBA support is not available for logical_data_full") from e + + return ld + + def logical_data_zeros(self, shape, dtype=None, where=None, exec_place=None): + """ + Create logical data filled with zeros. + + Equivalent to numpy.zeros() but for STF logical data. + + Parameters + ---------- + shape : tuple + Shape of the array + dtype : numpy.dtype, optional + Data type. Defaults to np.float64. + where : data_place, optional + Data placement. Defaults to current device. + exec_place : exec_place, optional + Execution place for the fill operation. Defaults to current device. 
+ + Returns + ------- + logical_data + New logical data filled with zeros + + Examples + -------- + >>> # Create zero-filled array + >>> ld = ctx.logical_data_zeros((100, 100), dtype=np.float32) + + >>> # Create on host memory + >>> ld = ctx.logical_data_zeros((50, 50), where=data_place.host()) + """ + if dtype is None: + dtype = np.float64 + return self.logical_data_full(shape, 0.0, dtype, where, exec_place) + + def logical_data_ones(self, shape, dtype=None, where=None, exec_place=None): + """ + Create logical data filled with ones. + + Equivalent to numpy.ones() but for STF logical data. + + Parameters + ---------- + shape : tuple + Shape of the array + dtype : numpy.dtype, optional + Data type. Defaults to np.float64. + where : data_place, optional + Data placement. Defaults to current device. + exec_place : exec_place, optional + Execution place for the fill operation. Defaults to current device. + + Returns + ------- + logical_data + New logical data filled with ones + + Examples + -------- + >>> # Create ones-filled array + >>> ld = ctx.logical_data_ones((100, 100), dtype=np.float32) + + >>> # Create on specific device + >>> ld = ctx.logical_data_ones((50, 50), exec_place=exec_place.device(1)) + """ + if dtype is None: + dtype = np.float64 + return self.logical_data_full(shape, 1.0, dtype, where, exec_place) + def task(self, *args): """ Create a `task` diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 3d1d6b0d2df..7c3ca2a8215 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -44,18 +44,6 @@ def show_slice(t3d, plane="xy", index=None): plt.pause(0.01) -def init_field(ctx, ld, value): - with ( - ctx.task(ld.write()) as t, - torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), - ): - field = t.tensor_arguments() - if value == 0: - field.zero_() - else: - field.fill_(value) - - def fdtd_3d_pytorch( size_x: int = 150, size_y: int = 150, @@ -74,34 +62,22 @@ def fdtd_3d_pytorch( ]: ctx = context() - # allocate fields + # allocate and initialize fields shape = (size_x, size_y, size_z) - # ex_ = torch.zeros(shape, dtype=dtype, device=device) - lex = ctx.logical_data_by_shape(shape, np.float64) - ley = ctx.logical_data_by_shape(shape, np.float64) - lez = ctx.logical_data_by_shape(shape, np.float64) - - # epsilon_ = torch.full(shape, float(epsilon0), dtype=np.float64, device=device) - # mu_ = torch.full(shape, float(mu0), dtype=np.float64, device=device) - - lhx = ctx.logical_data_by_shape(shape, np.float64) - lhy = ctx.logical_data_by_shape(shape, np.float64) - lhz = ctx.logical_data_by_shape(shape, np.float64) - - # lepsilon = ctx.logical_data() - # lmu = ctx.logical_data(mu_) - lepsilon = ctx.logical_data_by_shape(shape, np.float64) - lmu = ctx.logical_data_by_shape(shape, np.float64) - - # TODO ctx.full(...) 
- init_field(ctx, lex, float(0.0)) - init_field(ctx, ley, float(0.0)) - init_field(ctx, lez, float(0.0)) - init_field(ctx, lhx, float(0.0)) - init_field(ctx, lhy, float(0.0)) - init_field(ctx, lhz, float(0.0)) - init_field(ctx, lepsilon, float(epsilon0)) - init_field(ctx, lmu, float(mu0)) + + # Electric field components (initialized to zero) + lex = ctx.logical_data_zeros(shape, dtype=np.float64) + ley = ctx.logical_data_zeros(shape, dtype=np.float64) + lez = ctx.logical_data_zeros(shape, dtype=np.float64) + + # Magnetic field components (initialized to zero) + lhx = ctx.logical_data_zeros(shape, dtype=np.float64) + lhy = ctx.logical_data_zeros(shape, dtype=np.float64) + lhz = ctx.logical_data_zeros(shape, dtype=np.float64) + + # Material properties + lepsilon = ctx.logical_data_full(shape, float(epsilon0), dtype=np.float64) + lmu = ctx.logical_data_full(shape, float(mu0), dtype=np.float64) # CFL (same formula as example) dt = 0.25 * min(dx, dy, dz) * math.sqrt(epsilon0 * mu0) From 0514f29c7c00fe1b8e08f0273584d16a3f8f455a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 10 Sep 2025 09:44:19 +0200 Subject: [PATCH 177/221] short names for torch.cuda --- .../experimental/stf/_adapters/numba_utils.py | 18 +++---- .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 17 +++---- python/cuda_cccl/tests/stf/test_numba.py | 47 +++++++++++++++++++ 3 files changed, 66 insertions(+), 16 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py index cfa8b27a041..280d8f3a55d 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py @@ -1,15 +1,14 @@ """ Utilities for NUMBA-based STF operations. """ -import numba + from numba import cuda -import numpy as np def init_logical_data(ctx, ld, value, data_place=None, exec_place=None): """ Initialize a logical data with a constant value using CuPy's optimized fill. 
- + Parameters ---------- ctx : context @@ -25,21 +24,22 @@ def init_logical_data(ctx, ld, value, data_place=None, exec_place=None): """ # Create write dependency with optional data place dep_arg = ld.write(data_place) if data_place else ld.write() - + # Create task arguments - include exec_place if provided task_args = [] if exec_place is not None: task_args.append(exec_place) task_args.append(dep_arg) - + with ctx.task(*task_args) as t: # Get the array as a numba device array nb_stream = cuda.external_stream(t.stream_ptr()) array = t.numba_arguments() - + try: # Use CuPy's optimized operations (much faster than custom kernels) import cupy as cp + with cp.cuda.Stream(nb_stream): cp_view = cp.asarray(array) if value == 0 or value == 0.0: @@ -74,11 +74,13 @@ def _fill_with_simple_kernel(array, value, stream): total_size = array.size threads_per_block = 256 blocks_per_grid = (total_size + threads_per_block - 1) // threads_per_block - + if value == 0 or value == 0.0: # Use the specialized zero kernel for potentially better performance _zero_kernel_fallback[blocks_per_grid, threads_per_block, stream](array) else: # Use generic fill kernel for non-zero values typed_value = array.dtype.type(value) - _fill_kernel_fallback[blocks_per_grid, threads_per_block, stream](array, typed_value) \ No newline at end of file + _fill_kernel_fallback[blocks_per_grid, threads_per_block, stream]( + array, typed_value + ) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 7c3ca2a8215..2c233eefd76 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -4,6 +4,7 @@ import matplotlib.pyplot as plt import numpy as np import torch +import torch.cuda as tc from cuda.cccl.experimental.stf._stf_bindings import ( context, @@ -108,7 +109,7 @@ def source(t: float, x: float, y: float, z: float) -> float: # Ex(i,j,k) += (dt/(ε*dx)) * [(Hz(i,j,k)-Hz(i,j-1,k)) - (Hy(i,j,k)-Hy(i,j,k-1))] with ( ctx.task(lex.rw(), lhy.read(), lhz.read(), lepsilon.read()) as t, - torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + tc.stream(tc.ExternalStream(t.stream_ptr())), ): ex, hy, hz, epsilon = t.tensor_arguments() ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + ( @@ -121,7 +122,7 @@ def source(t: float, x: float, y: float, z: float) -> float: # Ey(i,j,k) += (dt/(ε*dy)) * [(Hx(i,j,k)-Hx(i,j,k-1)) - (Hz(i,j,k)-Hz(i-1,j,k))] with ( ctx.task(ley.rw(), lhx.read(), lhz.read(), lepsilon.read()) as t, - torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + tc.stream(tc.ExternalStream(t.stream_ptr())), ): ey, hx, hz, epsilon = t.tensor_arguments() ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + ( @@ -134,7 +135,7 @@ def source(t: float, x: float, y: float, z: float) -> float: # Ez(i,j,k) += (dt/(ε*dz)) * [(Hy(i,j,k)-Hy(i-1,j,k)) - (Hx(i,j,k)-Hx(i,j-1,k))] with ( ctx.task(lez.rw(), lhx.read(), lhy.read(), lepsilon.read()) as t, - torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + tc.stream(tc.ExternalStream(t.stream_ptr())), ): ez, hx, hy, epsilon = t.tensor_arguments() ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + ( @@ -147,7 +148,7 @@ def source(t: float, x: float, y: float, z: float) -> float: # source at center cell with ( ctx.task(lez.rw()) as t, - torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + tc.stream(tc.ExternalStream(t.stream_ptr())), ): ez = t.tensor_arguments() ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) @@ -157,7 +158,7 @@ def 
source(t: float, x: float, y: float, z: float) -> float: # Hx(i,j,k) -= (dt/(μ*dy)) * [(Ez(i,j+1,k)-Ez(i,j,k)) - (Ey(i,j,k+1)-Ey(i,j,k))] with ( ctx.task(lhx.rw(), ley.read(), lez.read(), lmu.read()) as t, - torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + tc.stream(tc.ExternalStream(t.stream_ptr())), ): hx, ey, ez, mu = t.tensor_arguments() hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - ( @@ -170,7 +171,7 @@ def source(t: float, x: float, y: float, z: float) -> float: # Hy(i,j,k) -= (dt/(μ*dz)) * [(Ex(i,j,k+1)-Ex(i,j,k)) - (Ez(i+1,j,k)-Ez(i,j,k))] with ( ctx.task(lhy.rw(), lex.read(), lez.read(), lmu.read()) as t, - torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + tc.stream(tc.ExternalStream(t.stream_ptr())), ): hy, ex, ez, mu = t.tensor_arguments() hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - ( @@ -183,7 +184,7 @@ def source(t: float, x: float, y: float, z: float) -> float: # Hz(i,j,k) -= (dt/(μ*dx)) * [(Ey(i+1,j,k)-Ey(i,j,k)) - (Ex(i,j+1,k)-Ex(i,j,k))] with ( ctx.task(lhz.rw(), lex.read(), ley.read(), lmu.read()) as t, - torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + tc.stream(tc.ExternalStream(t.stream_ptr())), ): hz, ex, ey, mu = t.tensor_arguments() hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - ( @@ -196,7 +197,7 @@ def source(t: float, x: float, y: float, z: float) -> float: if output_freq > 0 and (n % output_freq) == 0: with ( ctx.task(lez.read()) as t, - torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + tc.stream(tc.ExternalStream(t.stream_ptr())), ): ez = t.tensor_arguments() print(f"{n}\t{ez[cx, cy, cz].item():.6e}") diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 5e9c83153a0..72b9609276f 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -47,6 +47,13 @@ def test_numba_graph(): ctx.finalize() + # Verify results after finalize (data written back to host) + # Expected: scale(2.0, 1.0) = 2.0 + if np.allclose(X, 2.0): + print("✅ Graph test: X values correct: all 2.0") + else: + print(f"❌ Graph test: X values incorrect: expected 2.0, got {X[:5]}...") + def test_numba(): n = 1024 * 1024 @@ -84,6 +91,46 @@ def test_numba(): ctx.finalize() + # Verify results after finalize (data written back to host) + print("Verifying results after finalize:") + + # Expected values: + # X: scale(2.0, 1.0) = 2.0 + # Y: axpy(2.0, X=2.0, Y=1.0) = 2.0*2.0 + 1.0 = 5.0 + # Z: axpy(2.0, X=2.0, Z=1.0) = 5.0, then axpy(2.0, Y=5.0, Z=5.0) = 15.0 + expected_X = 2.0 + expected_Y = 5.0 + expected_Z = 15.0 + + # Check X values + if np.allclose(X, expected_X, rtol=1e-6, atol=1e-6): + print(f"✅ X values correct: all {expected_X}") + else: + actual_x = X[0] if len(X) > 0 else "N/A" + print( + f"❌ X values incorrect: expected {expected_X}, got {actual_x} (diff: {abs(actual_x - expected_X):.2e})" + ) + + # Check Y values + if np.allclose(Y, expected_Y, rtol=1e-6, atol=1e-6): + print(f"✅ Y values correct: all {expected_Y}") + else: + actual_y = Y[0] if len(Y) > 0 else "N/A" + print( + f"❌ Y values incorrect: expected {expected_Y}, got {actual_y} (diff: {abs(actual_y - expected_Y):.2e})" + ) + + # Check Z values + if np.allclose(Z, expected_Z, rtol=1e-6, atol=1e-6): + print(f"✅ Z values correct: all {expected_Z}") + else: + actual_z = Z[0] if len(Z) > 0 else "N/A" + print( + f"❌ Z values incorrect: expected {expected_Z}, got {actual_z} (diff: {abs(actual_z - expected_Z):.2e})" + ) + + print(f"Sample values: X[0]={X[0]}, Y[0]={Y[0]}, Z[0]={Z[0]}") + 
@cuda.jit def laplacian_5pt_kernel(u_in, u_out, dx, dy): From 5e9b4d597d0504b7a97a158ec9ab72d7a72126ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 10 Sep 2025 10:41:11 +0200 Subject: [PATCH 178/221] Introduce pytorch_task --- .../experimental/stf/_stf_bindings_impl.pyx | 80 ++++++++ .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 4 +- .../tests/stf/test_fdtd_pytorch_simplified.py | 186 ++++++++++++++++++ python/cuda_cccl/tests/stf/test_pytorch.py | 69 +++++++ 4 files changed, 337 insertions(+), 2 deletions(-) create mode 100644 python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 084ea6476bd..2d7451f0985 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -473,6 +473,53 @@ cdef class task: self.end() return False +cdef class pytorch_task_context: + """ + Context manager for PyTorch-integrated STF tasks. + + This class automatically handles: + - Task start/end + - PyTorch stream context + - Tensor argument conversion and unpacking + """ + cdef task _task + cdef object _torch_stream_context + + def __cinit__(self, task t): + self._task = t + self._torch_stream_context = None + + def __enter__(self): + # Import torch here since we know it's available (checked in pytorch_task) + import torch.cuda as tc + + # Start the underlying task + self._task.start() + + # Create torch stream context from task stream + torch_stream = tc.ExternalStream(self._task.stream_ptr()) + self._torch_stream_context = tc.stream(torch_stream) + self._torch_stream_context.__enter__() + + # Get tensor arguments and return them + tensors = self._task.tensor_arguments() + + # If only one tensor, return it directly; otherwise return tuple + if isinstance(tensors, tuple): + return tensors + else: + return (tensors,) + + def __exit__(self, exc_type, exc_val, exc_tb): + try: + # Exit torch stream context first + if self._torch_stream_context is not None: + self._torch_stream_context.__exit__(exc_type, exc_val, exc_tb) + finally: + # Always end the task + self._task.end() + return False + cdef class context: cdef stf_ctx_handle _ctx # Is this a context that we have borrowed ? @@ -712,3 +759,36 @@ cdef class context: "Arguments must be dependency objects or an exec_place" ) return t + + def pytorch_task(self, *args): + """ + Create a PyTorch-integrated task that returns tensors directly. + Only available if PyTorch is installed. + + This is a convenience method that combines task creation with automatic + PyTorch stream management and tensor conversion. + + Example + ------- + >>> with ctx.pytorch_task(read(lX), rw(lY)) as (x_tensor, y_tensor): + >>> # Automatic PyTorch stream context and tensor unpacking + >>> y_tensor[:] = x_tensor * 2 + + Returns + ------- + pytorch_task_context : Context manager that yields tensor arguments + """ + # Check if PyTorch is available + try: + import torch + except ImportError: + raise RuntimeError( + "pytorch_task requires PyTorch to be installed. " + "Install PyTorch or use the regular task() method." 
+ ) + + # Create the underlying task + t = self.task(*args) + + # Return a PyTorch-specific context manager + return pytorch_task_context(t) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 2c233eefd76..5712dd74b0a 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -210,6 +210,6 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": - # quick check + # Run FDTD simulation ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=1000, output_freq=5) -# print("done; Ez(center) =", ez[50, 50, 50].item()) + print(f"Simulation completed; Ez(center) = {ez[75, 15, 75].item():.6e}") diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py new file mode 100644 index 00000000000..a26268f38d2 --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py @@ -0,0 +1,186 @@ +import math +from typing import Literal, Optional, Tuple + +import matplotlib.pyplot as plt +import numpy as np +import torch + +from cuda.cccl.experimental.stf._stf_bindings import ( + context, +) + +Plane = Literal["xy", "xz", "yz"] + + +def show_slice(t3d, plane="xy", index=None): + # grab a 2D view + if plane == "xy": + idx = t3d.shape[2] // 2 if index is None else index + slice2d = t3d[:, :, idx] + elif plane == "xz": + idx = t3d.shape[1] // 2 if index is None else index + slice2d = t3d[:, idx, :] + elif plane == "yz": + idx = t3d.shape[0] // 2 if index is None else index + slice2d = t3d[idx, :, :] + else: + raise ValueError("plane must be 'xy', 'xz' or 'yz'") + + # move to cpu numpy array + arr = slice2d.detach().cpu().numpy() + + # imshow = "imshow" not "imread" + plt.imshow( + arr, + origin="lower", + cmap="seismic", + vmin=-1e-2, + vmax=1e-2, + # norm=SymLogNorm(linthresh=1e-8, vmin=-1e-0, vmax=1e-0) + # norm=LogNorm(vmin=1e-12, vmax=1e-6) + ) + # plt.colorbar() + plt.show(block=False) + plt.pause(0.01) + + +def fdtd_3d_pytorch_simplified( + size_x: int = 150, + size_y: int = 150, + size_z: int = 150, + timesteps: int = 10, + output_freq: int = 0, + dx: float = 0.01, + dy: float = 0.01, + dz: float = 0.01, + epsilon0: float = 8.85e-12, + mu0: float = 1.256e-6, + device: Optional[torch.device] = None, + dtype: torch.dtype = torch.float64, +) -> Tuple[ + torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor +]: + """ + FDTD 3D implementation using pytorch_task for simplified syntax. + Demonstrates automatic stream and tensor management. 
+ """ + ctx = context() + + # allocate and initialize fields + shape = (size_x, size_y, size_z) + + # Electric field components (initialized to zero) + lex = ctx.logical_data_zeros(shape, dtype=np.float64) + ley = ctx.logical_data_zeros(shape, dtype=np.float64) + lez = ctx.logical_data_zeros(shape, dtype=np.float64) + + # Magnetic field components (initialized to zero) + lhx = ctx.logical_data_zeros(shape, dtype=np.float64) + lhy = ctx.logical_data_zeros(shape, dtype=np.float64) + lhz = ctx.logical_data_zeros(shape, dtype=np.float64) + + # Material properties + lepsilon = ctx.logical_data_full(shape, float(epsilon0), dtype=np.float64) + lmu = ctx.logical_data_full(shape, float(mu0), dtype=np.float64) + + # CFL (same formula as example) + dt = 0.25 * min(dx, dy, dz) * math.sqrt(epsilon0 * mu0) + + # Es (interior) = [1..N-2] along all dims -> enables i-1, j-1, k-1 + i_es, j_es, k_es = slice(1, -1), slice(1, -1), slice(1, -1) + i_es_m, j_es_m, k_es_m = slice(0, -2), slice(0, -2), slice(0, -2) + + # Hs (base) = [0..N-2] along all dims -> enables i+1, j+1, k+1 + i_hs, j_hs, k_hs = slice(0, -1), slice(0, -1), slice(0, -1) + i_hs_p, j_hs_p, k_hs_p = slice(1, None), slice(1, None), slice(1, None) + + # source location (single cell at center) + cx, cy, cz = size_x // 2, size_y // 10, size_z // 2 + + def source(t: float, x: float, y: float, z: float) -> float: + # sin(k*x - omega*t) with f = 1e9 Hz + pi = math.pi + freq = 1.0e9 + omega = 2.0 * pi * freq + wavelength = 3.0e8 / freq + k = 2.0 * pi / wavelength + return math.sin(k * x - omega * t) + + for n in range(int(timesteps)): + # ------------------------- + # update electric fields (Es) + # Ex(i,j,k) += (dt/(ε*dx)) * [(Hz(i,j,k)-Hz(i,j-1,k)) - (Hy(i,j,k)-Hy(i,j,k-1))] + with ctx.pytorch_task(lex.rw(), lhy.read(), lhz.read(), lepsilon.read()) as (ex, hy, hz, epsilon): + ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + ( + dt / (epsilon[i_es, j_es, k_es] * dx) + ) * ( + (hz[i_es, j_es, k_es] - hz[i_es, j_es_m, k_es]) + - (hy[i_es, j_es, k_es] - hy[i_es, j_es, k_es_m]) + ) + + # Ey(i,j,k) += (dt/(ε*dy)) * [(Hx(i,j,k)-Hx(i,j,k-1)) - (Hz(i,j,k)-Hz(i-1,j,k))] + with ctx.pytorch_task(ley.rw(), lhx.read(), lhz.read(), lepsilon.read()) as (ey, hx, hz, epsilon): + ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + ( + dt / (epsilon[i_es, j_es, k_es] * dy) + ) * ( + (hx[i_es, j_es, k_es] - hx[i_es, j_es, k_es_m]) + - (hz[i_es, j_es, k_es] - hz[i_es_m, j_es, k_es]) + ) + + # Ez(i,j,k) += (dt/(ε*dz)) * [(Hy(i,j,k)-Hy(i-1,j,k)) - (Hx(i,j,k)-Hx(i,j-1,k))] + with ctx.pytorch_task(lez.rw(), lhx.read(), lhy.read(), lepsilon.read()) as (ez, hx, hy, epsilon): + ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + ( + dt / (epsilon[i_es, j_es, k_es] * dz) + ) * ( + (hy[i_es, j_es, k_es] - hy[i_es_m, j_es, k_es]) + - (hx[i_es, j_es, k_es] - hx[i_es, j_es_m, k_es]) + ) + + # source at center cell + with ctx.pytorch_task(lez.rw()) as (ez,): + ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) + + # ------------------------- + # update magnetic fields (Hs) + # Hx(i,j,k) -= (dt/(μ*dy)) * [(Ez(i,j+1,k)-Ez(i,j,k)) - (Ey(i,j,k+1)-Ey(i,j,k))] + with ctx.pytorch_task(lhx.rw(), ley.read(), lez.read(), lmu.read()) as (hx, ey, ez, mu): + hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - ( + dt / (mu[i_hs, j_hs, k_hs] * dy) + ) * ( + (ez[i_hs, j_hs_p, k_hs] - ez[i_hs, j_hs, k_hs]) + - (ey[i_hs, j_hs, k_hs_p] - ey[i_hs, j_hs, k_hs]) + ) + + # Hy(i,j,k) -= (dt/(μ*dz)) * [(Ex(i,j,k+1)-Ex(i,j,k)) - (Ez(i+1,j,k)-Ez(i,j,k))] + with ctx.pytorch_task(lhy.rw(), lex.read(), 
lez.read(), lmu.read()) as (hy, ex, ez, mu): + hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - ( + dt / (mu[i_hs, j_hs, k_hs] * dz) + ) * ( + (ex[i_hs, j_hs, k_hs_p] - ex[i_hs, j_hs, k_hs]) + - (ez[i_hs_p, j_hs, k_hs] - ez[i_hs, j_hs, k_hs]) + ) + + # Hz(i,j,k) -= (dt/(μ*dx)) * [(Ey(i+1,j,k)-Ey(i,j,k)) - (Ex(i,j+1,k)-Ex(i,j,k))] + with ctx.pytorch_task(lhz.rw(), lex.read(), ley.read(), lmu.read()) as (hz, ex, ey, mu): + hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - ( + dt / (mu[i_hs, j_hs, k_hs] * dx) + ) * ( + (ey[i_hs_p, j_hs, k_hs] - ey[i_hs, j_hs, k_hs]) + - (ex[i_hs, j_hs_p, k_hs] - ex[i_hs, j_hs, k_hs]) + ) + + if output_freq > 0 and (n % output_freq) == 0: + with ctx.pytorch_task(lez.read()) as (ez,): + print(f"{n}\t{ez[cx, cy, cz].item():.6e}") + show_slice(ez, plane="xy") + + ctx.finalize() + + return ex, ey, ez, hx, hy, hz + + +if __name__ == "__main__": + # Run simplified FDTD simulation using pytorch_task + print("Running FDTD simulation with pytorch_task syntax...") + ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch_simplified(timesteps=1000, output_freq=5) + print(f"Simulation completed; Ez(center) = {ez[75, 15, 75].item():.6e}") diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index 3e2d03ac650..d5fab32d58d 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -95,6 +95,75 @@ def test_pytorch(): print(f"✅ All checks passed! X={X[0]}, Y={Y[0]}, Z={Z[0]}") +def test_pytorch_task(): + """Test the pytorch_task functionality with simplified syntax""" + n = 1024 * 1024 + X = np.ones(n, dtype=np.float32) + Y = np.ones(n, dtype=np.float32) + Z = np.ones(n, dtype=np.float32) + + ctx = context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) + + # Equivalent operations to test_pytorch() but using pytorch_task syntax + + # In-place multiplication using pytorch_task (single tensor) + with ctx.pytorch_task(rw(lX)) as (tX,): + tX[:] = tX * 2 + + # Copy and multiply using pytorch_task (multiple tensors) + with ctx.pytorch_task(lX.read(), lY.write()) as (tX, tY): + tY[:] = tX * 2 + + # Another operation combining tensors + with ctx.pytorch_task(lX.read(), lZ.write()) as (tX, tZ): + tZ[:] = tX * 4 + 1 + + # Final operation with read-write access + with ctx.pytorch_task(lY.read(), lZ.rw()) as (tY, tZ): + tZ[:] = tY * 2 - 3 + + ctx.finalize() + + # Verify results on host after finalize (same as original test) + print("Verifying pytorch_task results...") + + # Expected values: + # X: 1.0 -> 2.0 (multiplied by 2) + # Y: 1.0 -> 4.0 (X * 2 = 2.0 * 2 = 4.0) + # Z: 1.0 -> 9.0 (X * 4 + 1 = 2.0 * 4 + 1 = 9.0) -> 5.0 (Y * 2 - 3 = 4.0 * 2 - 3 = 5.0) + + expected_X = 2.0 + expected_Y = 4.0 + expected_Z = 5.0 + + # Check a few values to verify correctness + assert np.allclose(X[:10], expected_X), ( + f"X mismatch: got {X[:10]}, expected {expected_X}" + ) + assert np.allclose(Y[:10], expected_Y), ( + f"Y mismatch: got {Y[:10]}, expected {expected_Y}" + ) + assert np.allclose(Z[:10], expected_Z), ( + f"Z mismatch: got {Z[:10]}, expected {expected_Z}" + ) + + # Check entire arrays + assert np.all(X == expected_X), ( + f"X array not uniform: min={X.min()}, max={X.max()}, expected={expected_X}" + ) + assert np.all(Y == expected_Y), ( + f"Y array not uniform: min={Y.min()}, max={Y.max()}, expected={expected_Y}" + ) + assert np.all(Z == expected_Z), ( + f"Z array not uniform: min={Z.min()}, max={Z.max()}, expected={expected_Z}" + ) + + print(f"✅ All pytorch_task checks passed! 
X={X[0]}, Y={Y[0]}, Z={Z[0]}") + + if __name__ == "__main__": print("Running CUDASTF examples...") test_pytorch() From 53a454283972babb894f127298f91c712345caa9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 10 Sep 2025 11:37:05 +0200 Subject: [PATCH 179/221] clang-format and some minor comment --- .../tests/stf/test_fdtd_pytorch_simplified.py | 42 ++++++++++++++++--- python/cuda_cccl/tests/stf/test_pytorch.py | 5 +++ 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py index a26268f38d2..fa351714b2b 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py @@ -110,7 +110,12 @@ def source(t: float, x: float, y: float, z: float) -> float: # ------------------------- # update electric fields (Es) # Ex(i,j,k) += (dt/(ε*dx)) * [(Hz(i,j,k)-Hz(i,j-1,k)) - (Hy(i,j,k)-Hy(i,j,k-1))] - with ctx.pytorch_task(lex.rw(), lhy.read(), lhz.read(), lepsilon.read()) as (ex, hy, hz, epsilon): + with ctx.pytorch_task(lex.rw(), lhy.read(), lhz.read(), lepsilon.read()) as ( + ex, + hy, + hz, + epsilon, + ): ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + ( dt / (epsilon[i_es, j_es, k_es] * dx) ) * ( @@ -119,7 +124,12 @@ def source(t: float, x: float, y: float, z: float) -> float: ) # Ey(i,j,k) += (dt/(ε*dy)) * [(Hx(i,j,k)-Hx(i,j,k-1)) - (Hz(i,j,k)-Hz(i-1,j,k))] - with ctx.pytorch_task(ley.rw(), lhx.read(), lhz.read(), lepsilon.read()) as (ey, hx, hz, epsilon): + with ctx.pytorch_task(ley.rw(), lhx.read(), lhz.read(), lepsilon.read()) as ( + ey, + hx, + hz, + epsilon, + ): ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + ( dt / (epsilon[i_es, j_es, k_es] * dy) ) * ( @@ -128,7 +138,12 @@ def source(t: float, x: float, y: float, z: float) -> float: ) # Ez(i,j,k) += (dt/(ε*dz)) * [(Hy(i,j,k)-Hy(i-1,j,k)) - (Hx(i,j,k)-Hx(i,j-1,k))] - with ctx.pytorch_task(lez.rw(), lhx.read(), lhy.read(), lepsilon.read()) as (ez, hx, hy, epsilon): + with ctx.pytorch_task(lez.rw(), lhx.read(), lhy.read(), lepsilon.read()) as ( + ez, + hx, + hy, + epsilon, + ): ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + ( dt / (epsilon[i_es, j_es, k_es] * dz) ) * ( @@ -143,7 +158,12 @@ def source(t: float, x: float, y: float, z: float) -> float: # ------------------------- # update magnetic fields (Hs) # Hx(i,j,k) -= (dt/(μ*dy)) * [(Ez(i,j+1,k)-Ez(i,j,k)) - (Ey(i,j,k+1)-Ey(i,j,k))] - with ctx.pytorch_task(lhx.rw(), ley.read(), lez.read(), lmu.read()) as (hx, ey, ez, mu): + with ctx.pytorch_task(lhx.rw(), ley.read(), lez.read(), lmu.read()) as ( + hx, + ey, + ez, + mu, + ): hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - ( dt / (mu[i_hs, j_hs, k_hs] * dy) ) * ( @@ -152,7 +172,12 @@ def source(t: float, x: float, y: float, z: float) -> float: ) # Hy(i,j,k) -= (dt/(μ*dz)) * [(Ex(i,j,k+1)-Ex(i,j,k)) - (Ez(i+1,j,k)-Ez(i,j,k))] - with ctx.pytorch_task(lhy.rw(), lex.read(), lez.read(), lmu.read()) as (hy, ex, ez, mu): + with ctx.pytorch_task(lhy.rw(), lex.read(), lez.read(), lmu.read()) as ( + hy, + ex, + ez, + mu, + ): hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - ( dt / (mu[i_hs, j_hs, k_hs] * dz) ) * ( @@ -161,7 +186,12 @@ def source(t: float, x: float, y: float, z: float) -> float: ) # Hz(i,j,k) -= (dt/(μ*dx)) * [(Ey(i+1,j,k)-Ey(i,j,k)) - (Ex(i,j+1,k)-Ex(i,j,k))] - with ctx.pytorch_task(lhz.rw(), lex.read(), ley.read(), lmu.read()) as (hz, ex, ey, mu): + with ctx.pytorch_task(lhz.rw(), lex.read(), ley.read(), lmu.read()) as ( 
+ hz, + ex, + ey, + mu, + ): hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - ( dt / (mu[i_hs, j_hs, k_hs] * dx) ) * ( diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index d5fab32d58d..4c0d180b407 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -103,6 +103,11 @@ def test_pytorch_task(): Z = np.ones(n, dtype=np.float32) ctx = context() + + # Note: We could use ctx.logical_data_full instead of creating NumPy arrays first + # For example: lX = ctx.logical_data_full((n,), 1.0, dtype=np.float32) + # However, this would create logical data without underlying NumPy arrays, + # so we wouldn't be able to check results after ctx.finalize() in this test lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) From 218fda29665c284d9751de23fd78be55768d2975 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 25 Sep 2025 09:13:18 +0200 Subject: [PATCH 180/221] make sure stf python tests are wrapped into functions so that pytest calls them --- .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 5 +-- .../tests/stf/test_fdtd_pytorch_simplified.py | 6 ++-- python/cuda_cccl/tests/stf/test_fhe.py | 32 ++++++++++++------- .../cuda_cccl/tests/stf/test_fhe_decorator.py | 32 ++++++++++++------- .../tests/stf/test_stencil_decorator.py | 5 +++ 5 files changed, 52 insertions(+), 28 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 5712dd74b0a..db4c9671c16 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -45,7 +45,7 @@ def show_slice(t3d, plane="xy", index=None): plt.pause(0.01) -def fdtd_3d_pytorch( +def test_fdtd_3d_pytorch( size_x: int = 150, size_y: int = 150, size_z: int = 150, @@ -211,5 +211,6 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # Run FDTD simulation - ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=1000, output_freq=5) + print("Running FDTD 3D PyTorch example...") + ex, ey, ez, hx, hy, hz = test_fdtd_3d_pytorch(timesteps=1000, output_freq=5) print(f"Simulation completed; Ez(center) = {ez[75, 15, 75].item():.6e}") diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py index fa351714b2b..5241911a43f 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py @@ -44,7 +44,7 @@ def show_slice(t3d, plane="xy", index=None): plt.pause(0.01) -def fdtd_3d_pytorch_simplified( +def test_fdtd_3d_pytorch_simplified( size_x: int = 150, size_y: int = 150, size_z: int = 150, @@ -212,5 +212,7 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # Run simplified FDTD simulation using pytorch_task print("Running FDTD simulation with pytorch_task syntax...") - ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch_simplified(timesteps=1000, output_freq=5) + ex, ey, ez, hx, hy, hz = test_fdtd_3d_pytorch_simplified( + timesteps=1000, output_freq=5 + ) print(f"Simulation completed; Ez(center) = {ez[75, 15, 75].item():.6e}") diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index e2d38308341..7331eeba658 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -142,19 +142,27 @@ def circuit(eA: Ciphertext, eB: Ciphertext) -> 
Ciphertext: return ~((eA | ~eB) & (~eA | eB)) -ctx = cudastf.context(use_graph=False) +def test_fhe(): + """Test Fully Homomorphic Encryption (FHE) example with logical operations.""" + global ctx # Make ctx accessible to the classes + ctx = cudastf.context(use_graph=False) -vA = [3, 3, 2, 2, 17] -pA = Plaintext(ctx, vA) -pA.set_symbol("A") + vA = [3, 3, 2, 2, 17] + pA = Plaintext(ctx, vA) + pA.set_symbol("A") -vB = [1, 7, 7, 7, 49] -pB = Plaintext(ctx, vB) -pB.set_symbol("B") + vB = [1, 7, 7, 7, 49] + pB = Plaintext(ctx, vB) + pB.set_symbol("B") -eA = pA.encrypt() -eB = pB.encrypt() -out = circuit(eA, eB) + eA = pA.encrypt() + eB = pB.encrypt() + out = circuit(eA, eB) -out.decrypt().print_values() -ctx.finalize() + out.decrypt().print_values() + ctx.finalize() + + +if __name__ == "__main__": + print("Running CUDASTF FHE example...") + test_fhe() diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index 8adbf5454ed..0bde583bafe 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -124,19 +124,27 @@ def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: return ~((eA | ~eB) & (~eA | eB)) -ctx = cudastf.context(use_graph=False) +def test_fhe_decorator(): + """Test Fully Homomorphic Encryption (FHE) example using @cudastf.jit decorators.""" + global ctx # Make ctx accessible to the classes + ctx = cudastf.context(use_graph=False) -vA = [3, 3, 2, 2, 17] -pA = Plaintext(ctx, vA) -pA.set_symbol("A") + vA = [3, 3, 2, 2, 17] + pA = Plaintext(ctx, vA) + pA.set_symbol("A") -vB = [1, 7, 7, 7, 49] -pB = Plaintext(ctx, vB) -pB.set_symbol("B") + vB = [1, 7, 7, 7, 49] + pB = Plaintext(ctx, vB) + pB.set_symbol("B") -eA = pA.encrypt() -eB = pB.encrypt() -out = circuit(eA, eB) + eA = pA.encrypt() + eB = pB.encrypt() + out = circuit(eA, eB) -out.decrypt().print_values() -ctx.finalize() + out.decrypt().print_values() + ctx.finalize() + + +if __name__ == "__main__": + print("Running CUDASTF FHE decorator example...") + test_fhe_decorator() diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index 8e52a72f00a..ebfd71de46e 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -79,3 +79,8 @@ def test_numba2d(): # compare with the GPU result max_abs_diff = np.abs(u_out - u_out_ref).max() print(f"max(|gpu - ref|) = {max_abs_diff:.3e}") + + +if __name__ == "__main__": + print("Running CUDASTF stencil decorator example...") + test_numba2d() From 1f974825fe93b1611041d29e3e44e2c449eb4c2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 25 Sep 2025 10:07:30 +0200 Subject: [PATCH 181/221] fix the return values of pytests --- python/cuda_cccl/tests/stf/test_fdtd_pytorch.py | 5 +---- python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py | 7 +------ 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index db4c9671c16..b5209c9d04c 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -206,11 +206,8 @@ def source(t: float, x: float, y: float, z: float) -> float: ctx.finalize() - return ex, ey, ez, hx, hy, hz - if __name__ == "__main__": # Run FDTD simulation print("Running FDTD 3D PyTorch example...") - ex, ey, ez, hx, hy, hz = 
test_fdtd_3d_pytorch(timesteps=1000, output_freq=5) - print(f"Simulation completed; Ez(center) = {ez[75, 15, 75].item():.6e}") + test_fdtd_3d_pytorch(timesteps=1000, output_freq=5) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py index 5241911a43f..85f7b856bd4 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py @@ -206,13 +206,8 @@ def source(t: float, x: float, y: float, z: float) -> float: ctx.finalize() - return ex, ey, ez, hx, hy, hz - if __name__ == "__main__": # Run simplified FDTD simulation using pytorch_task print("Running FDTD simulation with pytorch_task syntax...") - ex, ey, ez, hx, hy, hz = test_fdtd_3d_pytorch_simplified( - timesteps=1000, output_freq=5 - ) - print(f"Simulation completed; Ez(center) = {ez[75, 15, 75].item():.6e}") + test_fdtd_3d_pytorch_simplified(timesteps=1000, output_freq=5) \ No newline at end of file From 7a58d68648bf58c4b12dd28e8cf9095d4573fb0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 25 Sep 2025 13:47:08 +0200 Subject: [PATCH 182/221] Start to experiment with Warp --- .../cuda_cccl/tests/stf/example_fluid_warp.py | 358 ++++++++++++++++++ 1 file changed, 358 insertions(+) create mode 100644 python/cuda_cccl/tests/stf/example_fluid_warp.py diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py new file mode 100644 index 00000000000..abd93d9e6e8 --- /dev/null +++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py @@ -0,0 +1,358 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +########################################################################### +# Example Fluid +# +# Shows how to implement a simple 2D Stable Fluids solver using +# multidimensional arrays and launches. 
+# +########################################################################### + +import math + +import warp as wp +import warp.render + +import cuda.cccl.experimental.stf as cudastf + +def stf_kernel(pyfunc): + # let warp decorate normally + kernel = wp.kernel(pyfunc) + + # attach an STF-aware call operator + def _stf_call(*args, dim=None, stream=None, **kwargs): + print(f"[STF TRACE] {pyfunc.__name__}") + print(f" dim={dim}, stream={stream}, args={args}, kwargs={kwargs}") + return wp.stf.launch(kernel, dim=dim, inputs=args, stream=stream, **kwargs) + + # monkey-patch a method onto the kernel object + kernel.stf = _stf_call + + return kernel + +def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): + print(f"[STF TRACE] launching kernel: {getattr(kernel, '__name__', kernel)}") + print(f" dim = {dim}") + print(f" stream = {stream}") + print(f" inputs = {inputs}") + print(f" kwargs = {kwargs}") + + # just forward to warp for now + return wp.launch( + kernel, + dim=dim, + inputs=inputs, + stream=stream, + **kwargs, + ) + + +# put it under wp.stf +if not hasattr(wp, "stf"): + class _stf: + pass + wp.stf = _stf() + + +wp.stf.kernel = stf_kernel +wp.stf.launch = stf_launch + +grid_width = wp.constant(256) +grid_height = wp.constant(128) + + +@wp.func +def lookup_float(f: wp.array2d(dtype=float), x: int, y: int): + x = wp.clamp(x, 0, grid_width - 1) + y = wp.clamp(y, 0, grid_height - 1) + + return f[x, y] + + +@wp.func +def sample_float(f: wp.array2d(dtype=float), x: float, y: float): + lx = int(wp.floor(x)) + ly = int(wp.floor(y)) + + tx = x - float(lx) + ty = y - float(ly) + + s0 = wp.lerp(lookup_float(f, lx, ly), lookup_float(f, lx + 1, ly), tx) + s1 = wp.lerp(lookup_float(f, lx, ly + 1), lookup_float(f, lx + 1, ly + 1), tx) + + s = wp.lerp(s0, s1, ty) + return s + + +@wp.func +def lookup_vel(f: wp.array2d(dtype=wp.vec2), x: int, y: int): + if x < 0 or x >= grid_width: + return wp.vec2() + if y < 0 or y >= grid_height: + return wp.vec2() + + return f[x, y] + + +@wp.func +def sample_vel(f: wp.array2d(dtype=wp.vec2), x: float, y: float): + lx = int(wp.floor(x)) + ly = int(wp.floor(y)) + + tx = x - float(lx) + ty = y - float(ly) + + s0 = wp.lerp(lookup_vel(f, lx, ly), lookup_vel(f, lx + 1, ly), tx) + s1 = wp.lerp(lookup_vel(f, lx, ly + 1), lookup_vel(f, lx + 1, ly + 1), tx) + + s = wp.lerp(s0, s1, ty) + return s + + +@wp.stf.kernel +def advect( + u0: wp.array2d(dtype=wp.vec2), + u1: wp.array2d(dtype=wp.vec2), + rho0: wp.array2d(dtype=float), + rho1: wp.array2d(dtype=float), + dt: float, +): + i, j = wp.tid() + + u = u0[i, j] + + # trace backward + p = wp.vec2(float(i), float(j)) + p = p - u * dt + + # advect + u1[i, j] = sample_vel(u0, p[0], p[1]) + rho1[i, j] = sample_float(rho0, p[0], p[1]) + + +@wp.stf.kernel +def divergence(u: wp.array2d(dtype=wp.vec2), div: wp.array2d(dtype=float)): + i, j = wp.tid() + + if i == grid_width - 1: + return + if j == grid_height - 1: + return + + dx = (u[i + 1, j][0] - u[i, j][0]) * 0.5 + dy = (u[i, j + 1][1] - u[i, j][1]) * 0.5 + + div[i, j] = dx + dy + + +@wp.stf.kernel +def pressure_solve(p0: wp.array2d(dtype=float), p1: wp.array2d(dtype=float), div: wp.array2d(dtype=float)): + i, j = wp.tid() + + s1 = lookup_float(p0, i - 1, j) + s2 = lookup_float(p0, i + 1, j) + s3 = lookup_float(p0, i, j - 1) + s4 = lookup_float(p0, i, j + 1) + + # Jacobi update + err = s1 + s2 + s3 + s4 - div[i, j] + + p1[i, j] = err * 0.25 + + +@wp.stf.kernel +def pressure_apply(p: wp.array2d(dtype=float), u: wp.array2d(dtype=wp.vec2)): + i, j = wp.tid() + + if i == 0 or i == 
grid_width - 1: + return + if j == 0 or j == grid_height - 1: + return + + # pressure gradient + f_p = wp.vec2(p[i + 1, j] - p[i - 1, j], p[i, j + 1] - p[i, j - 1]) * 0.5 + + u[i, j] = u[i, j] - f_p + + +@wp.stf.kernel +def integrate(u: wp.array2d(dtype=wp.vec2), rho: wp.array2d(dtype=float), dt: float): + i, j = wp.tid() + + # gravity + f_g = wp.vec2(-90.8, 0.0) * rho[i, j] + + # integrate + u[i, j] = u[i, j] + dt * f_g + + # fade + rho[i, j] = rho[i, j] * (1.0 - 0.1 * dt) + + +@wp.stf.kernel +def init(rho: wp.array2d(dtype=float), u: wp.array2d(dtype=wp.vec2), radius: int, dir: wp.vec2): + i, j = wp.tid() + + d = wp.length(wp.vec2(float(i - grid_width / 2), float(j - grid_height / 2))) + + if d < radius: + rho[i, j] = 1.0 + u[i, j] = dir + + +class Example: + def __init__(self): + fps = 60 + self.frame_dt = 1.0 / fps + self.sim_substeps = 2 + self.iterations = 100 # Number of pressure iterations + self.sim_dt = self.frame_dt / self.sim_substeps + self.sim_time = 0.0 + + self._stf_ctx = cudastf.context() + + shape = (grid_width, grid_height) + + self.u0 = wp.zeros(shape, dtype=wp.vec2) + self.u1 = wp.zeros(shape, dtype=wp.vec2) + + self.rho0 = wp.zeros(shape, dtype=float) + self.rho1 = wp.zeros(shape, dtype=float) + + self.p0 = wp.zeros(shape, dtype=float) + self.p1 = wp.zeros(shape, dtype=float) + self.div = wp.zeros(shape, dtype=float) + + self.u0._stf_ld = self._stf_ctx.logical_data(self.u0) + self.u0._name = "u0" + + self.u1._name = "u1" + + self.rho0._name = "rho0" + self.rho1._name = "rho1" + + self.p0._name = "p0" + self.p1._name = "p1" + self.div._name = "div" + + # capture pressure solve as a CUDA graph + self.use_cuda_graph = wp.get_device().is_cuda + if self.use_cuda_graph: + with wp.ScopedCapture() as capture: + self.pressure_iterations() + self.graph = capture.graph + + def step(self): + with wp.ScopedTimer("step"): + for _ in range(self.sim_substeps): + shape = (grid_width, grid_height) + dt = self.sim_dt + + speed = 400.0 + angle = math.sin(self.sim_time * 4.0) * 1.5 + vel = wp.vec2(math.cos(angle) * speed, math.sin(angle) * speed) + + # update emitters + wp.stf.launch(init, dim=shape, inputs=[self.rho0, self.u0, 5, vel]) + + # force integrate + wp.stf.launch(integrate, dim=shape, inputs=[self.u0, self.rho0, dt]) + wp.stf.launch(divergence, dim=shape, inputs=[self.u0, self.div]) + + # pressure solve + self.p0.zero_() + self.p1.zero_() + + # if self.use_cuda_graph: + #  wp.capture_launch(self.graph) + # else: + #  self.pressure_iterations() + self.pressure_iterations() + + # velocity update + wp.stf.launch(pressure_apply, dim=shape, inputs=[self.p0, self.u0]) + + # semi-Lagrangian advection + wp.stf.launch(advect, dim=shape, inputs=[self.u0, self.u1, self.rho0, self.rho1, dt]) + + # swap buffers + (self.u0, self.u1) = (self.u1, self.u0) + (self.rho0, self.rho1) = (self.rho1, self.rho0) + + self.sim_time += dt + + def pressure_iterations(self): + for _ in range(self.iterations): + wp.stf.launch(pressure_solve, dim=self.p0.shape, inputs=[self.p0, self.p1, self.div]) + + # swap pressure fields + (self.p0, self.p1) = (self.p1, self.p0) + + def step_and_render_frame(self, frame_num=None, img=None): + self.step() + + with wp.ScopedTimer("render"): + if img: + img.set_array(self.rho0.numpy()) + + return (img,) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--device", type=str, default=None, help="Override the default Warp device.") + 
parser.add_argument("--num_frames", type=int, default=100000, help="Total number of frames.") + parser.add_argument( + "--headless", + action="store_true", + help="Run in headless mode, suppressing the opening of any graphical windows.", + ) + + args = parser.parse_known_args()[0] + + with wp.ScopedDevice(args.device): + example = Example() + + if args.headless: + for _ in range(args.num_frames): + example.step() + else: + import matplotlib + import matplotlib.animation as anim + import matplotlib.pyplot as plt + + fig = plt.figure() + + img = plt.imshow( + example.rho0.numpy(), + origin="lower", + animated=True, + interpolation="antialiased", + ) + img.set_norm(matplotlib.colors.Normalize(0.0, 1.0)) + seq = anim.FuncAnimation( + fig, + example.step_and_render_frame, + fargs=(img,), + frames=args.num_frames, + blit=True, + interval=8, + repeat=False, + ) + + plt.show() From 9fb1c26265e2c8c88cbbdd5b5cd76907fc87a400 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 25 Sep 2025 15:24:23 +0200 Subject: [PATCH 183/221] logical_data in python are now initialized with a data place, and they can be initialized from a buffer with a CAI interface to support Warp for example --- .../stf/include/cccl/c/experimental/stf/stf.h | 67 ++++++- c/experimental/stf/src/stf.cu | 34 +++- .../experimental/stf/_stf_bindings_impl.pyx | 116 +++++++++++-- python/cuda_cccl/pyproject.toml | 2 +- .../cuda_cccl/tests/stf/example_fluid_warp.py | 164 ++++++++++++++++-- .../tests/stf/test_fdtd_pytorch_simplified.py | 2 +- 6 files changed, 347 insertions(+), 38 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 903b71cd878..49ae71098af 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -456,36 +456,87 @@ cudaStream_t stf_fence(stf_ctx_handle ctx); //! //! \brief Create logical data from existing memory buffer //! -//! Creates logical data handle from an existing host memory buffer. -//! STF takes ownership of data management during task execution. +//! Creates logical data handle from existing memory buffer, assuming host data place. +//! This is a convenience wrapper around stf_logical_data_with_place() with host placement. //! //! \param ctx Context handle //! \param[out] ld Pointer to receive logical data handle -//! \param addr Pointer to existing data buffer +//! \param addr Pointer to existing data buffer (assumed to be host memory) //! \param sz Size of data in bytes //! //! \pre ctx must be valid context handle //! \pre ld must not be NULL -//! \pre addr must not be NULL +//! \pre addr must not be NULL and point to host-accessible memory //! \pre sz must be greater than 0 //! \post *ld contains valid logical data handle //! -//! \note Original data pointer should not be accessed during task execution -//! \note Data will be written back when logical data is destroyed or context finalized +//! \note This function assumes host memory. For device/managed memory, use stf_logical_data_with_place() +//! \note Equivalent to: stf_logical_data_with_place(ctx, ld, addr, sz, make_host_data_place()) //! //! \par Example: //! \code //! float data[1024]; //! stf_logical_data_handle ld; -//! stf_logical_data(ctx, &ld, data, sizeof(data)); +//! stf_logical_data(ctx, &ld, data, sizeof(data)); // Assumes host memory //! // ... use in tasks ... //! stf_logical_data_destroy(ld); //! \endcode //! -//! 
\see stf_logical_data_empty(), stf_logical_data_destroy() +//! \see stf_logical_data_with_place(), stf_logical_data_empty(), stf_logical_data_destroy() void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); +//! +//! \brief Create logical data handle from address with data place specification [PRIMARY API] +//! +//! Creates logical data handle from existing memory buffer, explicitly specifying where +//! the memory is located (host, device, managed, etc.). This is the primary and recommended +//! logical data creation function as it provides STF with essential memory location information +//! for optimal data movement and placement strategies. +//! +//! \param ctx Context handle +//! \param[out] ld Pointer to receive logical data handle +//! \param addr Pointer to existing memory buffer +//! \param sz Size of buffer in bytes +//! \param dplace Data place specifying memory location +//! +//! \pre ctx must be valid context handle +//! \pre ld must be valid pointer to logical data handle pointer +//! \pre addr must point to valid memory of at least sz bytes +//! \pre sz must be greater than 0 +//! \pre dplace must be valid data place (not invalid) +//! +//! \post *ld contains valid logical data handle on success +//! \post Caller owns returned handle (must call stf_logical_data_destroy()) +//! +//! \par Examples: +//! \code +//! // GPU device memory (recommended for CUDA arrays) +//! float* device_ptr; +//! cudaMalloc(&device_ptr, 1000 * sizeof(float)); +//! stf_data_place dplace = make_device_data_place(0); +//! stf_logical_data_handle ld; +//! stf_logical_data_with_place(ctx, &ld, device_ptr, 1000 * sizeof(float), dplace); +//! +//! // Host memory +//! float* host_data = new float[1000]; +//! stf_data_place host_place = make_host_data_place(); +//! stf_logical_data_handle ld_host; +//! stf_logical_data_with_place(ctx, &ld_host, host_data, 1000 * sizeof(float), host_place); +//! +//! // Managed memory +//! float* managed_ptr; +//! cudaMallocManaged(&managed_ptr, 1000 * sizeof(float)); +//! stf_data_place managed_place = make_managed_data_place(); +//! stf_logical_data_handle ld_managed; +//! stf_logical_data_with_place(ctx, &ld_managed, managed_ptr, 1000 * sizeof(float), managed_place); +//! \endcode +//! +//! \see make_device_data_place(), make_host_data_place(), make_managed_data_place() + +void stf_logical_data_with_place( + stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz, stf_data_place dplace); + //! //! \brief Set symbolic name for logical data //! 
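For illustration only (not part of the patch): together with the Python binding changes further below, the data-place-aware creation path added here is meant to be used roughly as follows from Python. The sketch assumes a device-resident Warp array and mirrors the fluid example updated later in this series; note that a later commit in the series renames the package from cuda.cccl.experimental.stf to cuda.stf.

import warp as wp
import cuda.cccl.experimental.stf as cudastf  # package path at this point in the series

ctx = cudastf.context()

# A device-resident Warp array exposing __cuda_array_interface__
rho = wp.zeros((256, 128), dtype=float)

# The buffer lives in GPU memory, so pass an explicit device data place;
# shape and dtype are read from the CUDA Array Interface by logical_data().
rho_ld = ctx.logical_data(rho, cudastf.data_place.device(0))
rho_ld.set_symbol("rho")
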
diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index c08a88b77e1..c601be20e26 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -44,12 +44,44 @@ cudaStream_t stf_fence(stf_ctx_handle ctx) } void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) +{ + // Convenience wrapper: assume host memory + stf_logical_data_with_place(ctx, ld, addr, sz, make_host_data_place()); +} + +void stf_logical_data_with_place( + stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz, stf_data_place dplace) { assert(ctx); assert(ld); auto* context_ptr = static_cast(ctx); - auto ld_typed = context_ptr->logical_data(make_slice((char*) addr, sz)); + + // Convert C data_place to C++ data_place + cuda::experimental::stf::data_place cpp_dplace; + switch (dplace.kind) + { + case STF_DATA_PLACE_HOST: + cpp_dplace = cuda::experimental::stf::data_place::host(); + break; + case STF_DATA_PLACE_DEVICE: + cpp_dplace = cuda::experimental::stf::data_place::device(dplace.u.device.dev_id); + break; + case STF_DATA_PLACE_MANAGED: + cpp_dplace = cuda::experimental::stf::data_place::managed(); + break; + case STF_DATA_PLACE_AFFINE: + cpp_dplace = cuda::experimental::stf::data_place::affine(); + break; + default: + // Invalid data place - this should not happen with valid input + assert(false && "Invalid data_place kind"); + cpp_dplace = cuda::experimental::stf::data_place::host(); // fallback + break; + } + + // Create logical data with the specified data place + auto ld_typed = context_ptr->logical_data(make_slice((char*) addr, sz), cpp_dplace); // Store the logical_data_untyped directly as opaque pointer *ld = new logical_data_untyped{ld_typed}; diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 2d7451f0985..f8551b83da8 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -114,6 +114,7 @@ cdef extern from "cccl/c/experimental/stf/stf.h": ctypedef struct stf_logical_data_handle_t ctypedef stf_logical_data_handle_t* stf_logical_data_handle void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) + void stf_logical_data_with_place(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz, stf_data_place dplace) void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) void stf_logical_data_destroy(stf_logical_data_handle ld) void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle *to) @@ -168,8 +169,9 @@ cdef class logical_data: cdef tuple _shape cdef int _ndim cdef size_t _len + cdef str _symbol # Store symbol for display purposes - def __cinit__(self, context ctx=None, object buf=None, shape=None, dtype=None): + def __cinit__(self, context ctx=None, object buf=None, data_place dplace=None, shape=None, dtype=None): if ctx is None or buf is None: # allow creation via __new__ (eg. 
in like_empty) self._ld = NULL @@ -178,28 +180,90 @@ cdef class logical_data: self._dtype = None self._shape = () self._ndim = 0 + self._symbol = None return + self._ctx = ctx._ctx + self._symbol = None # Initialize symbol + + # Default to host data place if not specified (matches C++ API) + if dplace is None: + dplace = data_place.host() + + # Try CUDA Array Interface first + if hasattr(buf, '__cuda_array_interface__'): + cai = buf.__cuda_array_interface__ + + # Extract CAI information + data_ptr, readonly = cai['data'] + original_shape = cai['shape'] + typestr = cai['typestr'] + + # Handle vector types automatically (e.g., wp.vec2, wp.vec3) + # STF treats these as flat scalar arrays with an additional dimension + if typestr.startswith('|V'): # Vector type (e.g., '|V8' for vec2, '|V12' for vec3) + vector_size = int(typestr[2:]) # Extract size from '|V8' -> 8 bytes + + if vector_size == 8: # vec2 (2 * 4 bytes float32) + self._shape = original_shape + (2,) + self._dtype = np.dtype(' {self._dtype} with shape {self._shape}") + else: + # Regular scalar type + self._shape = original_shape + self._dtype = np.dtype(typestr) + + self._ndim = len(self._shape) + + # Calculate total size in bytes + itemsize = self._dtype.itemsize + total_items = 1 + for dim in self._shape: + total_items *= dim + self._len = total_items * itemsize + + # Create STF logical data using the new C API with data place specification + stf_logical_data_with_place(ctx._ctx, &self._ld, data_ptr, self._len, dplace._c_place) + return + + # Fallback to Python buffer protocol cdef Py_buffer view cdef int flags = PyBUF_FORMAT | PyBUF_ND # request dtype + shape - self._ctx = ctx._ctx - if PyObject_GetBuffer(buf, &view, flags) != 0: - raise ValueError("object doesn’t support the full buffer protocol") + raise ValueError("object doesn't support the full buffer protocol or __cuda_array_interface__") try: self._ndim = view.ndim self._len = view.len self._shape = tuple(view.shape[i] for i in range(view.ndim)) self._dtype = np.dtype(view.format) - stf_logical_data(ctx._ctx, &self._ld, view.buf, view.len) + # For buffer protocol objects, use the specified data place (defaults to host) + stf_logical_data_with_place(ctx._ctx, &self._ld, view.buf, view.len, dplace._c_place) finally: PyBuffer_Release(&view) + def set_symbol(self, str name): stf_logical_data_set_symbol(self._ld, name.encode()) + self._symbol = name # Store locally for retrieval + + @property + def symbol(self): + """Get the symbol name of this logical data, if set.""" + return self._symbol def __dealloc__(self): if self._ld != NULL: @@ -240,6 +304,7 @@ cdef class logical_data: out._shape = self._shape out._ndim = self._ndim out._len = self._len + out._symbol = None # New object has no symbol initially return out @@ -254,6 +319,7 @@ cdef class logical_data: out._shape = shape out._ndim = len(shape) out._len = math.prod(shape) * out._dtype.itemsize + out._symbol = None # New object has no symbol initially stf_logical_data_empty(ctx._ctx, out._len, &out._ld) return out @@ -559,16 +625,46 @@ cdef class context: stf_ctx_finalize(self._ctx) self._ctx = NULL - def logical_data(self, object buf): + def logical_data(self, object buf, data_place dplace=None): """ - Create and return a `logical_data` object bound to this context. + Create and return a `logical_data` object bound to this context [PRIMARY API]. + + This is the primary function for creating logical data from existing buffers. 
+ It supports both Python buffer protocol objects and CUDA Array Interface objects, + with explicit data_place specification for optimal STF data movement strategies. Parameters ---------- - buf : any buffer‑supporting Python object - (NumPy array, bytes, bytearray, memoryview, …) + buf : any buffer‑supporting Python object or __cuda_array_interface__ object + (NumPy array, Warp array, CuPy array, bytes, bytearray, memoryview, …) + dplace : data_place, optional + Specifies where the buffer is located (host, device, managed, affine). + Defaults to data_place.host() for backward compatibility. + Essential for GPU arrays - use data_place.device() for optimal performance. + + Examples + -------- + >>> # Host memory (explicit - recommended) + >>> host_place = data_place.host() + >>> ld = ctx.logical_data(numpy_array, host_place) + >>> + >>> # GPU device memory (recommended for CUDA arrays) + >>> device_place = data_place.device(0) + >>> ld = ctx.logical_data(warp_array, device_place) + >>> + >>> # Managed/unified memory + >>> managed_place = data_place.managed() + >>> ld = ctx.logical_data(unified_array, managed_place) + >>> + >>> # Backward compatibility (defaults to host) + >>> ld = ctx.logical_data(numpy_array) # Same as specifying host + + Note + ---- + For GPU arrays (Warp, CuPy, etc.), always specify data_place.device() + for zero-copy performance and correct memory management. """ - return logical_data(self, buf) + return logical_data(self, buf, dplace) def logical_data_empty(self, shape, dtype=None): diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index 317606e7ad3..4b88822ab41 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "cuda-pathfinder>=1.2.3", "cuda-core", "numba-cuda @ git+https://github.com/caugonnet/numba-cuda.git@cuda_graph_future_memory", - "llvmlite==0.44", # TODO: remove this once numba-cuda 0.19.2 is released + "llvmlite==0.44", # TODO: remove this once numba-cuda 0.19.2 is released ] dynamic = ["version"] diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py index abd93d9e6e8..7797d30a2d5 100644 --- a/python/cuda_cccl/tests/stf/example_fluid_warp.py +++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py @@ -28,6 +28,7 @@ import cuda.cccl.experimental.stf as cudastf + def stf_kernel(pyfunc): # let warp decorate normally kernel = wp.kernel(pyfunc) @@ -35,7 +36,46 @@ def stf_kernel(pyfunc): # attach an STF-aware call operator def _stf_call(*args, dim=None, stream=None, **kwargs): print(f"[STF TRACE] {pyfunc.__name__}") - print(f" dim={dim}, stream={stream}, args={args}, kwargs={kwargs}") + print(f" dim={dim}, stream={stream}") + + # Enhanced arg display with logical data detection + if args: + print(" args=[") + for i, arg in enumerate(args): + # Detect if argument is or contains STF logical data + is_logical_data = False + symbol = None + + # Check if arg is directly STF logical data + if hasattr(arg, "__class__") and "logical_data" in str(type(arg)): + is_logical_data = True + if hasattr(arg, "symbol") and arg.symbol: + symbol = arg.symbol + # Check if arg has attached STF logical data (Warp array) + elif hasattr(arg, "_stf_ld"): + is_logical_data = True + if hasattr(arg._stf_ld, "symbol") and arg._stf_ld.symbol: + symbol = arg._stf_ld.symbol + # Fallback to _name for Warp arrays + elif hasattr(arg, "_name") and arg._name: + symbol = arg._name + + if is_logical_data: + if symbol: + print(f" [{i}]: '{symbol}' 
[logical_data]") + else: + print(f" [{i}]: logical_data") + else: + # Regular arguments (scalars, etc.) + if hasattr(arg, "shape"): # Array-like but not logical data + print(f" [{i}]: {type(arg).__name__}") + else: # Scalar value + print(f" [{i}]: {arg}") + print(" ]") + else: + print(f" args={args}") + + print(f" kwargs={kwargs}") return wp.stf.launch(kernel, dim=dim, inputs=args, stream=stream, **kwargs) # monkey-patch a method onto the kernel object @@ -43,11 +83,49 @@ def _stf_call(*args, dim=None, stream=None, **kwargs): return kernel + def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): print(f"[STF TRACE] launching kernel: {getattr(kernel, '__name__', kernel)}") print(f" dim = {dim}") print(f" stream = {stream}") - print(f" inputs = {inputs}") + + # Enhanced input display with logical data detection + if inputs: + print(" inputs = [") + for i, inp in enumerate(inputs): + # Detect if input is or contains STF logical data + is_logical_data = False + symbol = None + + # Check if inp is directly STF logical data + if hasattr(inp, "__class__") and "logical_data" in str(type(inp)): + is_logical_data = True + if hasattr(inp, "symbol") and inp.symbol: + symbol = inp.symbol + # Check if inp has attached STF logical data (Warp array) + elif hasattr(inp, "_stf_ld"): + is_logical_data = True + if hasattr(inp._stf_ld, "symbol") and inp._stf_ld.symbol: + symbol = inp._stf_ld.symbol + # Fallback to _name for Warp arrays + elif hasattr(inp, "_name") and inp._name: + symbol = inp._name + + if is_logical_data: + if symbol: + print(f" [{i}]: '{symbol}' [logical_data]") + else: + print(f" [{i}]: logical_data") + else: + # Regular arguments (scalars, etc.) + if hasattr(inp, "shape"): # Array-like but not logical data + print(f" [{i}]: {type(inp).__name__}") + else: # Scalar value + print(f" [{i}]: {inp}") + print(" ]") + else: + print(f" inputs = {inputs}") + print(f" kwargs = {kwargs}") # just forward to warp for now @@ -62,8 +140,10 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): # put it under wp.stf if not hasattr(wp, "stf"): + class _stf: pass + wp.stf = _stf() @@ -159,7 +239,11 @@ def divergence(u: wp.array2d(dtype=wp.vec2), div: wp.array2d(dtype=float)): @wp.stf.kernel -def pressure_solve(p0: wp.array2d(dtype=float), p1: wp.array2d(dtype=float), div: wp.array2d(dtype=float)): +def pressure_solve( + p0: wp.array2d(dtype=float), + p1: wp.array2d(dtype=float), + div: wp.array2d(dtype=float), +): i, j = wp.tid() s1 = lookup_float(p0, i - 1, j) @@ -203,7 +287,12 @@ def integrate(u: wp.array2d(dtype=wp.vec2), rho: wp.array2d(dtype=float), dt: fl @wp.stf.kernel -def init(rho: wp.array2d(dtype=float), u: wp.array2d(dtype=wp.vec2), radius: int, dir: wp.vec2): +def init( + rho: wp.array2d(dtype=float), + u: wp.array2d(dtype=wp.vec2), + radius: int, + dir: wp.vec2, +): i, j = wp.tid() d = wp.length(wp.vec2(float(i - grid_width / 2), float(j - grid_height / 2))) @@ -236,14 +325,43 @@ def __init__(self): self.p1 = wp.zeros(shape, dtype=float) self.div = wp.zeros(shape, dtype=float) - self.u0._stf_ld = self._stf_ctx.logical_data(self.u0) + # Create STF logical data from Warp arrays with explicit data place + # Warp arrays are on GPU device memory, so specify data_place.device() + + # For regular float arrays, specify device data place + device_place = cudastf.data_place.device(0) + + self.rho0._stf_ld = self._stf_ctx.logical_data(self.rho0, device_place) + self.rho1._stf_ld = self._stf_ctx.logical_data(self.rho1, device_place) + self.p0._stf_ld = 
self._stf_ctx.logical_data(self.p0, device_place) + self.p1._stf_ld = self._stf_ctx.logical_data(self.p1, device_place) + self.div._stf_ld = self._stf_ctx.logical_data(self.div, device_place) + + # vec2 arrays - STF now automatically handles vector type flattening + # Store STF logical data consistently with other arrays + self.u0._stf_ld = self._stf_ctx.logical_data(self.u0, device_place) + self.u1._stf_ld = self._stf_ctx.logical_data(self.u1, device_place) + print( + "✅ Successfully created vec2 STF logical data (automatically flattened by STF)!" + ) + + print("✅ All arrays created with explicit data place specification!") + + # Set descriptive symbols for STF logical data (for enhanced tracing) + self.rho0._stf_ld.set_symbol("density_current") + self.rho1._stf_ld.set_symbol("density_next") + self.p0._stf_ld.set_symbol("pressure_current") + self.p1._stf_ld.set_symbol("pressure_next") + self.div._stf_ld.set_symbol("velocity_divergence") + self.u0._stf_ld.set_symbol("velocity_current") + self.u1._stf_ld.set_symbol("velocity_next") + print("✅ Set descriptive symbols for STF logical data!") + + # Set Warp array names (for Warp tracing) self.u0._name = "u0" - self.u1._name = "u1" - self.rho0._name = "rho0" self.rho1._name = "rho1" - self.p0._name = "p0" self.p1._name = "p1" self.div._name = "div" @@ -276,17 +394,21 @@ def step(self): self.p0.zero_() self.p1.zero_() - # if self.use_cuda_graph: - #  wp.capture_launch(self.graph) - # else: - #  self.pressure_iterations() + # if self.use_cuda_graph: + # wp.capture_launch(self.graph) + # else: + # self.pressure_iterations() self.pressure_iterations() # velocity update wp.stf.launch(pressure_apply, dim=shape, inputs=[self.p0, self.u0]) # semi-Lagrangian advection - wp.stf.launch(advect, dim=shape, inputs=[self.u0, self.u1, self.rho0, self.rho1, dt]) + wp.stf.launch( + advect, + dim=shape, + inputs=[self.u0, self.u1, self.rho0, self.rho1, dt], + ) # swap buffers (self.u0, self.u1) = (self.u1, self.u0) @@ -296,7 +418,9 @@ def step(self): def pressure_iterations(self): for _ in range(self.iterations): - wp.stf.launch(pressure_solve, dim=self.p0.shape, inputs=[self.p0, self.p1, self.div]) + wp.stf.launch( + pressure_solve, dim=self.p0.shape, inputs=[self.p0, self.p1, self.div] + ) # swap pressure fields (self.p0, self.p1) = (self.p1, self.p0) @@ -314,9 +438,15 @@ def step_and_render_frame(self, frame_num=None, img=None): if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("--device", type=str, default=None, help="Override the default Warp device.") - parser.add_argument("--num_frames", type=int, default=100000, help="Total number of frames.") + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument( + "--device", type=str, default=None, help="Override the default Warp device." + ) + parser.add_argument( + "--num_frames", type=int, default=100000, help="Total number of frames." 
+ ) parser.add_argument( "--headless", action="store_true", diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py index 85f7b856bd4..659fc43bfd4 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py @@ -210,4 +210,4 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # Run simplified FDTD simulation using pytorch_task print("Running FDTD simulation with pytorch_task syntax...") - test_fdtd_3d_pytorch_simplified(timesteps=1000, output_freq=5) \ No newline at end of file + test_fdtd_3d_pytorch_simplified(timesteps=1000, output_freq=5) From 5c1d50e46be335c72636e0b88061d13e2aa61f83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 25 Sep 2025 20:56:54 +0200 Subject: [PATCH 184/221] Save WIP: add access modes --- .../cuda_cccl/tests/stf/example_fluid_warp.py | 217 ++++++++++++++---- 1 file changed, 168 insertions(+), 49 deletions(-) diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py index 7797d30a2d5..56c7800e1ea 100644 --- a/python/cuda_cccl/tests/stf/example_fluid_warp.py +++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py @@ -89,50 +89,100 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): print(f" dim = {dim}") print(f" stream = {stream}") - # Enhanced input display with logical data detection + # Process STF dependencies and extract arrays for wp.launch + processed_inputs = [] + stf_dependencies = [] + + # Enhanced input display with STF dependency detection if inputs: print(" inputs = [") for i, inp in enumerate(inputs): - # Detect if input is or contains STF logical data - is_logical_data = False - symbol = None - - # Check if inp is directly STF logical data - if hasattr(inp, "__class__") and "logical_data" in str(type(inp)): - is_logical_data = True - if hasattr(inp, "symbol") and inp.symbol: - symbol = inp.symbol - # Check if inp has attached STF logical data (Warp array) - elif hasattr(inp, "_stf_ld"): - is_logical_data = True - if hasattr(inp._stf_ld, "symbol") and inp._stf_ld.symbol: - symbol = inp._stf_ld.symbol - # Fallback to _name for Warp arrays - elif hasattr(inp, "_name") and inp._name: - symbol = inp._name - - if is_logical_data: + # Check if input is STF dependency wrapper + if isinstance(inp, STFDependency): + # Extract STF dependency information + stf_dependencies.append({ + 'index': i, + 'array': inp.array, + 'mode': inp.mode, + 'data_place': inp.data_place + }) + + # Get symbol for display (STF deps ALWAYS have _stf_ld) + symbol = None + if hasattr(inp.array._stf_ld, 'symbol') and inp.array._stf_ld.symbol: + symbol = inp.array._stf_ld.symbol + elif hasattr(inp.array, '_name') and inp.array._name: + symbol = inp.array._name + if symbol: - print(f" [{i}]: '{symbol}' [logical_data]") + print(f" [{i}]: '{symbol}' [{inp.mode}] [stf_dep]") else: - print(f" [{i}]: logical_data") + print(f" [{i}]: logical_data [{inp.mode}] [stf_dep]") + + # Add unwrapped array to processed inputs + processed_inputs.append(inp.array) + else: - # Regular arguments (scalars, etc.) 
- if hasattr(inp, "shape"): # Array-like but not logical data - print(f" [{i}]: {type(inp).__name__}") - else: # Scalar value - print(f" [{i}]: {inp}") + # Regular input - detect logical data for display + is_logical_data = False + symbol = None + + # Check if inp is directly STF logical data + if hasattr(inp, "__class__") and "logical_data" in str(type(inp)): + is_logical_data = True + if hasattr(inp, "symbol") and inp.symbol: + symbol = inp.symbol + # Check if inp has attached STF logical data (Warp array) + elif hasattr(inp, "_stf_ld"): + is_logical_data = True + if hasattr(inp._stf_ld, "symbol") and inp._stf_ld.symbol: + symbol = inp._stf_ld.symbol + # Fallback to _name for Warp arrays + elif hasattr(inp, "_name") and inp._name: + symbol = inp._name + + if is_logical_data: + if symbol: + print(f" [{i}]: '{symbol}' [logical_data]") + else: + print(f" [{i}]: logical_data") + else: + # Regular arguments (scalars, etc.) + if hasattr(inp, "shape"): # Array-like but not logical data + print(f" [{i}]: {type(inp).__name__}") + else: # Scalar value + print(f" [{i}]: {inp}") + + processed_inputs.append(inp) + print(" ]") else: print(f" inputs = {inputs}") + # Show STF dependency summary + if stf_dependencies: + print(" stf_deps = [") + for dep in stf_dependencies: + # All STF dependencies are guaranteed to have _stf_ld + symbol = None + if hasattr(dep['array']._stf_ld, 'symbol') and dep['array']._stf_ld.symbol: + symbol = dep['array']._stf_ld.symbol + elif hasattr(dep['array'], '_name') and dep['array']._name: + symbol = dep['array']._name + + if symbol: + print(f" {dep['mode'].upper()}: '{symbol}'") + else: + print(f" {dep['mode'].upper()}: logical_data") + print(" ]") + print(f" kwargs = {kwargs}") - # just forward to warp for now + # Launch with processed (unwrapped) inputs return wp.launch( kernel, dim=dim, - inputs=inputs, + inputs=processed_inputs, stream=stream, **kwargs, ) @@ -147,11 +197,63 @@ class _stf: wp.stf = _stf() +# STF dependency wrapper class +class STFDependency: + """Wrapper for STF task dependencies with access mode specification.""" + def __init__(self, array, mode, data_place=None): + # CRITICAL: STF dependencies MUST have logical data attached + if not hasattr(array, '_stf_ld'): + raise ValueError( + f"STF dependency requires array with logical data (_stf_ld). " + f"Array {type(array).__name__} does not have STF logical data. " + f"Create logical data first: array._stf_ld = ctx.logical_data(array, data_place)" + ) + + self.array = array + self.mode = mode # 'read', 'write', 'rw' + self.data_place = data_place + + def __repr__(self): + symbol = None + if hasattr(self.array._stf_ld, 'symbol'): + symbol = self.array._stf_ld.symbol + elif hasattr(self.array, '_name'): + symbol = self.array._name + + if symbol: + return f"STFDependency('{symbol}', {self.mode})" + else: + return f"STFDependency({type(self.array).__name__}, {self.mode})" + +def stf_read(array, data_place=None): + """Mark array as read-only dependency for STF task. + + REQUIRES: array must have _stf_ld (STF logical data) attached. + """ + return STFDependency(array, 'read', data_place) + +def stf_write(array, data_place=None): + """Mark array as write-only dependency for STF task. + + REQUIRES: array must have _stf_ld (STF logical data) attached. + """ + return STFDependency(array, 'write', data_place) + +def stf_rw(array, data_place=None): + """Mark array as read-write dependency for STF task. + + REQUIRES: array must have _stf_ld (STF logical data) attached. 
+ """ + return STFDependency(array, 'rw', data_place) + wp.stf.kernel = stf_kernel wp.stf.launch = stf_launch +wp.stf.read = stf_read +wp.stf.write = stf_write +wp.stf.rw = stf_rw -grid_width = wp.constant(256) -grid_height = wp.constant(128) +grid_width = wp.constant(256*4) +grid_height = wp.constant(128*4) @wp.func @@ -307,7 +409,7 @@ def __init__(self): fps = 60 self.frame_dt = 1.0 / fps self.sim_substeps = 2 - self.iterations = 100 # Number of pressure iterations + self.iterations = 4 #100 # Number of pressure iterations self.sim_dt = self.frame_dt / self.sim_substeps self.sim_time = 0.0 @@ -384,31 +486,46 @@ def step(self): vel = wp.vec2(math.cos(angle) * speed, math.sin(angle) * speed) # update emitters - wp.stf.launch(init, dim=shape, inputs=[self.rho0, self.u0, 5, vel]) + wp.stf.launch(init, dim=shape, inputs=[ + wp.stf.write(self.rho0), # Only writes: rho[i, j] = 1.0 + wp.stf.write(self.u0), # Only writes: u[i, j] = dir + 5, vel + ]) # force integrate - wp.stf.launch(integrate, dim=shape, inputs=[self.u0, self.rho0, dt]) - wp.stf.launch(divergence, dim=shape, inputs=[self.u0, self.div]) + wp.stf.launch(integrate, dim=shape, inputs=[ + wp.stf.rw(self.u0), # Read then write: u[i, j] = u[i, j] + dt * f_g + wp.stf.rw(self.rho0), # Read then write: rho[i, j] = rho[i, j] * (1.0 - 0.1 * dt) + dt + ]) + wp.stf.launch(divergence, dim=shape, inputs=[ + wp.stf.read(self.u0), # Only reads: u[i + 1, j], u[i, j], etc. + wp.stf.write(self.div) # Only writes: div[i, j] = dx + dy + ]) # pressure solve self.p0.zero_() self.p1.zero_() - # if self.use_cuda_graph: - # wp.capture_launch(self.graph) - # else: - # self.pressure_iterations() - self.pressure_iterations() + if self.use_cuda_graph: + wp.capture_launch(self.graph) + else: + self.pressure_iterations() # velocity update - wp.stf.launch(pressure_apply, dim=shape, inputs=[self.p0, self.u0]) + wp.stf.launch(pressure_apply, dim=shape, inputs=[ + wp.stf.read(self.p0), # Only reads: p[i + 1, j] - p[i - 1, j], etc. + wp.stf.rw(self.u0) # Read then write: u[i, j] = u[i, j] - f_p + ]) # semi-Lagrangian advection - wp.stf.launch( - advect, - dim=shape, - inputs=[self.u0, self.u1, self.rho0, self.rho1, dt], - ) + wp.stf.launch(advect, dim=shape, inputs=[ + wp.stf.read(self.u0), # Only reads: u0[i, j] and sample_vel(u0, ...) + wp.stf.write(self.u1), # Only writes: u1[i, j] = sample_vel(...) + wp.stf.read(self.rho0), # Only reads: sample_float(rho0, ...) + wp.stf.write(self.rho1), # Only writes: rho1[i, j] = sample_float(...) + dt + ]) # swap buffers (self.u0, self.u1) = (self.u1, self.u0) @@ -418,9 +535,11 @@ def step(self): def pressure_iterations(self): for _ in range(self.iterations): - wp.stf.launch( - pressure_solve, dim=self.p0.shape, inputs=[self.p0, self.p1, self.div] - ) + wp.stf.launch(pressure_solve, dim=self.p0.shape, inputs=[ + wp.stf.read(self.p0), # Only reads: lookup_float(p0, ...) 
+ wp.stf.write(self.p1), # Only writes: p1[i, j] = err * 0.25 + wp.stf.read(self.div) # Only reads: div[i, j] + ]) # swap pressure fields (self.p0, self.p1) = (self.p1, self.p0) From 9f31b1e82500d5a60ea13062fe631f5c42edf02b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 25 Sep 2025 22:19:50 +0200 Subject: [PATCH 185/221] cleanups --- .../cuda_cccl/tests/stf/example_fluid_warp.py | 250 ++++++++++++------ 1 file changed, 170 insertions(+), 80 deletions(-) diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py index 56c7800e1ea..5b6bd40464d 100644 --- a/python/cuda_cccl/tests/stf/example_fluid_warp.py +++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py @@ -100,28 +100,30 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): # Check if input is STF dependency wrapper if isinstance(inp, STFDependency): # Extract STF dependency information - stf_dependencies.append({ - 'index': i, - 'array': inp.array, - 'mode': inp.mode, - 'data_place': inp.data_place - }) - + stf_dependencies.append( + { + "index": i, + "array": inp.array, + "mode": inp.mode, + "data_place": inp.data_place, + } + ) + # Get symbol for display (STF deps ALWAYS have _stf_ld) symbol = None - if hasattr(inp.array._stf_ld, 'symbol') and inp.array._stf_ld.symbol: + if hasattr(inp.array._stf_ld, "symbol") and inp.array._stf_ld.symbol: symbol = inp.array._stf_ld.symbol - elif hasattr(inp.array, '_name') and inp.array._name: + elif hasattr(inp.array, "_name") and inp.array._name: symbol = inp.array._name - + if symbol: print(f" [{i}]: '{symbol}' [{inp.mode}] [stf_dep]") else: print(f" [{i}]: logical_data [{inp.mode}] [stf_dep]") - + # Add unwrapped array to processed inputs processed_inputs.append(inp.array) - + else: # Regular input - detect logical data for display is_logical_data = False @@ -152,9 +154,9 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): print(f" [{i}]: {type(inp).__name__}") else: # Scalar value print(f" [{i}]: {inp}") - + processed_inputs.append(inp) - + print(" ]") else: print(f" inputs = {inputs}") @@ -165,11 +167,11 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): for dep in stf_dependencies: # All STF dependencies are guaranteed to have _stf_ld symbol = None - if hasattr(dep['array']._stf_ld, 'symbol') and dep['array']._stf_ld.symbol: - symbol = dep['array']._stf_ld.symbol - elif hasattr(dep['array'], '_name') and dep['array']._name: - symbol = dep['array']._name - + if hasattr(dep["array"]._stf_ld, "symbol") and dep["array"]._stf_ld.symbol: + symbol = dep["array"]._stf_ld.symbol + elif hasattr(dep["array"], "_name") and dep["array"]._name: + symbol = dep["array"]._name + if symbol: print(f" {dep['mode'].upper()}: '{symbol}'") else: @@ -178,14 +180,68 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): print(f" kwargs = {kwargs}") - # Launch with processed (unwrapped) inputs - return wp.launch( - kernel, - dim=dim, - inputs=processed_inputs, - stream=stream, - **kwargs, - ) + # STF launch REQUIRES STF dependencies - otherwise use regular wp.launch + if not stf_dependencies: + raise ValueError( + "wp.stf.launch() requires STF dependencies (wp.stf.read/write/rw). " + f"Found {len(inputs)} inputs but none are STF dependencies. " + "Either use regular wp.launch() or wrap arrays with wp.stf.read/write/rw(array)." 
+ ) + + # STF Task-based launch with automatic dependency management + print(" → Creating STF task with dependencies") + + # Extract the STF context from the first dependency + first_dep = stf_dependencies[0] + stf_ctx = first_dep["array"]._stf_ld.borrow_ctx_handle() + + # Create STF dependency objects for the task + stf_task_deps = [] + for dep in stf_dependencies: + stf_ld = dep["array"]._stf_ld + if dep["mode"] == "read": + stf_task_deps.append(stf_ld.read()) + elif dep["mode"] == "write": + stf_task_deps.append(stf_ld.write()) + elif dep["mode"] == "rw": + stf_task_deps.append(stf_ld.rw()) + + # Create and execute STF task + with stf_ctx.task(*stf_task_deps) as stf_task: + # Get raw CUDA stream pointer from STF task + stf_stream_ptr = stf_task.stream_ptr() + + print(f" → STF task stream ptr: {stf_stream_ptr}") + print(" → Launching kernel within STF task context") + + # Wrap STF stream via PyTorch ExternalStream -> Warp conversion + print(f" → STF task stream ptr: {stf_stream_ptr}") + print(" → Creating PyTorch ExternalStream from STF stream") + + # Import PyTorch for stream conversion + import torch + + # Get the current CUDA device for PyTorch + warp_device = wp.get_device() + device_id = warp_device.ordinal # Get device number (e.g., 0 for cuda:0) + torch_device = torch.device(f"cuda:{device_id}") + + # Create PyTorch ExternalStream from STF stream pointer with explicit device + torch_stream = torch.cuda.ExternalStream(stf_stream_ptr, device=torch_device) + + # Convert PyTorch stream to Warp stream + warp_stream = wp.stream_from_torch(torch_stream) + + print(f" → Successfully wrapped STF stream via PyTorch: {warp_stream}") + + # Launch with properly wrapped STF stream + return wp.launch( + kernel, + dim=dim, + inputs=processed_inputs, + stream=warp_stream, + **kwargs, + ) # put it under wp.stf @@ -197,63 +253,68 @@ class _stf: wp.stf = _stf() -# STF dependency wrapper class +# STF dependency wrapper class class STFDependency: """Wrapper for STF task dependencies with access mode specification.""" + def __init__(self, array, mode, data_place=None): # CRITICAL: STF dependencies MUST have logical data attached - if not hasattr(array, '_stf_ld'): + if not hasattr(array, "_stf_ld"): raise ValueError( f"STF dependency requires array with logical data (_stf_ld). " f"Array {type(array).__name__} does not have STF logical data. " f"Create logical data first: array._stf_ld = ctx.logical_data(array, data_place)" ) - + self.array = array self.mode = mode # 'read', 'write', 'rw' self.data_place = data_place - + def __repr__(self): symbol = None - if hasattr(self.array._stf_ld, 'symbol'): + if hasattr(self.array._stf_ld, "symbol"): symbol = self.array._stf_ld.symbol - elif hasattr(self.array, '_name'): + elif hasattr(self.array, "_name"): symbol = self.array._name - + if symbol: return f"STFDependency('{symbol}', {self.mode})" else: return f"STFDependency({type(self.array).__name__}, {self.mode})" + def stf_read(array, data_place=None): """Mark array as read-only dependency for STF task. - + REQUIRES: array must have _stf_ld (STF logical data) attached. """ - return STFDependency(array, 'read', data_place) + return STFDependency(array, "read", data_place) + def stf_write(array, data_place=None): """Mark array as write-only dependency for STF task. - + REQUIRES: array must have _stf_ld (STF logical data) attached. 
- """ - return STFDependency(array, 'write', data_place) + """ + return STFDependency(array, "write", data_place) + def stf_rw(array, data_place=None): """Mark array as read-write dependency for STF task. - + REQUIRES: array must have _stf_ld (STF logical data) attached. """ - return STFDependency(array, 'rw', data_place) + return STFDependency(array, "rw", data_place) + wp.stf.kernel = stf_kernel wp.stf.launch = stf_launch wp.stf.read = stf_read -wp.stf.write = stf_write +wp.stf.write = stf_write wp.stf.rw = stf_rw -grid_width = wp.constant(256*4) -grid_height = wp.constant(128*4) +grid_width = wp.constant(256 * 4) +grid_height = wp.constant(128 * 4) @wp.func @@ -408,12 +469,17 @@ class Example: def __init__(self): fps = 60 self.frame_dt = 1.0 / fps - self.sim_substeps = 2 - self.iterations = 4 #100 # Number of pressure iterations + self.sim_substeps = 10 + self.iterations = 100 # Number of pressure iterations self.sim_dt = self.frame_dt / self.sim_substeps self.sim_time = 0.0 - self._stf_ctx = cudastf.context() + # Create STF context for task-based scheduling + # This enables automatic dependency management and stream orchestration + import torch + + torch.cuda.init() + self._stf_ctx = cudastf.context() # use_graph=True) shape = (grid_width, grid_height) @@ -439,8 +505,6 @@ def __init__(self): self.p1._stf_ld = self._stf_ctx.logical_data(self.p1, device_place) self.div._stf_ld = self._stf_ctx.logical_data(self.div, device_place) - # vec2 arrays - STF now automatically handles vector type flattening - # Store STF logical data consistently with other arrays self.u0._stf_ld = self._stf_ctx.logical_data(self.u0, device_place) self.u1._stf_ld = self._stf_ctx.logical_data(self.u1, device_place) print( @@ -469,7 +533,7 @@ def __init__(self): self.div._name = "div" # capture pressure solve as a CUDA graph - self.use_cuda_graph = wp.get_device().is_cuda + self.use_cuda_graph = False # wp.get_device().is_cuda if self.use_cuda_graph: with wp.ScopedCapture() as capture: self.pressure_iterations() @@ -486,24 +550,38 @@ def step(self): vel = wp.vec2(math.cos(angle) * speed, math.sin(angle) * speed) # update emitters - wp.stf.launch(init, dim=shape, inputs=[ - wp.stf.write(self.rho0), # Only writes: rho[i, j] = 1.0 - wp.stf.write(self.u0), # Only writes: u[i, j] = dir - 5, vel - ]) + wp.stf.launch( + init, + dim=shape, + inputs=[ + wp.stf.write(self.rho0), + wp.stf.write(self.u0), + 5, + vel, + ], + ) # force integrate - wp.stf.launch(integrate, dim=shape, inputs=[ - wp.stf.rw(self.u0), # Read then write: u[i, j] = u[i, j] + dt * f_g - wp.stf.rw(self.rho0), # Read then write: rho[i, j] = rho[i, j] * (1.0 - 0.1 * dt) - dt - ]) - wp.stf.launch(divergence, dim=shape, inputs=[ - wp.stf.read(self.u0), # Only reads: u[i + 1, j], u[i, j], etc. - wp.stf.write(self.div) # Only writes: div[i, j] = dx + dy - ]) + wp.stf.launch( + integrate, + dim=shape, + inputs=[ + wp.stf.rw(self.u0), + wp.stf.rw(self.rho0), + dt, + ], + ) + wp.stf.launch( + divergence, + dim=shape, + inputs=[ + wp.stf.read(self.u0), + wp.stf.write(self.div), + ], + ) # pressure solve + # TODO tasks ? self.p0.zero_() self.p1.zero_() @@ -513,19 +591,27 @@ def step(self): self.pressure_iterations() # velocity update - wp.stf.launch(pressure_apply, dim=shape, inputs=[ - wp.stf.read(self.p0), # Only reads: p[i + 1, j] - p[i - 1, j], etc. 
- wp.stf.rw(self.u0) # Read then write: u[i, j] = u[i, j] - f_p - ]) + wp.stf.launch( + pressure_apply, + dim=shape, + inputs=[ + wp.stf.read(self.p0), + wp.stf.rw(self.u0), + ], + ) # semi-Lagrangian advection - wp.stf.launch(advect, dim=shape, inputs=[ - wp.stf.read(self.u0), # Only reads: u0[i, j] and sample_vel(u0, ...) - wp.stf.write(self.u1), # Only writes: u1[i, j] = sample_vel(...) - wp.stf.read(self.rho0), # Only reads: sample_float(rho0, ...) - wp.stf.write(self.rho1), # Only writes: rho1[i, j] = sample_float(...) - dt - ]) + wp.stf.launch( + advect, + dim=shape, + inputs=[ + wp.stf.read(self.u0), + wp.stf.write(self.u1), + wp.stf.read(self.rho0), + wp.stf.write(self.rho1), + dt, + ], + ) # swap buffers (self.u0, self.u1) = (self.u1, self.u0) @@ -535,11 +621,15 @@ def step(self): def pressure_iterations(self): for _ in range(self.iterations): - wp.stf.launch(pressure_solve, dim=self.p0.shape, inputs=[ - wp.stf.read(self.p0), # Only reads: lookup_float(p0, ...) - wp.stf.write(self.p1), # Only writes: p1[i, j] = err * 0.25 - wp.stf.read(self.div) # Only reads: div[i, j] - ]) + wp.stf.launch( + pressure_solve, + dim=self.p0.shape, + inputs=[ + wp.stf.read(self.p0), + wp.stf.write(self.p1), + wp.stf.read(self.div), + ], + ) # swap pressure fields (self.p0, self.p1) = (self.p1, self.p0) From c0bb0704cc9ce9c25ac8bcf2fa61a416704d6696 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 25 Sep 2025 22:34:19 +0200 Subject: [PATCH 186/221] Save WIP --- .../cuda_cccl/tests/stf/example_fluid_warp.py | 175 ++++++++++-------- 1 file changed, 102 insertions(+), 73 deletions(-) diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py index 5b6bd40464d..d3cb1f4e358 100644 --- a/python/cuda_cccl/tests/stf/example_fluid_warp.py +++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py @@ -85,17 +85,12 @@ def _stf_call(*args, dim=None, stream=None, **kwargs): def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): - print(f"[STF TRACE] launching kernel: {getattr(kernel, '__name__', kernel)}") - print(f" dim = {dim}") - print(f" stream = {stream}") - # Process STF dependencies and extract arrays for wp.launch processed_inputs = [] stf_dependencies = [] - # Enhanced input display with STF dependency detection + # Process inputs to separate STF dependencies from regular arguments if inputs: - print(" inputs = [") for i, inp in enumerate(inputs): # Check if input is STF dependency wrapper if isinstance(inp, STFDependency): @@ -108,7 +103,104 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): "data_place": inp.data_place, } ) + # Add unwrapped array to processed inputs + processed_inputs.append(inp.array) + else: + processed_inputs.append(inp) + + # STF launch REQUIRES STF dependencies - otherwise use regular wp.launch + if not stf_dependencies: + raise ValueError( + "wp.stf.launch() requires STF dependencies (wp.stf.read/write/rw). " + f"Found {len(inputs)} inputs but none are STF dependencies. " + "Either use regular wp.launch() or wrap arrays with wp.stf.read/write/rw(array)." 
+ ) + + # Print tracing information (controlled by STF_TRACE_ENABLED) + _trace_stf_launch(kernel, dim, stream, inputs, kwargs, stf_dependencies) + + # Extract the STF context from the first dependency + first_dep = stf_dependencies[0] + stf_ctx = first_dep["array"]._stf_ld.borrow_ctx_handle() + + # Create STF dependency objects for the task + stf_task_deps = [] + for dep in stf_dependencies: + stf_ld = dep["array"]._stf_ld + if dep["mode"] == "read": + stf_task_deps.append(stf_ld.read()) + elif dep["mode"] == "write": + stf_task_deps.append(stf_ld.write()) + elif dep["mode"] == "rw": + stf_task_deps.append(stf_ld.rw()) + + # Create and execute STF task + with stf_ctx.task(*stf_task_deps) as stf_task: + # Get raw CUDA stream pointer from STF task + stf_stream_ptr = stf_task.stream_ptr() + + # Import PyTorch for stream conversion + import torch + + # Get the current CUDA device for PyTorch + warp_device = wp.get_device() + device_id = warp_device.ordinal # Get device number (e.g., 0 for cuda:0) + torch_device = torch.device(f"cuda:{device_id}") + + # Create PyTorch ExternalStream from STF stream pointer with explicit device + torch_stream = torch.cuda.ExternalStream(stf_stream_ptr, device=torch_device) + + # Convert PyTorch stream to Warp stream + warp_stream = wp.stream_from_torch(torch_stream) + + # Launch with properly wrapped STF stream + return wp.launch( + kernel, + dim=dim, + inputs=processed_inputs, + stream=warp_stream, + **kwargs, + ) + + +# STF tracing configuration +STF_TRACE_ENABLED = True # Set to False to disable STF tracing + + +def set_stf_trace(enabled: bool): + """Enable or disable STF tracing output. + + Args: + enabled: True to enable tracing, False to disable + """ + global STF_TRACE_ENABLED + STF_TRACE_ENABLED = enabled + + +def get_stf_trace() -> bool: + """Get current STF tracing state. + + Returns: + True if tracing is enabled, False otherwise + """ + return STF_TRACE_ENABLED + +def _trace_stf_launch(kernel, dim, stream, inputs, kwargs, stf_dependencies): + """Print STF launch tracing information if enabled.""" + if not STF_TRACE_ENABLED: + return + + print(f"[STF TRACE] launching kernel: {getattr(kernel, '__name__', kernel)}") + print(f" dim = {dim}") + print(f" stream = {stream}") + + # Enhanced input display with STF dependency detection + if inputs: + print(" inputs = [") + for i, inp in enumerate(inputs): + # Check if input is STF dependency wrapper + if isinstance(inp, STFDependency): # Get symbol for display (STF deps ALWAYS have _stf_ld) symbol = None if hasattr(inp.array._stf_ld, "symbol") and inp.array._stf_ld.symbol: @@ -121,9 +213,6 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): else: print(f" [{i}]: logical_data [{inp.mode}] [stf_dep]") - # Add unwrapped array to processed inputs - processed_inputs.append(inp.array) - else: # Regular input - detect logical data for display is_logical_data = False @@ -155,8 +244,6 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): else: # Scalar value print(f" [{i}]: {inp}") - processed_inputs.append(inp) - print(" ]") else: print(f" inputs = {inputs}") @@ -179,70 +266,8 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): print(" ]") print(f" kwargs = {kwargs}") - - # STF launch REQUIRES STF dependencies - otherwise use regular wp.launch - if not stf_dependencies: - raise ValueError( - "wp.stf.launch() requires STF dependencies (wp.stf.read/write/rw). " - f"Found {len(inputs)} inputs but none are STF dependencies. 
" - "Either use regular wp.launch() or wrap arrays with wp.stf.read/write/rw(array)." - ) - - # STF Task-based launch with automatic dependency management print(" → Creating STF task with dependencies") - # Extract the STF context from the first dependency - first_dep = stf_dependencies[0] - stf_ctx = first_dep["array"]._stf_ld.borrow_ctx_handle() - - # Create STF dependency objects for the task - stf_task_deps = [] - for dep in stf_dependencies: - stf_ld = dep["array"]._stf_ld - if dep["mode"] == "read": - stf_task_deps.append(stf_ld.read()) - elif dep["mode"] == "write": - stf_task_deps.append(stf_ld.write()) - elif dep["mode"] == "rw": - stf_task_deps.append(stf_ld.rw()) - - # Create and execute STF task - with stf_ctx.task(*stf_task_deps) as stf_task: - # Get raw CUDA stream pointer from STF task - stf_stream_ptr = stf_task.stream_ptr() - - print(f" → STF task stream ptr: {stf_stream_ptr}") - print(" → Launching kernel within STF task context") - - # Wrap STF stream via PyTorch ExternalStream -> Warp conversion - print(f" → STF task stream ptr: {stf_stream_ptr}") - print(" → Creating PyTorch ExternalStream from STF stream") - - # Import PyTorch for stream conversion - import torch - - # Get the current CUDA device for PyTorch - warp_device = wp.get_device() - device_id = warp_device.ordinal # Get device number (e.g., 0 for cuda:0) - torch_device = torch.device(f"cuda:{device_id}") - - # Create PyTorch ExternalStream from STF stream pointer with explicit device - torch_stream = torch.cuda.ExternalStream(stf_stream_ptr, device=torch_device) - - # Convert PyTorch stream to Warp stream - warp_stream = wp.stream_from_torch(torch_stream) - - print(f" → Successfully wrapped STF stream via PyTorch: {warp_stream}") - - # Launch with properly wrapped STF stream - return wp.launch( - kernel, - dim=dim, - inputs=processed_inputs, - stream=warp_stream, - **kwargs, - ) - # put it under wp.stf if not hasattr(wp, "stf"): @@ -313,6 +338,10 @@ def stf_rw(array, data_place=None): wp.stf.write = stf_write wp.stf.rw = stf_rw +# STF tracing control functions +wp.stf.set_trace = set_stf_trace +wp.stf.get_trace = get_stf_trace + grid_width = wp.constant(256 * 4) grid_height = wp.constant(128 * 4) From 76d78b48db4642e829502798ca46159e1f2b6252 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 8 Oct 2025 08:41:25 +0200 Subject: [PATCH 187/221] Adopt to new python hierarchy --- python/cuda_cccl/cuda/{cccl/experimental => }/stf/__init__.py | 0 .../cuda/{cccl/experimental => }/stf/_adapters/numba_bridge.py | 0 .../cuda/{cccl/experimental => }/stf/_adapters/numba_utils.py | 0 .../cuda/{cccl/experimental => }/stf/_adapters/torch_bridge.py | 0 .../cuda_cccl/cuda/{cccl/experimental => }/stf/_stf_bindings.py | 0 .../cuda/{cccl/experimental => }/stf/_stf_bindings_impl.pyx | 0 python/cuda_cccl/cuda/{cccl/experimental => }/stf/decorator.py | 0 python/cuda_cccl/pyproject.toml | 2 +- 8 files changed, 1 insertion(+), 1 deletion(-) rename python/cuda_cccl/cuda/{cccl/experimental => }/stf/__init__.py (100%) rename python/cuda_cccl/cuda/{cccl/experimental => }/stf/_adapters/numba_bridge.py (100%) rename python/cuda_cccl/cuda/{cccl/experimental => }/stf/_adapters/numba_utils.py (100%) rename python/cuda_cccl/cuda/{cccl/experimental => }/stf/_adapters/torch_bridge.py (100%) rename python/cuda_cccl/cuda/{cccl/experimental => }/stf/_stf_bindings.py (100%) rename python/cuda_cccl/cuda/{cccl/experimental => }/stf/_stf_bindings_impl.pyx (100%) rename python/cuda_cccl/cuda/{cccl/experimental => 
}/stf/decorator.py (100%) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py b/python/cuda_cccl/cuda/stf/__init__.py similarity index 100% rename from python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py rename to python/cuda_cccl/cuda/stf/__init__.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py b/python/cuda_cccl/cuda/stf/_adapters/numba_bridge.py similarity index 100% rename from python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py rename to python/cuda_cccl/cuda/stf/_adapters/numba_bridge.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py b/python/cuda_cccl/cuda/stf/_adapters/numba_utils.py similarity index 100% rename from python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py rename to python/cuda_cccl/cuda/stf/_adapters/numba_utils.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py b/python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py similarity index 100% rename from python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py rename to python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings.py b/python/cuda_cccl/cuda/stf/_stf_bindings.py similarity index 100% rename from python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings.py rename to python/cuda_cccl/cuda/stf/_stf_bindings.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx similarity index 100% rename from python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx rename to python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/stf/decorator.py similarity index 100% rename from python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py rename to python/cuda_cccl/cuda/stf/decorator.py diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index 6fdb0ff854f..14561449098 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "numpy", "cuda-pathfinder>=1.2.3", "cuda-core", - "numba-cuda @ git+https://github.com/caugonnet/numba-cuda.git@cuda_graph_future_memory", # TODO: remove this once numba-cuda 0.19.2 is released + "numba-cuda @ git+https://github.com/caugonnet/numba-cuda.git@cuda_graph_future_memory", ] dynamic = ["version"] From 0c11b6a2f839c4ba74d506ed4383e4b5f3d589a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 8 Oct 2025 09:27:19 +0200 Subject: [PATCH 188/221] fix errors in a previous merge --- python/cuda_cccl/CMakeLists.txt | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 6d81617cf46..b5c1c511324 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -65,6 +65,7 @@ install( file(MAKE_DIRECTORY "cuda/stf/${CUDA_VERSION_DIR}/cccl") file(MAKE_DIRECTORY "cuda/compute/${CUDA_VERSION_DIR}/cccl") +# Install version-specific binaries install( TARGETS cccl.c.experimental.stf DESTINATION cuda/stf/${CUDA_VERSION_DIR}/cccl @@ -117,8 +118,24 @@ set(pyx_source_file "${cuda_cccl_SOURCE_DIR}/cuda/compute/_bindings_impl.pyx") set(_generated_extension_src "${cuda_cccl_BINARY_DIR}/_bindings_impl.c") set(_depfile "${cuda_cccl_BINARY_DIR}/_bindings_impl.c.dep") +# Custom 
Cython compilation command for version-specific target +add_custom_command( + OUTPUT "${_generated_extension_src}" + COMMAND "${Python3_EXECUTABLE}" -m cython + ARGS ${CYTHON_FLAGS_LIST} "${pyx_source_file}" --output-file "${_generated_extension_src}" + DEPENDS "${pyx_source_file}" + DEPFILE "${_depfile}" + COMMENT "Cythonizing ${pyx_source_file} for CUDA ${CUDA_VERSION_MAJOR}" +) + +set_source_files_properties("${_generated_extension_src}" PROPERTIES GENERATED TRUE) +add_custom_target(cythonize_bindings_impl ALL + DEPENDS "${_generated_extension_src}" +) + + message(STATUS "STF Using Cython ${CYTHON_VERSION}") -set(stf_pyx_source_file "${cuda_cccl_SOURCE_DIR}/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx") +set(stf_pyx_source_file "${cuda_cccl_SOURCE_DIR}/cuda/stf/_stf_bindings_impl.pyx") set(_stf_generated_extension_src "${cuda_cccl_BINARY_DIR}/_stf_bindings_impl.c") set(_stf_depfile "${cuda_cccl_BINARY_DIR}/_stf_bindings_impl.c.dep") add_custom_command( @@ -134,6 +151,11 @@ add_custom_target(cythonize_stf_bindings_impl ALL DEPENDS "${_stf_generated_extension_src}" ) +Python3_add_library(_bindings_impl MODULE WITH_SOABI "${_generated_extension_src}") +add_dependencies(_bindings_impl cythonize_bindings_impl) +target_link_libraries(_bindings_impl PRIVATE cccl.c.parallel CUDA::cuda_driver) +set_target_properties(_bindings_impl PROPERTIES INSTALL_RPATH "$ORIGIN/cccl") + Python3_add_library(_stf_bindings_impl MODULE WITH_SOABI "${_stf_generated_extension_src}") add_dependencies(_stf_bindings_impl cythonize_stf_bindings_impl) target_link_libraries(_stf_bindings_impl PRIVATE cccl.c.experimental.stf CUDA::cuda_driver) From f6c50e1e743e3943fb7e22ca10a74e0be858a8a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 8 Oct 2025 09:37:00 +0200 Subject: [PATCH 189/221] cuda.cccl.experimental.stf => cuda.stf --- python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx | 6 +++--- python/cuda_cccl/cuda/stf/decorator.py | 2 +- python/cuda_cccl/tests/stf/example_fluid_warp.py | 2 +- python/cuda_cccl/tests/stf/test_context.py | 2 +- python/cuda_cccl/tests/stf/test_decorator.py | 2 +- python/cuda_cccl/tests/stf/test_fdtd_pytorch.py | 2 +- python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py | 2 +- python/cuda_cccl/tests/stf/test_fhe.py | 2 +- python/cuda_cccl/tests/stf/test_fhe_decorator.py | 2 +- python/cuda_cccl/tests/stf/test_numba.py | 2 +- python/cuda_cccl/tests/stf/test_pytorch.py | 2 +- python/cuda_cccl/tests/stf/test_stencil_decorator.py | 2 +- 12 files changed, 14 insertions(+), 14 deletions(-) diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx index f8551b83da8..099f6869c46 100644 --- a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx @@ -502,7 +502,7 @@ cdef class task: def get_arg_numba(self, index): cai = self.get_arg_cai(index) try: - from cuda.cccl.experimental.stf._adapters.numba_bridge import cai_to_numba + from cuda.stf._adapters.numba_bridge import cai_to_numba except Exception as e: raise RuntimeError("numba support is not available") from e return cai_to_numba(cai) @@ -516,7 +516,7 @@ cdef class task: def get_arg_as_tensor(self, index): cai = self.get_arg_cai(index) try: - from cuda.cccl.experimental.stf._adapters.torch_bridge import cai_to_torch + from cuda.stf._adapters.torch_bridge import cai_to_torch except Exception as e: raise RuntimeError("PyTorch support is not available") from e return cai_to_torch(cai) @@ -755,7 +755,7 @@ cdef class 
context: # Initialize with the specified value using NUMBA # The numba code already handles None properly by calling ld.write() without data place try: - from cuda.cccl.experimental.stf._adapters.numba_utils import init_logical_data + from cuda.stf._adapters.numba_utils import init_logical_data init_logical_data(self, ld, fill_value, where, exec_place) except ImportError as e: raise RuntimeError("NUMBA support is not available for logical_data_full") from e diff --git a/python/cuda_cccl/cuda/stf/decorator.py b/python/cuda_cccl/cuda/stf/decorator.py index c7179d2a6fc..50575229a0e 100644 --- a/python/cuda_cccl/cuda/stf/decorator.py +++ b/python/cuda_cccl/cuda/stf/decorator.py @@ -1,7 +1,7 @@ import numba from numba import cuda -from cuda.cccl.experimental.stf import context, dep, exec_place +from cuda.stf import context, dep, exec_place numba.config.CUDA_ENABLE_PYNVJITLINK = 1 diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py index d3cb1f4e358..66fc1f4bc7c 100644 --- a/python/cuda_cccl/tests/stf/example_fluid_warp.py +++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py @@ -26,7 +26,7 @@ import warp as wp import warp.render -import cuda.cccl.experimental.stf as cudastf +import cuda.stf as cudastf def stf_kernel(pyfunc): diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py index b306cf3571f..f4a583de351 100644 --- a/python/cuda_cccl/tests/stf/test_context.py +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -4,7 +4,7 @@ import numpy as np -from cuda.cccl.experimental.stf._stf_bindings import context, read, rw +from cuda.stf._stf_bindings import context, read, rw def test_ctx(): diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 49605ced878..7920f9eb83d 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -6,7 +6,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf +import cuda.stf as cudastf @cudastf.jit diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index b5209c9d04c..a64845055ce 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -6,7 +6,7 @@ import torch import torch.cuda as tc -from cuda.cccl.experimental.stf._stf_bindings import ( +from cuda.stf._stf_bindings import ( context, ) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py index 659fc43bfd4..24af8361162 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py @@ -5,7 +5,7 @@ import numpy as np import torch -from cuda.cccl.experimental.stf._stf_bindings import ( +from cuda.stf._stf_bindings import ( context, ) diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index 7331eeba658..fdd2b1e0259 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -10,7 +10,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf +import cuda.stf as cudastf class Plaintext: diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index 
0bde583bafe..d20b7280c75 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -10,7 +10,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf +import cuda.stf as cudastf class Plaintext: diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 72b9609276f..6d46cdf2829 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -11,7 +11,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -from cuda.cccl.experimental.stf._stf_bindings import ( +from cuda.stf._stf_bindings import ( context, data_place, exec_place, diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index 4c0d180b407..001a7002d08 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -12,7 +12,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -from cuda.cccl.experimental.stf._stf_bindings import ( # noqa: E402 +from cuda.stf._stf_bindings import ( # noqa: E402 context, rw, ) diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index ebfd71de46e..16d0ec0e055 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -5,7 +5,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf +import cuda.stf as cudastf @cudastf.jit From efea184143dfa88f077c6b2778574391a283d1b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 8 Oct 2025 10:02:04 +0200 Subject: [PATCH 190/221] Misc stf python tests improvements --- python/cuda_cccl/cuda/stf/decorator.py | 4 +- python/cuda_cccl/tests/stf/test_decorator.py | 16 +++--- python/cuda_cccl/tests/stf/test_fhe.py | 9 ++-- .../cuda_cccl/tests/stf/test_fhe_decorator.py | 5 +- python/cuda_cccl/tests/stf/test_numba.py | 49 +++++++++---------- .../tests/stf/test_stencil_decorator.py | 4 +- 6 files changed, 42 insertions(+), 45 deletions(-) diff --git a/python/cuda_cccl/cuda/stf/decorator.py b/python/cuda_cccl/cuda/stf/decorator.py index 50575229a0e..65af9734f44 100644 --- a/python/cuda_cccl/cuda/stf/decorator.py +++ b/python/cuda_cccl/cuda/stf/decorator.py @@ -3,7 +3,7 @@ from cuda.stf import context, dep, exec_place -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 class stf_kernel_decorator: @@ -59,7 +59,7 @@ def __call__(self, *args, **kwargs): for i, a in enumerate(args): # print(f"got one arg {a} is dep ? 
{isinstance(a, dep)}") if isinstance(a, dep): - if ctx == None: + if ctx is None: ld = a.get_ld() # This context will be used in the __call__ method itself # so we can create a temporary object from the handle diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 7920f9eb83d..16bc2539538 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -3,20 +3,20 @@ import pytest from numba import cuda -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 -numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +import cuda.stf as stf -import cuda.stf as cudastf +numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -@cudastf.jit +@stf.jit def axpy(a, x, y): i = cuda.grid(1) if i < x.size: y[i] = a * x[i] + y[i] -@cudastf.jit +@stf.jit def scale(a, x): i = cuda.grid(1) if i < x.size: @@ -27,18 +27,18 @@ def scale(a, x): def test_decorator(use_graph): X, Y, Z = (np.ones(16, np.float32) for _ in range(3)) - ctx = cudastf.context(use_graph=use_graph) + ctx = stf.context(use_graph=use_graph) lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) scale[32, 64](2.0, lX.rw()) axpy[32, 64](2.0, lX.read(), lY.rw()) - axpy[32, 64, cudastf.exec_place.device(0)]( + axpy[32, 64, stf.exec_place.device(0)]( 2.0, lX.read(), lZ.rw() ) # explicit exec place axpy[32, 64]( - 2.0, lY.read(), lZ.rw(cudastf.data_place.device(0)) + 2.0, lY.read(), lZ.rw(stf.data_place.device(0)) ) # per-dep placement override diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index fdd2b1e0259..d0bbdd3d596 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -7,11 +7,11 @@ import numba from numba import cuda +import cuda.stf as stf + numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.stf as cudastf - class Plaintext: # Initialize from actual values, or from a logical data @@ -34,9 +34,10 @@ def encrypt(self) -> "Ciphertext": def print_values(self): with ctx.task( - cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) + stf.exec_place.host(), self.l.read(stf.data_place.managed()) ) as t: nb_stream = cuda.external_stream(t.stream_ptr()) + nb_stream.synchronize() hvalues = t.numba_arguments() print([v for v in hvalues]) @@ -145,7 +146,7 @@ def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: def test_fhe(): """Test Fully Homomorphic Encryption (FHE) example with logical operations.""" global ctx # Make ctx accessible to the classes - ctx = cudastf.context(use_graph=False) + ctx = stf.context(use_graph=False) vA = [3, 3, 2, 2, 17] pA = Plaintext(ctx, vA) diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index d20b7280c75..571ff8013ea 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -7,11 +7,11 @@ import numba from numba import cuda +import cuda.stf as cudastf + numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.stf as cudastf - class Plaintext: # Initialize from actual values, or from a logical data @@ -37,6 +37,7 @@ def print_values(self): cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) ) as t: nb_stream = cuda.external_stream(t.stream_ptr()) + nb_stream.synchronize() hvalues = t.numba_arguments() print([v for v in hvalues]) diff --git 
a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 6d46cdf2829..d15ae639bda 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -8,17 +8,10 @@ import pytest from numba import cuda -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 -numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +import cuda.stf as stf -from cuda.stf._stf_bindings import ( - context, - data_place, - exec_place, - read, - rw, - write, -) +numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 @cuda.jit @@ -38,9 +31,9 @@ def scale(a, x): # One test with a single kernel in a CUDA graph def test_numba_graph(): X = np.ones(16, dtype=np.float32) - ctx = context(use_graph=True) + ctx = stf.context(use_graph=True) lX = ctx.logical_data(X) - with ctx.task(rw(lX)) as t: + with ctx.task(lX.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.numba_arguments() scale[32, 64, nb_stream](2.0, dX) @@ -61,30 +54,30 @@ def test_numba(): Y = np.ones(n, dtype=np.float32) Z = np.ones(n, dtype=np.float32) - ctx = context() + ctx = stf.context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) - with ctx.task(rw(lX)) as t: + with ctx.task(lX.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.numba_arguments() # dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) scale[32, 64, nb_stream](2.0, dX) - with ctx.task(read(lX), rw(lY)) as t: + with ctx.task(lX.read(), lY.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) print(nb_stream) dX = t.get_arg_numba(0) dY = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dY) - with ctx.task(read(lX), rw(lZ)) as t: + with ctx.task(lX.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX, dZ = t.numba_arguments() axpy[32, 64, nb_stream](2.0, dX, dZ) - with ctx.task(read(lY), rw(lZ)) as t: + with ctx.task(lY.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dY, dZ = t.numba_arguments() axpy[32, 64, nb_stream](2.0, dY, dZ) @@ -170,11 +163,11 @@ def test_numba2d(): u = np.sin(x)[:, None] * np.cos(y)[None, :] # shape = (nx, ny) u_out = np.zeros_like(u) - ctx = context() + ctx = stf.context() lu = ctx.logical_data(u) lu_out = ctx.logical_data(u_out) - with ctx.task(read(lu), write(lu_out)) as t: + with ctx.task(lu.read(), lu_out.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) du = t.get_arg_numba(0) du_out = t.get_arg_numba(1) @@ -213,18 +206,18 @@ def test_numba_exec_place(): Y = np.ones(16, dtype=np.float32) Z = np.ones(16, dtype=np.float32) - ctx = context() + ctx = stf.context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) - with ctx.task(exec_place.device(0), lX.rw()) as t: + with ctx.task(stf.exec_place.device(0), lX.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) # dX = t.get_arg_numba(0) dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) scale[32, 64, nb_stream](2.0, dX) - with ctx.task(exec_place.device(0), lX.read(), lY.rw()) as t: + with ctx.task(stf.exec_place.device(0), lX.read(), lY.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) print(nb_stream) dX = t.get_arg_numba(0) @@ -232,14 +225,16 @@ def test_numba_exec_place(): axpy[32, 64, nb_stream](2.0, dX, dY) with ctx.task( - exec_place.device(0), lX.read(data_place.managed()), lZ.rw(data_place.managed()) + stf.exec_place.device(0), + lX.read(stf.data_place.managed()), + 
lZ.rw(stf.data_place.managed()), ) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dZ) - with ctx.task(exec_place.device(0), lY.read(), lZ.rw()) as t: + with ctx.task(stf.exec_place.device(0), lY.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dY = t.get_arg_numba(0) dZ = t.get_arg_numba(1) @@ -255,7 +250,7 @@ def test_numba_places(): Y = np.ones(16, dtype=np.float32) Z = np.ones(16, dtype=np.float32) - ctx = context() + ctx = stf.context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) @@ -272,13 +267,13 @@ def test_numba_places(): dY = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dY) - with ctx.task(exec_place.device(1), lX.read(), lZ.rw()) as t: + with ctx.task(stf.exec_place.device(1), lX.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dZ) - with ctx.task(lY.read(), lZ.rw(data_place.device(1))) as t: + with ctx.task(lY.read(), lZ.rw(stf.data_place.device(1))) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dY = t.get_arg_numba(0) dZ = t.get_arg_numba(1) diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index 16d0ec0e055..b4155c8b46b 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -2,11 +2,11 @@ import numpy as np from numba import cuda +import cuda.stf as cudastf + numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.stf as cudastf - @cudastf.jit def laplacian_5pt_kernel(u_in, u_out, dx, dy): From c0d3592426a91df9bba6fae07eafeecdd5b672ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 8 Oct 2025 10:13:35 +0200 Subject: [PATCH 191/221] Save WIP on this warp example --- .../cuda_cccl/tests/stf/example_fluid_warp.py | 344 +++--------------- 1 file changed, 53 insertions(+), 291 deletions(-) diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py index 66fc1f4bc7c..72c33e860de 100644 --- a/python/cuda_cccl/tests/stf/example_fluid_warp.py +++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py @@ -85,188 +85,57 @@ def _stf_call(*args, dim=None, stream=None, **kwargs): def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): - # Process STF dependencies and extract arrays for wp.launch - processed_inputs = [] - stf_dependencies = [] - - # Process inputs to separate STF dependencies from regular arguments - if inputs: - for i, inp in enumerate(inputs): - # Check if input is STF dependency wrapper - if isinstance(inp, STFDependency): - # Extract STF dependency information - stf_dependencies.append( - { - "index": i, - "array": inp.array, - "mode": inp.mode, - "data_place": inp.data_place, - } - ) - # Add unwrapped array to processed inputs - processed_inputs.append(inp.array) - else: - processed_inputs.append(inp) - - # STF launch REQUIRES STF dependencies - otherwise use regular wp.launch - if not stf_dependencies: - raise ValueError( - "wp.stf.launch() requires STF dependencies (wp.stf.read/write/rw). " - f"Found {len(inputs)} inputs but none are STF dependencies. " - "Either use regular wp.launch() or wrap arrays with wp.stf.read/write/rw(array)." 
- ) - - # Print tracing information (controlled by STF_TRACE_ENABLED) - _trace_stf_launch(kernel, dim, stream, inputs, kwargs, stf_dependencies) - - # Extract the STF context from the first dependency - first_dep = stf_dependencies[0] - stf_ctx = first_dep["array"]._stf_ld.borrow_ctx_handle() - - # Create STF dependency objects for the task - stf_task_deps = [] - for dep in stf_dependencies: - stf_ld = dep["array"]._stf_ld - if dep["mode"] == "read": - stf_task_deps.append(stf_ld.read()) - elif dep["mode"] == "write": - stf_task_deps.append(stf_ld.write()) - elif dep["mode"] == "rw": - stf_task_deps.append(stf_ld.rw()) - - # Create and execute STF task - with stf_ctx.task(*stf_task_deps) as stf_task: - # Get raw CUDA stream pointer from STF task - stf_stream_ptr = stf_task.stream_ptr() - - # Import PyTorch for stream conversion - import torch - - # Get the current CUDA device for PyTorch - warp_device = wp.get_device() - device_id = warp_device.ordinal # Get device number (e.g., 0 for cuda:0) - torch_device = torch.device(f"cuda:{device_id}") - - # Create PyTorch ExternalStream from STF stream pointer with explicit device - torch_stream = torch.cuda.ExternalStream(stf_stream_ptr, device=torch_device) - - # Convert PyTorch stream to Warp stream - warp_stream = wp.stream_from_torch(torch_stream) - - # Launch with properly wrapped STF stream - return wp.launch( - kernel, - dim=dim, - inputs=processed_inputs, - stream=warp_stream, - **kwargs, - ) - - -# STF tracing configuration -STF_TRACE_ENABLED = True # Set to False to disable STF tracing - - -def set_stf_trace(enabled: bool): - """Enable or disable STF tracing output. - - Args: - enabled: True to enable tracing, False to disable - """ - global STF_TRACE_ENABLED - STF_TRACE_ENABLED = enabled - - -def get_stf_trace() -> bool: - """Get current STF tracing state. 
- - Returns: - True if tracing is enabled, False otherwise - """ - return STF_TRACE_ENABLED - - -def _trace_stf_launch(kernel, dim, stream, inputs, kwargs, stf_dependencies): - """Print STF launch tracing information if enabled.""" - if not STF_TRACE_ENABLED: - return - print(f"[STF TRACE] launching kernel: {getattr(kernel, '__name__', kernel)}") print(f" dim = {dim}") print(f" stream = {stream}") - # Enhanced input display with STF dependency detection + # Enhanced input display with logical data detection if inputs: print(" inputs = [") for i, inp in enumerate(inputs): - # Check if input is STF dependency wrapper - if isinstance(inp, STFDependency): - # Get symbol for display (STF deps ALWAYS have _stf_ld) - symbol = None - if hasattr(inp.array._stf_ld, "symbol") and inp.array._stf_ld.symbol: - symbol = inp.array._stf_ld.symbol - elif hasattr(inp.array, "_name") and inp.array._name: - symbol = inp.array._name + # Detect if input is or contains STF logical data + is_logical_data = False + symbol = None + # Check if inp is directly STF logical data + if hasattr(inp, "__class__") and "logical_data" in str(type(inp)): + is_logical_data = True + if hasattr(inp, "symbol") and inp.symbol: + symbol = inp.symbol + # Check if inp has attached STF logical data (Warp array) + elif hasattr(inp, "_stf_ld"): + is_logical_data = True + if hasattr(inp._stf_ld, "symbol") and inp._stf_ld.symbol: + symbol = inp._stf_ld.symbol + # Fallback to _name for Warp arrays + elif hasattr(inp, "_name") and inp._name: + symbol = inp._name + + if is_logical_data: if symbol: - print(f" [{i}]: '{symbol}' [{inp.mode}] [stf_dep]") + print(f" [{i}]: '{symbol}' [logical_data]") else: - print(f" [{i}]: logical_data [{inp.mode}] [stf_dep]") - + print(f" [{i}]: logical_data") else: - # Regular input - detect logical data for display - is_logical_data = False - symbol = None - - # Check if inp is directly STF logical data - if hasattr(inp, "__class__") and "logical_data" in str(type(inp)): - is_logical_data = True - if hasattr(inp, "symbol") and inp.symbol: - symbol = inp.symbol - # Check if inp has attached STF logical data (Warp array) - elif hasattr(inp, "_stf_ld"): - is_logical_data = True - if hasattr(inp._stf_ld, "symbol") and inp._stf_ld.symbol: - symbol = inp._stf_ld.symbol - # Fallback to _name for Warp arrays - elif hasattr(inp, "_name") and inp._name: - symbol = inp._name - - if is_logical_data: - if symbol: - print(f" [{i}]: '{symbol}' [logical_data]") - else: - print(f" [{i}]: logical_data") - else: - # Regular arguments (scalars, etc.) - if hasattr(inp, "shape"): # Array-like but not logical data - print(f" [{i}]: {type(inp).__name__}") - else: # Scalar value - print(f" [{i}]: {inp}") - + # Regular arguments (scalars, etc.) 
+ if hasattr(inp, "shape"): # Array-like but not logical data + print(f" [{i}]: {type(inp).__name__}") + else: # Scalar value + print(f" [{i}]: {inp}") print(" ]") else: print(f" inputs = {inputs}") - # Show STF dependency summary - if stf_dependencies: - print(" stf_deps = [") - for dep in stf_dependencies: - # All STF dependencies are guaranteed to have _stf_ld - symbol = None - if hasattr(dep["array"]._stf_ld, "symbol") and dep["array"]._stf_ld.symbol: - symbol = dep["array"]._stf_ld.symbol - elif hasattr(dep["array"], "_name") and dep["array"]._name: - symbol = dep["array"]._name - - if symbol: - print(f" {dep['mode'].upper()}: '{symbol}'") - else: - print(f" {dep['mode'].upper()}: logical_data") - print(" ]") - print(f" kwargs = {kwargs}") - print(" → Creating STF task with dependencies") + + # just forward to warp for now + return wp.launch( + kernel, + dim=dim, + inputs=inputs, + stream=stream, + **kwargs, + ) # put it under wp.stf @@ -278,72 +147,11 @@ class _stf: wp.stf = _stf() -# STF dependency wrapper class -class STFDependency: - """Wrapper for STF task dependencies with access mode specification.""" - - def __init__(self, array, mode, data_place=None): - # CRITICAL: STF dependencies MUST have logical data attached - if not hasattr(array, "_stf_ld"): - raise ValueError( - f"STF dependency requires array with logical data (_stf_ld). " - f"Array {type(array).__name__} does not have STF logical data. " - f"Create logical data first: array._stf_ld = ctx.logical_data(array, data_place)" - ) - - self.array = array - self.mode = mode # 'read', 'write', 'rw' - self.data_place = data_place - - def __repr__(self): - symbol = None - if hasattr(self.array._stf_ld, "symbol"): - symbol = self.array._stf_ld.symbol - elif hasattr(self.array, "_name"): - symbol = self.array._name - - if symbol: - return f"STFDependency('{symbol}', {self.mode})" - else: - return f"STFDependency({type(self.array).__name__}, {self.mode})" - - -def stf_read(array, data_place=None): - """Mark array as read-only dependency for STF task. - - REQUIRES: array must have _stf_ld (STF logical data) attached. - """ - return STFDependency(array, "read", data_place) - - -def stf_write(array, data_place=None): - """Mark array as write-only dependency for STF task. - - REQUIRES: array must have _stf_ld (STF logical data) attached. - """ - return STFDependency(array, "write", data_place) - - -def stf_rw(array, data_place=None): - """Mark array as read-write dependency for STF task. - - REQUIRES: array must have _stf_ld (STF logical data) attached. 
- """ - return STFDependency(array, "rw", data_place) - - wp.stf.kernel = stf_kernel wp.stf.launch = stf_launch -wp.stf.read = stf_read -wp.stf.write = stf_write -wp.stf.rw = stf_rw -# STF tracing control functions -wp.stf.set_trace = set_stf_trace -wp.stf.get_trace = get_stf_trace - -grid_width = wp.constant(256 * 4) -grid_height = wp.constant(128 * 4) +grid_width = wp.constant(256) +grid_height = wp.constant(128) @wp.func @@ -498,17 +306,12 @@ class Example: def __init__(self): fps = 60 self.frame_dt = 1.0 / fps - self.sim_substeps = 10 + self.sim_substeps = 2 self.iterations = 100 # Number of pressure iterations self.sim_dt = self.frame_dt / self.sim_substeps self.sim_time = 0.0 - # Create STF context for task-based scheduling - # This enables automatic dependency management and stream orchestration - import torch - - torch.cuda.init() - self._stf_ctx = cudastf.context() # use_graph=True) + self._stf_ctx = cudastf.context() shape = (grid_width, grid_height) @@ -534,6 +337,8 @@ def __init__(self): self.p1._stf_ld = self._stf_ctx.logical_data(self.p1, device_place) self.div._stf_ld = self._stf_ctx.logical_data(self.div, device_place) + # vec2 arrays - STF now automatically handles vector type flattening + # Store STF logical data consistently with other arrays self.u0._stf_ld = self._stf_ctx.logical_data(self.u0, device_place) self.u1._stf_ld = self._stf_ctx.logical_data(self.u1, device_place) print( @@ -562,7 +367,7 @@ def __init__(self): self.div._name = "div" # capture pressure solve as a CUDA graph - self.use_cuda_graph = False # wp.get_device().is_cuda + self.use_cuda_graph = wp.get_device().is_cuda if self.use_cuda_graph: with wp.ScopedCapture() as capture: self.pressure_iterations() @@ -579,67 +384,30 @@ def step(self): vel = wp.vec2(math.cos(angle) * speed, math.sin(angle) * speed) # update emitters - wp.stf.launch( - init, - dim=shape, - inputs=[ - wp.stf.write(self.rho0), - wp.stf.write(self.u0), - 5, - vel, - ], - ) + wp.stf.launch(init, dim=shape, inputs=[self.rho0, self.u0, 5, vel]) # force integrate - wp.stf.launch( - integrate, - dim=shape, - inputs=[ - wp.stf.rw(self.u0), - wp.stf.rw(self.rho0), - dt, - ], - ) - wp.stf.launch( - divergence, - dim=shape, - inputs=[ - wp.stf.read(self.u0), - wp.stf.write(self.div), - ], - ) + wp.stf.launch(integrate, dim=shape, inputs=[self.u0, self.rho0, dt]) + wp.stf.launch(divergence, dim=shape, inputs=[self.u0, self.div]) # pressure solve - # TODO tasks ? 
self.p0.zero_() self.p1.zero_() - if self.use_cuda_graph: - wp.capture_launch(self.graph) - else: - self.pressure_iterations() + # if self.use_cuda_graph: + # wp.capture_launch(self.graph) + # else: + # self.pressure_iterations() + self.pressure_iterations() # velocity update - wp.stf.launch( - pressure_apply, - dim=shape, - inputs=[ - wp.stf.read(self.p0), - wp.stf.rw(self.u0), - ], - ) + wp.stf.launch(pressure_apply, dim=shape, inputs=[self.p0, self.u0]) # semi-Lagrangian advection wp.stf.launch( advect, dim=shape, - inputs=[ - wp.stf.read(self.u0), - wp.stf.write(self.u1), - wp.stf.read(self.rho0), - wp.stf.write(self.rho1), - dt, - ], + inputs=[self.u0, self.u1, self.rho0, self.rho1, dt], ) # swap buffers @@ -651,13 +419,7 @@ def step(self): def pressure_iterations(self): for _ in range(self.iterations): wp.stf.launch( - pressure_solve, - dim=self.p0.shape, - inputs=[ - wp.stf.read(self.p0), - wp.stf.write(self.p1), - wp.stf.read(self.div), - ], + pressure_solve, dim=self.p0.shape, inputs=[self.p0, self.p1, self.div] ) # swap pressure fields From eba61eb4d6286efb2b4b04623dc1972f1654df7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 8 Oct 2025 13:39:05 +0200 Subject: [PATCH 192/221] Add sanity checks to test the is_void_interface() API --- cudax/examples/stf/void_data_interface.cu | 4 ++++ cudax/test/stf/cpp/task_get_stream.cu | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/cudax/examples/stf/void_data_interface.cu b/cudax/examples/stf/void_data_interface.cu index 0340b16bf4a..bf429f23dfe 100644 --- a/cudax/examples/stf/void_data_interface.cu +++ b/cudax/examples/stf/void_data_interface.cu @@ -49,5 +49,9 @@ int main() return cuda_kernel_desc{dummy_kernel, 16, 128, 0}; }; + EXPECT(token.is_void_interface()); + EXPECT(token2.is_void_interface()); + EXPECT(token3.is_void_interface()); + ctx.finalize(); } diff --git a/cudax/test/stf/cpp/task_get_stream.cu b/cudax/test/stf/cpp/task_get_stream.cu index 89fa74e7490..2d6509e5a73 100644 --- a/cudax/test/stf/cpp/task_get_stream.cu +++ b/cudax/test/stf/cpp/task_get_stream.cu @@ -24,7 +24,8 @@ void test_stream() context ctx; auto token = ctx.token(); - auto t = ctx.task(token.write()); + EXPECT(token.is_void_interface()); + auto t = ctx.task(token.write()); t.start(); cudaStream_t s = t.get_stream(); EXPECT(s != nullptr); From e17c261c582e89d044c6fc97034b808bd9373813 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 8 Oct 2025 22:04:36 +0200 Subject: [PATCH 193/221] support tokens in python --- .../cuda_cccl/cuda/stf/_stf_bindings_impl.pyx | 59 +++++++++++-- python/cuda_cccl/tests/stf/test_token.py | 86 +++++++++++++++++++ 2 files changed, 137 insertions(+), 8 deletions(-) create mode 100644 python/cuda_cccl/tests/stf/test_token.py diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx index 099f6869c46..00b8dd39ed5 100644 --- a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx @@ -119,6 +119,8 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_logical_data_destroy(stf_logical_data_handle ld) void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle *to) + void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld); + ctypedef struct stf_task_handle_t ctypedef stf_task_handle_t* stf_task_handle void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) @@ -170,6 +172,7 @@ cdef class logical_data: cdef int _ndim cdef 
size_t _len cdef str _symbol # Store symbol for display purposes + cdef readonly bint _is_token # readonly makes it accessible from Python def __cinit__(self, context ctx=None, object buf=None, data_place dplace=None, shape=None, dtype=None): if ctx is None or buf is None: @@ -181,10 +184,12 @@ cdef class logical_data: self._shape = () self._ndim = 0 self._symbol = None + self._is_token = False return self._ctx = ctx._ctx self._symbol = None # Initialize symbol + self._is_token = False # Initialize token flag # Default to host data place if not specified (matches C++ API) if dplace is None: @@ -270,6 +275,12 @@ cdef class logical_data: stf_logical_data_destroy(self._ld) self._ld = NULL + def __repr__(self): + """Return a detailed string representation of the logical_data object.""" + return (f"logical_data(shape={self._shape}, dtype={self._dtype}, " + f"is_token={self._is_token}, symbol={self._symbol!r}, " + f"len={self._len}, ndim={self._ndim})") + @property def dtype(self): """Return the dtype of the logical data.""" @@ -305,6 +316,21 @@ cdef class logical_data: out._ndim = self._ndim out._len = self._len out._symbol = None # New object has no symbol initially + out._is_token = False + + return out + + @staticmethod + def token(context ctx): + cdef logical_data out = logical_data.__new__(logical_data) + out._ctx = ctx._ctx + out._dtype = None + out._shape = None + out._ndim = 0 + out._len = 0 + out._symbol = None # New object has no symbol initially + out._is_token = True + stf_token(ctx._ctx, &out._ld) return out @@ -320,6 +346,7 @@ cdef class logical_data: out._ndim = len(shape) out._len = math.prod(shape) * out._dtype.itemsize out._symbol = None # New object has no symbol initially + out._is_token = False stf_logical_data_empty(ctx._ctx, out._len, &out._ld) return out @@ -492,6 +519,9 @@ cdef class task: return s # cast pointer -> Py int def get_arg(self, index) -> int: + if self._lds_args[index]._is_token: + raise RuntimeError("cannot materialize a token argument") + cdef void *ptr = stf_task_get(self._t, index) return ptr @@ -508,10 +538,15 @@ cdef class task: return cai_to_numba(cai) def numba_arguments(self): - arg_cnt=len(self._lds_args) - if arg_cnt == 1: - return self.get_arg_numba(0) - return tuple(self.get_arg_numba(i) for i in range(arg_cnt)) + # Only include non-token arguments in the tuple + non_token_args = [self.get_arg_numba(i) for i in range(len(self._lds_args)) + if not self._lds_args[i]._is_token] + + if len(non_token_args) == 0: + return None + elif len(non_token_args) == 1: + return non_token_args[0] + return tuple(non_token_args) def get_arg_as_tensor(self, index): cai = self.get_arg_cai(index) @@ -522,10 +557,15 @@ cdef class task: return cai_to_torch(cai) def tensor_arguments(self): - arg_cnt=len(self._lds_args) - if arg_cnt == 1: - return self.get_arg_as_tensor(0) - return tuple(self.get_arg_as_tensor(i) for i in range(arg_cnt)) + # Only include non-token arguments in the tuple + non_token_args = [self.get_arg_as_tensor(i) for i in range(len(self._lds_args)) + if not self._lds_args[i]._is_token] + + if len(non_token_args) == 0: + return None + elif len(non_token_args) == 1: + return non_token_args[0] + return tuple(non_token_args) # ---- context‑manager helpers ------------------------------- def __enter__(self): @@ -830,6 +870,9 @@ cdef class context: dtype = np.float64 return self.logical_data_full(shape, 1.0, dtype, where, exec_place) + def token(self): + return logical_data.token(self) + def task(self, *args): """ Create a `task` diff --git 
a/python/cuda_cccl/tests/stf/test_token.py b/python/cuda_cccl/tests/stf/test_token.py new file mode 100644 index 00000000000..acef5e34f3e --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_token.py @@ -0,0 +1,86 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import numba +import numpy as np +from numba import cuda + +import cuda.stf as stf + +numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + + +def test_token(): + ctx = stf.context() + lX = ctx.token() + lY = ctx.token() + lZ = ctx.token() + + with ctx.task(lX.rw()): + pass + + with ctx.task(lX.read(), lY.rw()): + pass + + with ctx.task(lX.read(), lZ.rw()): + pass + + with ctx.task(lY.read(), lZ.rw()): + pass + + ctx.finalize() + + +@cuda.jit +def axpy(a, x, y): + start = cuda.grid(1) + stride = cuda.gridsize(1) + for i in range(start, x.size, stride): + y[i] = a * x[i] + y[i] + + +def test_numba_token(): + n = 1024 * 1024 + X = np.ones(n, dtype=np.float32) + Y = np.ones(n, dtype=np.float32) + + ctx = stf.context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + token = ctx.token() + + # Use a reasonable grid size - kernel loop will handle all elements + blocks = 32 + threads_per_block = 256 + + with ctx.task(lX.read(), lY.rw(), token.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + print(nb_stream) + dX = t.get_arg_numba(0) + dY = t.get_arg_numba(1) + axpy[blocks, threads_per_block, nb_stream](2.0, dX, dY) + + with ctx.task(lX.read(), lY.rw(), token.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + print(nb_stream) + dX, dY = t.numba_arguments() + axpy[blocks, threads_per_block, nb_stream](2.0, dX, dY) + + ctx.finalize() + + # Sanity checks: verify the results after finalize + # First task: Y = 2.0 * X + Y = 2.0 * 1.0 + 1.0 = 3.0 + # Second task: Y = 2.0 * X + Y = 2.0 * 1.0 + 3.0 = 5.0 + assert np.allclose(X, 1.0), f"X should still be 1.0 (read-only), but got {X[0]}" + assert np.allclose(Y, 5.0), ( + f"Y should be 5.0 after two axpy operations, but got {Y[0]}" + ) + print(f"✓ X = {X[0]} (expected 1.0)") + print(f"✓ Y = {Y[0]} (expected 5.0)") + + +if __name__ == "__main__": + print("Running CUDASTF examples...") + test_token() From ec9c9553f68ea70cb363e894baea8e50171dd3e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 8 Oct 2025 23:04:18 +0200 Subject: [PATCH 194/221] remove debug print --- python/cuda_cccl/tests/stf/example_fluid_warp.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py index 72c33e860de..c1d903b9be7 100644 --- a/python/cuda_cccl/tests/stf/example_fluid_warp.py +++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py @@ -341,13 +341,7 @@ def __init__(self): # Store STF logical data consistently with other arrays self.u0._stf_ld = self._stf_ctx.logical_data(self.u0, device_place) self.u1._stf_ld = self._stf_ctx.logical_data(self.u1, device_place) - print( - "✅ Successfully created vec2 STF logical data (automatically flattened by STF)!" 
- ) - print("✅ All arrays created with explicit data place specification!") - - # Set descriptive symbols for STF logical data (for enhanced tracing) self.rho0._stf_ld.set_symbol("density_current") self.rho1._stf_ld.set_symbol("density_next") self.p0._stf_ld.set_symbol("pressure_current") @@ -355,7 +349,6 @@ def __init__(self): self.div._stf_ld.set_symbol("velocity_divergence") self.u0._stf_ld.set_symbol("velocity_current") self.u1._stf_ld.set_symbol("velocity_next") - print("✅ Set descriptive symbols for STF logical data!") # Set Warp array names (for Warp tracing) self.u0._name = "u0" From 52f48230fa8fc5adfed25e8bcfc9179e85324f16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 8 Oct 2025 23:51:57 +0200 Subject: [PATCH 195/221] python cholesky with cupy --- .../cuda_cccl/tests/stf/example_cholesky.py | 529 ++++++++++++++++++ 1 file changed, 529 insertions(+) create mode 100755 python/cuda_cccl/tests/stf/example_cholesky.py diff --git a/python/cuda_cccl/tests/stf/example_cholesky.py b/python/cuda_cccl/tests/stf/example_cholesky.py new file mode 100755 index 00000000000..d696967c3e7 --- /dev/null +++ b/python/cuda_cccl/tests/stf/example_cholesky.py @@ -0,0 +1,529 @@ +#!/usr/bin/env python3 +""" +Python implementation of Cholesky decomposition using CUDA STF and CuPy (CUBLAS/CUSOLVER). + +This example demonstrates: +- Tiled matrix operations with STF logical data +- Integration of CuPy's CUBLAS and CUSOLVER functions with STF tasks +- Multi-device execution with automatic data placement +- Task-based parallelism for linear algebra operations + +Note: CUDASTF automatically manages device context within tasks via exec_place.device(). +There's no need to manually set the current device in task bodies - just use the STF stream. +""" + +import sys +import numpy as np +import cupy as cp +from cupyx.scipy import linalg as cp_linalg +import cuda.stf as stf + + +class TiledMatrix: + """ + Tiled matrix class that splits a matrix into blocks for parallel processing. + Each block is managed as an STF logical data object. + """ + + def __init__(self, ctx, nrows, ncols, block_rows, block_cols, is_symmetric=False, symbol="matrix", dtype=np.float64): + """ + Initialize a tiled matrix. 
+ + Args: + ctx: STF context + nrows: Total number of rows + ncols: Total number of columns + block_rows: Block size (rows) + block_cols: Block size (columns) + is_symmetric: If True, only stores lower triangular blocks + symbol: Name/symbol for the matrix + dtype: Data type (default: np.float64) + """ + self.ctx = ctx + self.symbol = symbol + self.dtype = dtype + + self.m = nrows + self.n = ncols + self.mb = block_rows + self.nb = block_cols + self.sym_matrix = is_symmetric + + assert self.m % self.mb == 0, f"nrows ({self.m}) must be divisible by block_rows ({self.mb})" + assert self.n % self.nb == 0, f"ncols ({self.n}) must be divisible by block_cols ({self.nb})" + + # Number of blocks + self.mt = self.m // self.mb + self.nt = self.n // self.nb + + # Allocate host memory (pinned for faster transfers) + self.h_array = cp.cuda.alloc_pinned_memory(self.m * self.n * np.dtype(dtype).itemsize) + self.h_array_np = np.frombuffer(self.h_array, dtype=dtype).reshape(self.m, self.n) + + # Create logical data handles for each block + self.handles = {} + + # Get available devices for mapping + self.ndevs = cp.cuda.runtime.getDeviceCount() + self.grid_p, self.grid_q = self._compute_device_grid(self.ndevs) + + print(f"[{symbol}] {self.m}x{self.n} matrix, {self.mt}x{self.nt} blocks of {self.mb}x{self.nb}") + print(f"[{symbol}] Using {self.ndevs} devices in {self.grid_p}x{self.grid_q} grid") + + # Create blocks + for colb in range(self.nt): + low_rowb = colb if self.sym_matrix else 0 + for rowb in range(low_rowb, self.mt): + # Get tile data from host array (using tiled storage) + tile_data = self._get_block_h(rowb, colb) + + # Create CuPy array on the preferred device + devid = self.get_preferred_devid(rowb, colb) + with cp.cuda.Device(devid): + # Create device array for this block + d_block = cp.asarray(tile_data) + + # Create STF logical data + device_place = stf.data_place.device(devid) + handle = self.ctx.logical_data(d_block, device_place) + handle.set_symbol(f"{symbol}_{rowb}_{colb}") + + self.handles[(rowb, colb)] = (handle, d_block) + + def _compute_device_grid(self, ndevs): + """Compute 2D device grid dimensions (as close to square as possible)""" + grid_p = 1 + grid_q = ndevs + for a in range(1, int(np.sqrt(ndevs)) + 1): + if ndevs % a == 0: + grid_p = a + grid_q = ndevs // a + return grid_p, grid_q + + def get_preferred_devid(self, row, col): + """Get preferred device ID for a given block using cyclic distribution""" + return (row % self.grid_p) + (col % self.grid_q) * self.grid_p + + def handle(self, row, col): + """Get the logical data handle for block (row, col)""" + return self.handles[(row, col)][0] + + def device_array(self, row, col): + """Get the CuPy device array for block (row, col)""" + return self.handles[(row, col)][1] + + def _get_index(self, row, col): + """Convert (row, col) to linear index in tiled storage""" + # Find which tile contains this element + tile_row = row // self.mb + tile_col = col // self.nb + + tile_size = self.mb * self.nb + + # Index of the beginning of the tile + tile_start = (tile_row + self.mt * tile_col) * tile_size + + # Offset within the tile + offset = (row % self.mb) + (col % self.nb) * self.mb + + return tile_start + offset + + def _get_block_h(self, brow, bcol): + """Get a view of the host data for block (brow, bcol)""" + # For tiled storage, blocks are stored contiguously + start_idx = (brow + self.mt * bcol) * self.mb * self.nb + end_idx = start_idx + self.mb * self.nb + flat_view = self.h_array_np.ravel() + return 
flat_view[start_idx:end_idx].reshape(self.mb, self.nb) + + def fill(self, func): + """Fill matrix using a function func(row, col) -> value""" + print(f"[{self.symbol}] Filling matrix...") + + for colb in range(self.nt): + low_rowb = colb if self.sym_matrix else 0 + for rowb in range(low_rowb, self.mt): + devid = self.get_preferred_devid(rowb, colb) + handle = self.handle(rowb, colb) + d_array = self.device_array(rowb, colb) + + # Fill on host then copy to device + h_block = self._get_block_h(rowb, colb) + for lrow in range(self.mb): + for lcol in range(self.nb): + row = lrow + rowb * self.mb + col = lcol + colb * self.nb + h_block[lrow, lcol] = func(row, col) + + # Copy to device + with cp.cuda.Device(devid): + cp.copyto(d_array, cp.asarray(h_block)) + + def finalize(self): + """Copy all blocks back to host memory""" + print(f"[{self.symbol}] Finalizing (copying back to host)...") + for colb in range(self.nt): + low_rowb = colb if self.sym_matrix else 0 + for rowb in range(low_rowb, self.mt): + devid = self.get_preferred_devid(rowb, colb) + d_array = self.device_array(rowb, colb) + h_block = self._get_block_h(rowb, colb) + + with cp.cuda.Device(devid): + cp.copyto(h_block, cp.asnumpy(d_array)) + + +# BLAS/LAPACK operations wrapped in STF tasks + +def DPOTRF(ctx, A, row, col): + """Cholesky factorization of block (row, col) using CUSOLVER""" + handle = A.handle(row, col) + d_block = A.device_array(row, col) + devid = A.get_preferred_devid(row, col) + + with ctx.task(stf.exec_place.device(devid), handle.rw()) as t: + # STF automatically sets the current device, just use the stream + stream_ptr = t.stream_ptr() + cp_stream = cp.cuda.ExternalStream(stream_ptr) + + with cp_stream: + # Perform Cholesky factorization (lower triangular) - IN PLACE + # CuPy's cholesky returns L where A = L @ L.T + d_block[:] = cp.linalg.cholesky(d_block) + +def DTRSM(ctx, A, a_row, a_col, B, b_row, b_col, side='L', uplo='L', transa='T', diag='N', alpha=1.0): + """Triangular solve: B = alpha * op(A)^{-1} @ B or B = alpha * B @ op(A)^{-1}""" + handle_a = A.handle(a_row, a_col) + handle_b = B.handle(b_row, b_col) + d_a = A.device_array(a_row, a_col) + d_b = B.device_array(b_row, b_col) + devid = B.get_preferred_devid(b_row, b_col) + + with ctx.task(stf.exec_place.device(devid), handle_a.read(), handle_b.rw()) as t: + # STF automatically sets the current device + stream_ptr = t.stream_ptr() + cp_stream = cp.cuda.ExternalStream(stream_ptr) + + with cp_stream: + # Triangular solve using CuPy + # For side='L': solve op(A) @ X = B for X, then X = alpha * X + if side == 'L': + if transa == 'N': + # Solve A @ X = B where A is lower/upper triangular + d_b[:] = cp_linalg.solve_triangular(d_a, d_b, lower=(uplo == 'L')) + else: + # Solve A^T @ X = B where A is lower triangular + # This is equivalent to solving U @ X = B where U = A^T is upper + d_b[:] = cp_linalg.solve_triangular(d_a.T, d_b, lower=(uplo != 'L')) + if alpha != 1.0: + d_b *= alpha + else: + # For side='R': solve X @ op(A) = B + # Rewrite as op(A)^T @ X^T = B^T + if transa == 'N': + d_b[:] = cp_linalg.solve_triangular(d_a.T, d_b.T, lower=(uplo != 'L')).T + else: + d_b[:] = cp_linalg.solve_triangular(d_a, d_b.T, lower=(uplo == 'L')).T + if alpha != 1.0: + d_b *= alpha + +def DGEMM(ctx, A, a_row, a_col, B, b_row, b_col, C, c_row, c_col, + transa='N', transb='N', alpha=1.0, beta=1.0): + """Matrix multiplication: C = alpha * op(A) @ op(B) + beta * C""" + handle_a = A.handle(a_row, a_col) + handle_b = B.handle(b_row, b_col) + handle_c = C.handle(c_row, c_col) + d_a = 
A.device_array(a_row, a_col) + d_b = B.device_array(b_row, b_col) + d_c = C.device_array(c_row, c_col) + devid = C.get_preferred_devid(c_row, c_col) + + with ctx.task(stf.exec_place.device(devid), handle_a.read(), handle_b.read(), handle_c.rw()) as t: + # STF automatically sets the current device + stream_ptr = t.stream_ptr() + cp_stream = cp.cuda.ExternalStream(stream_ptr) + + with cp_stream: + # Apply transposes + op_a = d_a.T if transa != 'N' else d_a + op_b = d_b.T if transb != 'N' else d_b + + # C = alpha * op(A) @ op(B) + beta * C (IN PLACE) + if beta == 0.0: + d_c[:] = alpha * (op_a @ op_b) + elif beta == 1.0: + d_c[:] += alpha * (op_a @ op_b) + else: + d_c[:] = alpha * (op_a @ op_b) + beta * d_c + +def DSYRK(ctx, A, a_row, a_col, C, c_row, c_col, uplo='L', trans='N', alpha=1.0, beta=1.0): + """Symmetric rank-k update: C = alpha * op(A) @ op(A).T + beta * C""" + handle_a = A.handle(a_row, a_col) + handle_c = C.handle(c_row, c_col) + d_a = A.device_array(a_row, a_col) + d_c = C.device_array(c_row, c_col) + devid = C.get_preferred_devid(c_row, c_col) + + with ctx.task(stf.exec_place.device(devid), handle_a.read(), handle_c.rw()) as t: + # STF automatically sets the current device + stream_ptr = t.stream_ptr() + cp_stream = cp.cuda.ExternalStream(stream_ptr) + + with cp_stream: + # Apply transpose + op_a = d_a.T if trans != 'N' else d_a + + # C = alpha * op(A) @ op(A).T + beta * C (IN PLACE) + if beta == 0.0: + d_c[:] = alpha * (op_a @ op_a.T) + elif beta == 1.0: + d_c[:] += alpha * (op_a @ op_a.T) + else: + d_c[:] = alpha * (op_a @ op_a.T) + beta * d_c + + +# High-level algorithms + +def PDPOTRF(ctx, A): + """Parallel tiled Cholesky factorization (blocked algorithm)""" + print(f"\n[PDPOTRF] Starting Cholesky factorization...") + + assert A.m == A.n, "Matrix must be square" + assert A.mt == A.nt, "Block grid must be square" + assert A.sym_matrix, "Matrix must be symmetric" + + nblocks = A.mt + + for k in range(nblocks): + # Factor diagonal block + DPOTRF(ctx, A, k, k) + + # Solve triangular systems for blocks in column k + for row in range(k + 1, nblocks): + DTRSM(ctx, A, k, k, A, row, k, side='R', uplo='L', transa='T', diag='N', alpha=1.0) + + # Update trailing matrix + for col in range(k + 1, row): + DGEMM(ctx, A, row, k, A, col, k, A, row, col, transa='N', transb='T', alpha=-1.0, beta=1.0) + + # Symmetric rank-k update of diagonal block + DSYRK(ctx, A, row, k, A, row, row, uplo='L', trans='N', alpha=-1.0, beta=1.0) + + print(f"[PDPOTRF] Completed") + +def PDTRSM(ctx, A, B, side='L', uplo='L', trans='N', diag='N', alpha=1.0): + """Parallel tiled triangular solve""" + print(f"\n[PDTRSM] Starting triangular solve...") + + if side == 'L': + if uplo == 'L': + if trans == 'N': + # Forward substitution + for k in range(B.mt): + lalpha = alpha if k == 0 else 1.0 + for n in range(B.nt): + DTRSM(ctx, A, k, k, B, k, n, side='L', uplo='L', transa='N', diag=diag, alpha=lalpha) + for m in range(k + 1, B.mt): + for n in range(B.nt): + DGEMM(ctx, A, m, k, B, k, n, B, m, n, transa='N', transb='N', alpha=-1.0, beta=lalpha) + else: # trans == 'T' or 'C' + # Backward substitution + for k in range(B.mt): + lalpha = alpha if k == 0 else 1.0 + for n in range(B.nt): + DTRSM(ctx, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n, + side='L', uplo='L', transa='T', diag=diag, alpha=lalpha) + for m in range(k + 1, B.mt): + for n in range(B.nt): + DGEMM(ctx, A, B.mt - k - 1, B.mt - 1 - m, B, B.mt - k - 1, n, + B, B.mt - 1 - m, n, transa='T', transb='N', alpha=-1.0, beta=lalpha) + + print(f"[PDTRSM] Completed") 
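+
+# The tiled routines above express the classic right-looking blocked Cholesky
+# (POTRF on the diagonal tile, TRSM on the panel, then GEMM/SYRK trailing
+# updates) as STF tasks. The helper below is a minimal, host-only NumPy sketch
+# of the same tile-level math, kept purely as a reference: it is not called by
+# the task-based path, ignores streams and devices, and assumes a square
+# symmetric positive-definite matrix whose size is a multiple of the tile size.
+def _reference_blocked_cholesky(M, nb):
+    """Return the lower Cholesky factor of M, computed tile by tile (reference only)."""
+    nt = M.shape[0] // nb
+    A = [[M[i * nb:(i + 1) * nb, j * nb:(j + 1) * nb].copy() for j in range(nt)]
+         for i in range(nt)]
+    for k in range(nt):
+        A[k][k] = np.linalg.cholesky(A[k][k])  # POTRF on the diagonal tile
+        for i in range(k + 1, nt):
+            # TRSM: A_ik <- A_ik @ L_kk^{-T} (dense solve used for clarity, not speed)
+            A[i][k] = np.linalg.solve(A[k][k], A[i][k].T).T
+        for i in range(k + 1, nt):
+            for j in range(k + 1, i):
+                A[i][j] -= A[i][k] @ A[j][k].T  # GEMM trailing update
+            A[i][i] -= A[i][k] @ A[i][k].T  # SYRK update of the diagonal tile
+    return np.tril(np.block(A))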
+ +def PDPOTRS(ctx, A, B, uplo='L'): + """Solve A @ X = B where A is factored by Cholesky (A = L @ L.T)""" + print(f"\n[PDPOTRS] Solving linear system...") + + # First solve: L @ Y = B + PDTRSM(ctx, A, B, side='L', uplo=uplo, trans='N' if uplo == 'L' else 'T', diag='N', alpha=1.0) + + # Second solve: L.T @ X = Y + PDTRSM(ctx, A, B, side='L', uplo=uplo, trans='T' if uplo == 'L' else 'N', diag='N', alpha=1.0) + + print(f"[PDPOTRS] Completed") + +def PDGEMM(ctx, A, B, C, transa='N', transb='N', alpha=1.0, beta=1.0): + """Parallel tiled matrix multiplication""" + print(f"\n[PDGEMM] Starting matrix multiplication...") + + for m in range(C.mt): + for n in range(C.nt): + inner_k = A.nt if transa == 'N' else A.mt + + if alpha == 0.0 or inner_k == 0: + # Just scale C + DGEMM(ctx, A, 0, 0, B, 0, 0, C, m, n, transa=transa, transb=transb, alpha=0.0, beta=beta) + elif transa == 'N': + if transb == 'N': + for k in range(A.nt): + zbeta = beta if k == 0 else 1.0 + DGEMM(ctx, A, m, k, B, k, n, C, m, n, transa='N', transb='N', alpha=alpha, beta=zbeta) + else: + for k in range(A.nt): + zbeta = beta if k == 0 else 1.0 + DGEMM(ctx, A, m, k, B, n, k, C, m, n, transa='N', transb='T', alpha=alpha, beta=zbeta) + else: + if transb == 'N': + for k in range(A.mt): + zbeta = beta if k == 0 else 1.0 + DGEMM(ctx, A, k, m, B, k, n, C, m, n, transa='T', transb='N', alpha=alpha, beta=zbeta) + else: + for k in range(A.mt): + zbeta = beta if k == 0 else 1.0 + DGEMM(ctx, A, k, m, B, n, k, C, m, n, transa='T', transb='T', alpha=alpha, beta=zbeta) + + print(f"[PDGEMM] Completed") + +def compute_norm(matrix): + """Compute Frobenius norm of matrix""" + norm_sq = 0.0 + for colb in range(matrix.nt): + low_rowb = colb if matrix.sym_matrix else 0 + for rowb in range(low_rowb, matrix.mt): + d_block = matrix.device_array(rowb, colb) + norm_sq += float(cp.sum(d_block * d_block)) + return np.sqrt(norm_sq) + + +def main(): + import argparse + + parser = argparse.ArgumentParser(description='Tiled Cholesky decomposition with CUDA STF') + parser.add_argument('N', type=int, nargs='?', default=1024, help='Matrix size (default: 1024)') + parser.add_argument('NB', type=int, nargs='?', default=128, help='Block size (default: 128)') + parser.add_argument('--check', action='store_true', help='Check result (slower)') + args = parser.parse_args() + + N = args.N + NB = args.NB + check_result = args.check + + assert N % NB == 0, f"Matrix size {N} must be divisible by block size {NB}" + + print("="*60) + print("Tiled Cholesky Decomposition with CUDA STF + CuPy") + print("="*60) + print(f"Matrix size: {N}x{N}") + print(f"Block size: {NB}x{NB}") + print(f"Number of blocks: {N//NB}x{N//NB}") + print(f"Check result: {check_result}") + print("="*60) + + # Create STF context + ctx = stf.context() + + # Create matrices + A = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=True, symbol="A") + + if check_result: + Aref = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=False, symbol="Aref") + + # Fill with Hilbert matrix + diagonal dominance + # H_{i,j} = 1/(i+j+1) + 2*N if i==j + def hilbert(row, col): + return 1.0 / (row + col + 1.0) + (2.0 * N if row == col else 0.0) + + print("\n" + "="*60) + print("Initializing matrices...") + print("="*60) + + A.fill(hilbert) + if check_result: + Aref.fill(hilbert) + + # Create right-hand side + if check_result: + B = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="B") + Bref = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="Bref") + + def rhs_vals(row, col): + return 1.0 * (row + 1) + + B.fill(rhs_vals) + 
Bref.fill(rhs_vals) + + # Compute ||B|| for residual calculation + Bref_norm = compute_norm(Bref) + + # Synchronize before timing + cp.cuda.runtime.deviceSynchronize() + + # Record start time + start_event = cp.cuda.Event() + stop_event = cp.cuda.Event() + start_event.record() + + # Perform Cholesky factorization + print("\n" + "="*60) + print("Performing Cholesky factorization...") + print("="*60) + PDPOTRF(ctx, A) + + # Record stop time + stop_event.record() + + # Solve system if checking + if check_result: + print("\n" + "="*60) + print("Solving linear system...") + print("="*60) + PDPOTRS(ctx, A, B, uplo='L') + + print("\n" + "="*60) + print("Computing residual...") + print("="*60) + # Compute residual: Bref = Aref @ B - Bref + PDGEMM(ctx, Aref, B, Bref, transa='N', transb='N', alpha=1.0, beta=-1.0) + + # Compute ||residual|| + res_norm = compute_norm(Bref) + + # Finalize STF context + print("\n" + "="*60) + print("Finalizing STF context...") + print("="*60) + ctx.finalize() + + # Wait for completion + stop_event.synchronize() + + # Compute timing + elapsed_ms = cp.cuda.get_elapsed_time(start_event, stop_event) + gflops = (1.0/3.0 * N * N * N) / 1e9 + gflops_per_sec = gflops / (elapsed_ms / 1000.0) + + print("\n" + "="*60) + print("Results") + print("="*60) + print(f"[PDPOTRF] Elapsed time: {elapsed_ms:.2f} ms") + print(f"[PDPOTRF] Performance: {gflops_per_sec:.2f} GFLOPS") + + if check_result: + residual = res_norm / Bref_norm + print(f"\n[POTRS] ||AX - B||: {res_norm:.6e}") + print(f"[POTRS] ||B||: {Bref_norm:.6e}") + print(f"[POTRS] Residual (||AX - B||/||B||): {residual:.6e}") + + if residual >= 0.01: + print("\n❌ Algorithm did not converge (residual >= 0.01)") + return 1 + else: + print("\n✅ Algorithm converged successfully!") + + print("="*60) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) + From 5a3288188cb2ad0561a9d4db44be93e6b7a5615b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 9 Oct 2025 00:26:16 +0200 Subject: [PATCH 196/221] improve cholesky example --- .../cuda_cccl/tests/stf/example_cholesky.py | 737 ++++++++++++------ 1 file changed, 490 insertions(+), 247 deletions(-) diff --git a/python/cuda_cccl/tests/stf/example_cholesky.py b/python/cuda_cccl/tests/stf/example_cholesky.py index d696967c3e7..faef64d1d73 100755 --- a/python/cuda_cccl/tests/stf/example_cholesky.py +++ b/python/cuda_cccl/tests/stf/example_cholesky.py @@ -13,22 +13,84 @@ """ import sys -import numpy as np + import cupy as cp +import numpy as np from cupyx.scipy import linalg as cp_linalg + import cuda.stf as stf +class CAIWrapper: + """Wrapper to expose CUDA Array Interface dict as a proper CAI object.""" + + def __init__(self, cai_dict): + self.__cuda_array_interface__ = cai_dict + + +def get_cupy_arrays(task): + """ + Get all CuPy arrays from STF task arguments. 
+ + Usage: + d_a, d_b, d_c = get_cupy_arrays(t) + """ + arrays = [] + idx = 0 + while True: + try: + arrays.append(cp.asarray(CAIWrapper(task.get_arg_cai(idx)))) + idx += 1 + except: + break + return tuple(arrays) if len(arrays) > 1 else arrays[0] if arrays else None + + +def cai_to_numpy(cai_dict): + """Convert CUDA Array Interface dict to NumPy array (for host memory).""" + import ctypes + + # Extract CAI fields + data_ptr, readonly = cai_dict["data"] + shape = cai_dict["shape"] + typestr = cai_dict["typestr"] + + # Convert typestr to NumPy dtype + dtype = np.dtype(typestr) + + # Calculate total size in bytes + itemsize = dtype.itemsize + size = np.prod(shape) * itemsize + + # Create ctypes buffer from pointer + buffer = (ctypes.c_byte * size).from_address(data_ptr) + + # Create NumPy array from buffer + arr = np.frombuffer(buffer, dtype=dtype).reshape(shape) + + return arr + + class TiledMatrix: """ Tiled matrix class that splits a matrix into blocks for parallel processing. Each block is managed as an STF logical data object. """ - - def __init__(self, ctx, nrows, ncols, block_rows, block_cols, is_symmetric=False, symbol="matrix", dtype=np.float64): + + def __init__( + self, + ctx, + nrows, + ncols, + block_rows, + block_cols, + is_symmetric=False, + symbol="matrix", + dtype=np.float64, + ): """ Initialize a tiled matrix. - + Args: ctx: STF context nrows: Total number of rows @@ -42,54 +104,49 @@ def __init__(self, ctx, nrows, ncols, block_rows, block_cols, is_symmetric=False self.ctx = ctx self.symbol = symbol self.dtype = dtype - + self.m = nrows self.n = ncols self.mb = block_rows self.nb = block_cols self.sym_matrix = is_symmetric - - assert self.m % self.mb == 0, f"nrows ({self.m}) must be divisible by block_rows ({self.mb})" - assert self.n % self.nb == 0, f"ncols ({self.n}) must be divisible by block_cols ({self.nb})" - + + assert self.m % self.mb == 0, ( + f"nrows ({self.m}) must be divisible by block_rows ({self.mb})" + ) + assert self.n % self.nb == 0, ( + f"ncols ({self.n}) must be divisible by block_cols ({self.nb})" + ) + # Number of blocks self.mt = self.m // self.mb self.nt = self.n // self.nb - + # Allocate host memory (pinned for faster transfers) - self.h_array = cp.cuda.alloc_pinned_memory(self.m * self.n * np.dtype(dtype).itemsize) - self.h_array_np = np.frombuffer(self.h_array, dtype=dtype).reshape(self.m, self.n) - + self.h_array = cp.cuda.alloc_pinned_memory( + self.m * self.n * np.dtype(dtype).itemsize + ) + self.h_array_np = np.frombuffer(self.h_array, dtype=dtype).reshape( + self.m, self.n + ) + # Create logical data handles for each block self.handles = {} - + # Get available devices for mapping self.ndevs = cp.cuda.runtime.getDeviceCount() self.grid_p, self.grid_q = self._compute_device_grid(self.ndevs) - - print(f"[{symbol}] {self.m}x{self.n} matrix, {self.mt}x{self.nt} blocks of {self.mb}x{self.nb}") - print(f"[{symbol}] Using {self.ndevs} devices in {self.grid_p}x{self.grid_q} grid") - - # Create blocks - for colb in range(self.nt): - low_rowb = colb if self.sym_matrix else 0 - for rowb in range(low_rowb, self.mt): - # Get tile data from host array (using tiled storage) - tile_data = self._get_block_h(rowb, colb) - - # Create CuPy array on the preferred device - devid = self.get_preferred_devid(rowb, colb) - with cp.cuda.Device(devid): - # Create device array for this block - d_block = cp.asarray(tile_data) - - # Create STF logical data - device_place = stf.data_place.device(devid) - handle = self.ctx.logical_data(d_block, device_place) - 
handle.set_symbol(f"{symbol}_{rowb}_{colb}") - - self.handles[(rowb, colb)] = (handle, d_block) - + + print( + f"[{symbol}] {self.m}x{self.n} matrix, {self.mt}x{self.nt} blocks of {self.mb}x{self.nb}" + ) + print( + f"[{symbol}] Using {self.ndevs} devices in {self.grid_p}x{self.grid_q} grid" + ) + + # Note: We DON'T create logical data here yet - that happens in fill() + # after the host data is initialized + def _compute_device_grid(self, ndevs): """Compute 2D device grid dimensions (as close to square as possible)""" grid_p = 1 @@ -99,35 +156,31 @@ def _compute_device_grid(self, ndevs): grid_p = a grid_q = ndevs // a return grid_p, grid_q - + def get_preferred_devid(self, row, col): """Get preferred device ID for a given block using cyclic distribution""" return (row % self.grid_p) + (col % self.grid_q) * self.grid_p - + def handle(self, row, col): """Get the logical data handle for block (row, col)""" - return self.handles[(row, col)][0] - - def device_array(self, row, col): - """Get the CuPy device array for block (row, col)""" - return self.handles[(row, col)][1] - + return self.handles[(row, col)] + def _get_index(self, row, col): """Convert (row, col) to linear index in tiled storage""" # Find which tile contains this element tile_row = row // self.mb tile_col = col // self.nb - + tile_size = self.mb * self.nb - + # Index of the beginning of the tile tile_start = (tile_row + self.mt * tile_col) * tile_size - + # Offset within the tile offset = (row % self.mb) + (col % self.nb) * self.mb - + return tile_start + offset - + def _get_block_h(self, brow, bcol): """Get a view of the host data for block (brow, bcol)""" # For tiled storage, blocks are stored contiguously @@ -135,120 +188,114 @@ def _get_block_h(self, brow, bcol): end_idx = start_idx + self.mb * self.nb flat_view = self.h_array_np.ravel() return flat_view[start_idx:end_idx].reshape(self.mb, self.nb) - + def fill(self, func): - """Fill matrix using a function func(row, col) -> value""" - print(f"[{self.symbol}] Filling matrix...") - + """Fill matrix on host, then create STF logical data that will transfer automatically""" + print(f"[{self.symbol}] Filling matrix on host...") + for colb in range(self.nt): low_rowb = colb if self.sym_matrix else 0 for rowb in range(low_rowb, self.mt): - devid = self.get_preferred_devid(rowb, colb) - handle = self.handle(rowb, colb) - d_array = self.device_array(rowb, colb) - - # Fill on host then copy to device + # Fill host block h_block = self._get_block_h(rowb, colb) for lrow in range(self.mb): for lcol in range(self.nb): row = lrow + rowb * self.mb col = lcol + colb * self.nb h_block[lrow, lcol] = func(row, col) - - # Copy to device - with cp.cuda.Device(devid): - cp.copyto(d_array, cp.asarray(h_block)) - - def finalize(self): - """Copy all blocks back to host memory""" - print(f"[{self.symbol}] Finalizing (copying back to host)...") - for colb in range(self.nt): - low_rowb = colb if self.sym_matrix else 0 - for rowb in range(low_rowb, self.mt): - devid = self.get_preferred_devid(rowb, colb) - d_array = self.device_array(rowb, colb) - h_block = self._get_block_h(rowb, colb) - - with cp.cuda.Device(devid): - cp.copyto(h_block, cp.asnumpy(d_array)) + + handle = self.ctx.logical_data(h_block) + handle.set_symbol(f"{self.symbol}_{rowb}_{colb}") + + self.handles[(rowb, colb)] = handle # BLAS/LAPACK operations wrapped in STF tasks + def DPOTRF(ctx, A, row, col): """Cholesky factorization of block (row, col) using CUSOLVER""" handle = A.handle(row, col) - d_block = A.device_array(row, col) devid = 
A.get_preferred_devid(row, col) - + with ctx.task(stf.exec_place.device(devid), handle.rw()) as t: - # STF automatically sets the current device, just use the stream - stream_ptr = t.stream_ptr() - cp_stream = cp.cuda.ExternalStream(stream_ptr) - - with cp_stream: - # Perform Cholesky factorization (lower triangular) - IN PLACE - # CuPy's cholesky returns L where A = L @ L.T + d_block = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): d_block[:] = cp.linalg.cholesky(d_block) -def DTRSM(ctx, A, a_row, a_col, B, b_row, b_col, side='L', uplo='L', transa='T', diag='N', alpha=1.0): + +def DTRSM( + ctx, + A, + a_row, + a_col, + B, + b_row, + b_col, + side="L", + uplo="L", + transa="T", + diag="N", + alpha=1.0, +): """Triangular solve: B = alpha * op(A)^{-1} @ B or B = alpha * B @ op(A)^{-1}""" handle_a = A.handle(a_row, a_col) handle_b = B.handle(b_row, b_col) - d_a = A.device_array(a_row, a_col) - d_b = B.device_array(b_row, b_col) devid = B.get_preferred_devid(b_row, b_col) - + with ctx.task(stf.exec_place.device(devid), handle_a.read(), handle_b.rw()) as t: - # STF automatically sets the current device - stream_ptr = t.stream_ptr() - cp_stream = cp.cuda.ExternalStream(stream_ptr) - - with cp_stream: - # Triangular solve using CuPy - # For side='L': solve op(A) @ X = B for X, then X = alpha * X - if side == 'L': - if transa == 'N': - # Solve A @ X = B where A is lower/upper triangular - d_b[:] = cp_linalg.solve_triangular(d_a, d_b, lower=(uplo == 'L')) + d_a, d_b = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + if side == "L": + if transa == "N": + d_b[:] = cp_linalg.solve_triangular(d_a, d_b, lower=(uplo == "L")) else: - # Solve A^T @ X = B where A is lower triangular - # This is equivalent to solving U @ X = B where U = A^T is upper - d_b[:] = cp_linalg.solve_triangular(d_a.T, d_b, lower=(uplo != 'L')) + d_b[:] = cp_linalg.solve_triangular(d_a.T, d_b, lower=(uplo != "L")) if alpha != 1.0: d_b *= alpha else: - # For side='R': solve X @ op(A) = B - # Rewrite as op(A)^T @ X^T = B^T - if transa == 'N': - d_b[:] = cp_linalg.solve_triangular(d_a.T, d_b.T, lower=(uplo != 'L')).T + if transa == "N": + d_b[:] = cp_linalg.solve_triangular( + d_a.T, d_b.T, lower=(uplo != "L") + ).T else: - d_b[:] = cp_linalg.solve_triangular(d_a, d_b.T, lower=(uplo == 'L')).T + d_b[:] = cp_linalg.solve_triangular( + d_a, d_b.T, lower=(uplo == "L") + ).T if alpha != 1.0: d_b *= alpha -def DGEMM(ctx, A, a_row, a_col, B, b_row, b_col, C, c_row, c_col, - transa='N', transb='N', alpha=1.0, beta=1.0): + +def DGEMM( + ctx, + A, + a_row, + a_col, + B, + b_row, + b_col, + C, + c_row, + c_col, + transa="N", + transb="N", + alpha=1.0, + beta=1.0, +): """Matrix multiplication: C = alpha * op(A) @ op(B) + beta * C""" handle_a = A.handle(a_row, a_col) handle_b = B.handle(b_row, b_col) handle_c = C.handle(c_row, c_col) - d_a = A.device_array(a_row, a_col) - d_b = B.device_array(b_row, b_col) - d_c = C.device_array(c_row, c_col) devid = C.get_preferred_devid(c_row, c_col) - - with ctx.task(stf.exec_place.device(devid), handle_a.read(), handle_b.read(), handle_c.rw()) as t: - # STF automatically sets the current device - stream_ptr = t.stream_ptr() - cp_stream = cp.cuda.ExternalStream(stream_ptr) - - with cp_stream: - # Apply transposes - op_a = d_a.T if transa != 'N' else d_a - op_b = d_b.T if transb != 'N' else d_b - - # C = alpha * op(A) @ op(B) + beta * C (IN PLACE) + + with ctx.task( + stf.exec_place.device(devid), handle_a.read(), handle_b.read(), handle_c.rw() + ) as t: + d_a, d_b, d_c = 
get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + op_a = d_a.T if transa != "N" else d_a + op_b = d_b.T if transb != "N" else d_b + if beta == 0.0: d_c[:] = alpha * (op_a @ op_b) elif beta == 1.0: @@ -256,24 +303,20 @@ def DGEMM(ctx, A, a_row, a_col, B, b_row, b_col, C, c_row, c_col, else: d_c[:] = alpha * (op_a @ op_b) + beta * d_c -def DSYRK(ctx, A, a_row, a_col, C, c_row, c_col, uplo='L', trans='N', alpha=1.0, beta=1.0): + +def DSYRK( + ctx, A, a_row, a_col, C, c_row, c_col, uplo="L", trans="N", alpha=1.0, beta=1.0 +): """Symmetric rank-k update: C = alpha * op(A) @ op(A).T + beta * C""" handle_a = A.handle(a_row, a_col) handle_c = C.handle(c_row, c_col) - d_a = A.device_array(a_row, a_col) - d_c = C.device_array(c_row, c_col) devid = C.get_preferred_devid(c_row, c_col) - + with ctx.task(stf.exec_place.device(devid), handle_a.read(), handle_c.rw()) as t: - # STF automatically sets the current device - stream_ptr = t.stream_ptr() - cp_stream = cp.cuda.ExternalStream(stream_ptr) - - with cp_stream: - # Apply transpose - op_a = d_a.T if trans != 'N' else d_a - - # C = alpha * op(A) @ op(A).T + beta * C (IN PLACE) + d_a, d_c = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + op_a = d_a.T if trans != "N" else d_a + if beta == 0.0: d_c[:] = alpha * (op_a @ op_a.T) elif beta == 1.0: @@ -284,246 +327,446 @@ def DSYRK(ctx, A, a_row, a_col, C, c_row, c_col, uplo='L', trans='N', alpha=1.0, # High-level algorithms + def PDPOTRF(ctx, A): """Parallel tiled Cholesky factorization (blocked algorithm)""" - print(f"\n[PDPOTRF] Starting Cholesky factorization...") - + print("\n[PDPOTRF] Starting Cholesky factorization...") + assert A.m == A.n, "Matrix must be square" assert A.mt == A.nt, "Block grid must be square" assert A.sym_matrix, "Matrix must be symmetric" - + nblocks = A.mt - + for k in range(nblocks): # Factor diagonal block DPOTRF(ctx, A, k, k) - + # Solve triangular systems for blocks in column k for row in range(k + 1, nblocks): - DTRSM(ctx, A, k, k, A, row, k, side='R', uplo='L', transa='T', diag='N', alpha=1.0) - + DTRSM( + ctx, + A, + k, + k, + A, + row, + k, + side="R", + uplo="L", + transa="T", + diag="N", + alpha=1.0, + ) + # Update trailing matrix for col in range(k + 1, row): - DGEMM(ctx, A, row, k, A, col, k, A, row, col, transa='N', transb='T', alpha=-1.0, beta=1.0) - + DGEMM( + ctx, + A, + row, + k, + A, + col, + k, + A, + row, + col, + transa="N", + transb="T", + alpha=-1.0, + beta=1.0, + ) + # Symmetric rank-k update of diagonal block - DSYRK(ctx, A, row, k, A, row, row, uplo='L', trans='N', alpha=-1.0, beta=1.0) - - print(f"[PDPOTRF] Completed") + DSYRK( + ctx, A, row, k, A, row, row, uplo="L", trans="N", alpha=-1.0, beta=1.0 + ) + + print("[PDPOTRF] Completed") + -def PDTRSM(ctx, A, B, side='L', uplo='L', trans='N', diag='N', alpha=1.0): +def PDTRSM(ctx, A, B, side="L", uplo="L", trans="N", diag="N", alpha=1.0): """Parallel tiled triangular solve""" - print(f"\n[PDTRSM] Starting triangular solve...") - - if side == 'L': - if uplo == 'L': - if trans == 'N': + print("\n[PDTRSM] Starting triangular solve...") + + if side == "L": + if uplo == "L": + if trans == "N": # Forward substitution for k in range(B.mt): lalpha = alpha if k == 0 else 1.0 for n in range(B.nt): - DTRSM(ctx, A, k, k, B, k, n, side='L', uplo='L', transa='N', diag=diag, alpha=lalpha) + DTRSM( + ctx, + A, + k, + k, + B, + k, + n, + side="L", + uplo="L", + transa="N", + diag=diag, + alpha=lalpha, + ) for m in range(k + 1, B.mt): for n in range(B.nt): - DGEMM(ctx, A, m, k, B, k, n, 
B, m, n, transa='N', transb='N', alpha=-1.0, beta=lalpha) + DGEMM( + ctx, + A, + m, + k, + B, + k, + n, + B, + m, + n, + transa="N", + transb="N", + alpha=-1.0, + beta=lalpha, + ) else: # trans == 'T' or 'C' # Backward substitution for k in range(B.mt): lalpha = alpha if k == 0 else 1.0 for n in range(B.nt): - DTRSM(ctx, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n, - side='L', uplo='L', transa='T', diag=diag, alpha=lalpha) + DTRSM( + ctx, + A, + B.mt - k - 1, + B.mt - k - 1, + B, + B.mt - k - 1, + n, + side="L", + uplo="L", + transa="T", + diag=diag, + alpha=lalpha, + ) for m in range(k + 1, B.mt): for n in range(B.nt): - DGEMM(ctx, A, B.mt - k - 1, B.mt - 1 - m, B, B.mt - k - 1, n, - B, B.mt - 1 - m, n, transa='T', transb='N', alpha=-1.0, beta=lalpha) - - print(f"[PDTRSM] Completed") + DGEMM( + ctx, + A, + B.mt - k - 1, + B.mt - 1 - m, + B, + B.mt - k - 1, + n, + B, + B.mt - 1 - m, + n, + transa="T", + transb="N", + alpha=-1.0, + beta=lalpha, + ) + + print("[PDTRSM] Completed") + -def PDPOTRS(ctx, A, B, uplo='L'): +def PDPOTRS(ctx, A, B, uplo="L"): """Solve A @ X = B where A is factored by Cholesky (A = L @ L.T)""" - print(f"\n[PDPOTRS] Solving linear system...") - + print("\n[PDPOTRS] Solving linear system...") + # First solve: L @ Y = B - PDTRSM(ctx, A, B, side='L', uplo=uplo, trans='N' if uplo == 'L' else 'T', diag='N', alpha=1.0) - + PDTRSM( + ctx, + A, + B, + side="L", + uplo=uplo, + trans="N" if uplo == "L" else "T", + diag="N", + alpha=1.0, + ) + # Second solve: L.T @ X = Y - PDTRSM(ctx, A, B, side='L', uplo=uplo, trans='T' if uplo == 'L' else 'N', diag='N', alpha=1.0) - - print(f"[PDPOTRS] Completed") + PDTRSM( + ctx, + A, + B, + side="L", + uplo=uplo, + trans="T" if uplo == "L" else "N", + diag="N", + alpha=1.0, + ) + + print("[PDPOTRS] Completed") + -def PDGEMM(ctx, A, B, C, transa='N', transb='N', alpha=1.0, beta=1.0): +def PDGEMM(ctx, A, B, C, transa="N", transb="N", alpha=1.0, beta=1.0): """Parallel tiled matrix multiplication""" - print(f"\n[PDGEMM] Starting matrix multiplication...") - + print("\n[PDGEMM] Starting matrix multiplication...") + for m in range(C.mt): for n in range(C.nt): - inner_k = A.nt if transa == 'N' else A.mt - + inner_k = A.nt if transa == "N" else A.mt + if alpha == 0.0 or inner_k == 0: # Just scale C - DGEMM(ctx, A, 0, 0, B, 0, 0, C, m, n, transa=transa, transb=transb, alpha=0.0, beta=beta) - elif transa == 'N': - if transb == 'N': + DGEMM( + ctx, + A, + 0, + 0, + B, + 0, + 0, + C, + m, + n, + transa=transa, + transb=transb, + alpha=0.0, + beta=beta, + ) + elif transa == "N": + if transb == "N": for k in range(A.nt): zbeta = beta if k == 0 else 1.0 - DGEMM(ctx, A, m, k, B, k, n, C, m, n, transa='N', transb='N', alpha=alpha, beta=zbeta) + DGEMM( + ctx, + A, + m, + k, + B, + k, + n, + C, + m, + n, + transa="N", + transb="N", + alpha=alpha, + beta=zbeta, + ) else: for k in range(A.nt): zbeta = beta if k == 0 else 1.0 - DGEMM(ctx, A, m, k, B, n, k, C, m, n, transa='N', transb='T', alpha=alpha, beta=zbeta) + DGEMM( + ctx, + A, + m, + k, + B, + n, + k, + C, + m, + n, + transa="N", + transb="T", + alpha=alpha, + beta=zbeta, + ) else: - if transb == 'N': + if transb == "N": for k in range(A.mt): zbeta = beta if k == 0 else 1.0 - DGEMM(ctx, A, k, m, B, k, n, C, m, n, transa='T', transb='N', alpha=alpha, beta=zbeta) + DGEMM( + ctx, + A, + k, + m, + B, + k, + n, + C, + m, + n, + transa="T", + transb="N", + alpha=alpha, + beta=zbeta, + ) else: for k in range(A.mt): zbeta = beta if k == 0 else 1.0 - DGEMM(ctx, A, k, m, B, n, k, C, m, n, transa='T', 
transb='T', alpha=alpha, beta=zbeta) - - print(f"[PDGEMM] Completed") + DGEMM( + ctx, + A, + k, + m, + B, + n, + k, + C, + m, + n, + transa="T", + transb="T", + alpha=alpha, + beta=zbeta, + ) + + print("[PDGEMM] Completed") -def compute_norm(matrix): - """Compute Frobenius norm of matrix""" + +def compute_norm(ctx, matrix): + """Compute Frobenius norm of matrix using host tasks""" norm_sq = 0.0 + for colb in range(matrix.nt): low_rowb = colb if matrix.sym_matrix else 0 for rowb in range(low_rowb, matrix.mt): - d_block = matrix.device_array(rowb, colb) - norm_sq += float(cp.sum(d_block * d_block)) + handle = matrix.handle(rowb, colb) + + # Host task to read the block and compute norm + def compute_block_norm(h_block): + nonlocal norm_sq + norm_sq += np.sum(h_block * h_block) + + with ctx.task(stf.exec_place.host(), handle.read()) as t: + # Synchronize the stream before reading data + cp.cuda.runtime.streamSynchronize(t.stream_ptr()) + + h_block = cai_to_numpy(t.get_arg_cai(0)) + compute_block_norm(h_block) + return np.sqrt(norm_sq) def main(): import argparse - - parser = argparse.ArgumentParser(description='Tiled Cholesky decomposition with CUDA STF') - parser.add_argument('N', type=int, nargs='?', default=1024, help='Matrix size (default: 1024)') - parser.add_argument('NB', type=int, nargs='?', default=128, help='Block size (default: 128)') - parser.add_argument('--check', action='store_true', help='Check result (slower)') + + parser = argparse.ArgumentParser( + description="Tiled Cholesky decomposition with CUDA STF" + ) + parser.add_argument( + "N", type=int, nargs="?", default=1024, help="Matrix size (default: 1024)" + ) + parser.add_argument( + "NB", type=int, nargs="?", default=128, help="Block size (default: 128)" + ) + parser.add_argument("--check", action="store_true", help="Check result (slower)") args = parser.parse_args() - + N = args.N NB = args.NB check_result = args.check - + assert N % NB == 0, f"Matrix size {N} must be divisible by block size {NB}" - - print("="*60) + + print("=" * 60) print("Tiled Cholesky Decomposition with CUDA STF + CuPy") - print("="*60) + print("=" * 60) print(f"Matrix size: {N}x{N}") print(f"Block size: {NB}x{NB}") - print(f"Number of blocks: {N//NB}x{N//NB}") + print(f"Number of blocks: {N // NB}x{N // NB}") print(f"Check result: {check_result}") - print("="*60) - + print("=" * 60) + # Create STF context ctx = stf.context() - + # Create matrices A = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=True, symbol="A") - + if check_result: Aref = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=False, symbol="Aref") - + # Fill with Hilbert matrix + diagonal dominance # H_{i,j} = 1/(i+j+1) + 2*N if i==j def hilbert(row, col): return 1.0 / (row + col + 1.0) + (2.0 * N if row == col else 0.0) - - print("\n" + "="*60) + + print("\n" + "=" * 60) print("Initializing matrices...") - print("="*60) - + print("=" * 60) + A.fill(hilbert) if check_result: Aref.fill(hilbert) - + # Create right-hand side if check_result: B = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="B") Bref = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="Bref") - + def rhs_vals(row, col): return 1.0 * (row + 1) - + B.fill(rhs_vals) Bref.fill(rhs_vals) - + # Compute ||B|| for residual calculation - Bref_norm = compute_norm(Bref) - + Bref_norm = compute_norm(ctx, Bref) + # Synchronize before timing cp.cuda.runtime.deviceSynchronize() - + # Record start time start_event = cp.cuda.Event() stop_event = cp.cuda.Event() start_event.record() - + # Perform Cholesky factorization - 
print("\n" + "="*60) + print("\n" + "=" * 60) print("Performing Cholesky factorization...") - print("="*60) + print("=" * 60) PDPOTRF(ctx, A) - + # Record stop time stop_event.record() - + # Solve system if checking if check_result: - print("\n" + "="*60) + print("\n" + "=" * 60) print("Solving linear system...") - print("="*60) - PDPOTRS(ctx, A, B, uplo='L') - - print("\n" + "="*60) + print("=" * 60) + PDPOTRS(ctx, A, B, uplo="L") + + print("\n" + "=" * 60) print("Computing residual...") - print("="*60) + print("=" * 60) # Compute residual: Bref = Aref @ B - Bref - PDGEMM(ctx, Aref, B, Bref, transa='N', transb='N', alpha=1.0, beta=-1.0) - + PDGEMM(ctx, Aref, B, Bref, transa="N", transb="N", alpha=1.0, beta=-1.0) + # Compute ||residual|| - res_norm = compute_norm(Bref) - + res_norm = compute_norm(ctx, Bref) + # Finalize STF context - print("\n" + "="*60) + print("\n" + "=" * 60) print("Finalizing STF context...") - print("="*60) + print("=" * 60) ctx.finalize() - + # Wait for completion stop_event.synchronize() - + # Compute timing elapsed_ms = cp.cuda.get_elapsed_time(start_event, stop_event) - gflops = (1.0/3.0 * N * N * N) / 1e9 + gflops = (1.0 / 3.0 * N * N * N) / 1e9 gflops_per_sec = gflops / (elapsed_ms / 1000.0) - - print("\n" + "="*60) + + print("\n" + "=" * 60) print("Results") - print("="*60) + print("=" * 60) print(f"[PDPOTRF] Elapsed time: {elapsed_ms:.2f} ms") print(f"[PDPOTRF] Performance: {gflops_per_sec:.2f} GFLOPS") - + if check_result: residual = res_norm / Bref_norm print(f"\n[POTRS] ||AX - B||: {res_norm:.6e}") print(f"[POTRS] ||B||: {Bref_norm:.6e}") print(f"[POTRS] Residual (||AX - B||/||B||): {residual:.6e}") - + if residual >= 0.01: print("\n❌ Algorithm did not converge (residual >= 0.01)") return 1 else: print("\n✅ Algorithm converged successfully!") - - print("="*60) + + print("=" * 60) return 0 if __name__ == "__main__": sys.exit(main()) - From abd577817bcecce9150c20842cabc8173cc69fa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 9 Oct 2025 15:04:36 +0200 Subject: [PATCH 197/221] POTRI and Cholesky --- python/cuda_cccl/tests/stf/example_potri.py | 629 ++++++++++++++++++++ 1 file changed, 629 insertions(+) create mode 100644 python/cuda_cccl/tests/stf/example_potri.py diff --git a/python/cuda_cccl/tests/stf/example_potri.py b/python/cuda_cccl/tests/stf/example_potri.py new file mode 100644 index 00000000000..0be70fc9338 --- /dev/null +++ b/python/cuda_cccl/tests/stf/example_potri.py @@ -0,0 +1,629 @@ +#!/usr/bin/env python3 +""" +Python implementation of POTRI (matrix inversion via Cholesky) using CUDA STF and CuPy. + +POTRI computes the inverse of a symmetric positive definite matrix using its Cholesky factorization: +1. Cholesky factorization: A = L*L^T +2. Triangular inversion: L^(-1) +3. Compute A^(-1) = L^(-T) * L^(-1) + +This example demonstrates: +- Tiled matrix operations with STF logical data +- Integration of CuPy's CUBLAS and CUSOLVER functions with STF tasks +- Multi-device execution with automatic data placement +- Task-based parallelism for linear algebra operations +""" + +import sys +import numpy as np +import cupy as cp +from cupyx.scipy import linalg as cp_linalg +import cuda.stf as stf + + +class CAIWrapper: + """Wrapper to expose CUDA Array Interface dict as a proper CAI object.""" + def __init__(self, cai_dict): + self.__cuda_array_interface__ = cai_dict + + +def get_cupy_arrays(task): + """ + Get all CuPy arrays from STF task arguments. 
+ + Usage: + d_a, d_b, d_c = get_cupy_arrays(t) + """ + arrays = [] + idx = 0 + while True: + try: + arrays.append(cp.asarray(CAIWrapper(task.get_arg_cai(idx)))) + idx += 1 + except: + break + return tuple(arrays) if len(arrays) > 1 else arrays[0] if arrays else None + + +def cai_to_numpy(cai_dict): + """Convert CUDA Array Interface dict to NumPy array (for host memory).""" + import ctypes + + # Extract CAI fields + data_ptr, readonly = cai_dict['data'] + shape = cai_dict['shape'] + typestr = cai_dict['typestr'] + + # Convert typestr to NumPy dtype + dtype = np.dtype(typestr) + + # Calculate total size in bytes + itemsize = dtype.itemsize + size = np.prod(shape) * itemsize + + # Create ctypes buffer from pointer + buffer = (ctypes.c_byte * size).from_address(data_ptr) + + # Create NumPy array from buffer + arr = np.frombuffer(buffer, dtype=dtype).reshape(shape) + + return arr + + +class BlockRef: + """Reference to a specific block in a tiled matrix.""" + def __init__(self, matrix, row, col): + self.matrix = matrix + self.row = row + self.col = col + self._handle = matrix.handle(row, col) + self._devid = matrix.get_preferred_devid(row, col) + + def handle(self): + """Get the STF logical data handle for this block.""" + return self._handle + + def devid(self): + """Get the preferred device ID for this block.""" + return self._devid + + def __repr__(self): + return f"BlockRef({self.matrix.symbol}[{self.row},{self.col}])" + + +class TiledMatrix: + """ + Tiled matrix class that splits a matrix into blocks for parallel processing. + Each block is managed as an STF logical data object. + Uses tiled storage format for contiguous blocks. + """ + def __init__(self, ctx, nrows, ncols, blocksize_rows, blocksize_cols, + is_symmetric=False, symbol="matrix", dtype=np.float64): + self.ctx = ctx + self.symbol = symbol + self.dtype = dtype + self.sym_matrix = is_symmetric + + self.m = nrows + self.n = ncols + self.mb = blocksize_rows + self.nb = blocksize_cols + + assert self.m % self.mb == 0, f"nrows {nrows} must be divisible by blocksize_rows {blocksize_rows}" + assert self.n % self.nb == 0, f"ncols {ncols} must be divisible by blocksize_cols {blocksize_cols}" + + # Number of blocks + self.mt = self.m // self.mb + self.nt = self.n // self.nb + + # Allocate pinned host memory for faster transfers (in tiled format) + self.h_array = cp.cuda.alloc_pinned_memory( + self.m * self.n * np.dtype(dtype).itemsize + ) + self.h_array_np = np.frombuffer(self.h_array, dtype=dtype).reshape( + self.m, self.n + ) + + # Dictionary to store logical data handles for each block + self.handles = {} + + # Determine device layout + self.ndevs = cp.cuda.runtime.getDeviceCount() + self.grid_p, self.grid_q = self._compute_device_grid(self.ndevs) + + print(f"[{self.symbol}] {self.m}x{self.n} matrix, {self.mt}x{self.nt} blocks of {self.mb}x{self.nb}") + print(f"[{self.symbol}] Using {self.ndevs} devices in {self.grid_p}x{self.grid_q} grid") + + def _compute_device_grid(self, ndevs): + """Compute 2D device grid dimensions (as close to square as possible)""" + grid_p = 1 + grid_q = ndevs + for a in range(1, int(np.sqrt(ndevs)) + 1): + if ndevs % a == 0: + grid_p = a + grid_q = ndevs // a + return grid_p, grid_q + + def get_preferred_devid(self, row, col): + """Get preferred device ID for a given block using cyclic distribution""" + return (row % self.grid_p) + (col % self.grid_q) * self.grid_p + + def handle(self, row, col): + """Get the logical data handle for a block.""" + return self.handles[(row, col)] + + def block(self, row, col): 
+ """Get a BlockRef for block (row, col)""" + return BlockRef(self, row, col) + + def _get_index(self, row, col): + """Convert (row, col) to linear index in tiled storage""" + tile_row = row // self.mb + tile_col = col // self.nb + tile_size = self.mb * self.nb + tile_start = (tile_row + self.mt * tile_col) * tile_size + offset = (row % self.mb) + (col % self.nb) * self.mb + return tile_start + offset + + def _get_block_h(self, brow, bcol): + """Get a view of the host data for block (brow, bcol)""" + # For tiled storage, blocks are stored contiguously + start_idx = (brow + self.mt * bcol) * self.mb * self.nb + end_idx = start_idx + self.mb * self.nb + flat_view = self.h_array_np.ravel() + return flat_view[start_idx:end_idx].reshape(self.mb, self.nb) + + def fill(self, func): + """ + Fill the matrix blocks using a function func(row, col) -> value. + Creates STF logical data from host arrays and lets STF handle transfers. + """ + print(f"[{self.symbol}] Filling matrix on host...") + for colb in range(self.nt): + low_rowb = colb if self.sym_matrix else 0 + for rowb in range(low_rowb, self.mt): + # Fill host block + h_block = self._get_block_h(rowb, colb) + for lrow in range(self.mb): + for lcol in range(self.nb): + row = lrow + rowb * self.mb + col = lcol + colb * self.nb + h_block[lrow, lcol] = func(row, col) + + handle = self.ctx.logical_data(h_block) + handle.set_symbol(f"{self.symbol}_{rowb}_{colb}") + + self.handles[(rowb, colb)] = handle + + +# ============================================================================ +# Block-level operations (BLAS/LAPACK) +# ============================================================================ + +def DPOTRF(ctx, a): + """Cholesky factorization of a diagonal block: A = L*L^T (lower triangular)""" + with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t: + d_block = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + d_block[:] = cp.linalg.cholesky(d_block) + + +def DTRSM(ctx, a, b, side='L', uplo='L', transa='N', diag='N', alpha=1.0): + """Triangular solve: B = alpha * op(A)^(-1) * B""" + with ctx.task(stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw()) as t: + d_a, d_b = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + lower = (uplo == 'L') + trans = (transa != 'N') + result = cp_linalg.solve_triangular(d_a, d_b, lower=lower, trans=trans) + if alpha != 1.0: + d_b[:] = alpha * result + else: + d_b[:] = result + + +def DTRTRI(ctx, a, uplo='L', diag='N'): + """Triangular matrix inversion: A = A^(-1)""" + with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t: + d_block = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + lower = (uplo == 'L') + unit_diagonal = (diag == 'U') + # CuPy doesn't have trtri directly, use solve with identity + n = d_block.shape[0] + identity = cp.eye(n, dtype=d_block.dtype) + d_block[:] = cp_linalg.solve_triangular(d_block, identity, lower=lower, unit_diagonal=unit_diagonal) + + +def DGEMM(ctx, a, b, c, transa='N', transb='N', alpha=1.0, beta=1.0): + """General matrix multiplication: C = alpha * op(A) * op(B) + beta * C""" + with ctx.task(stf.exec_place.device(c.devid()), a.handle().read(), b.handle().read(), c.handle().rw()) as t: + d_a, d_b, d_c = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + op_a = d_a.T if transa != 'N' else d_a + op_b = d_b.T if transb != 'N' else d_b + + if beta == 0.0: + d_c[:] = alpha * (op_a @ op_b) + elif beta == 1.0: + d_c[:] += alpha * (op_a @ op_b) + else: + d_c[:] = 
alpha * (op_a @ op_b) + beta * d_c + + +def DSYRK(ctx, a, c, uplo='L', trans='N', alpha=1.0, beta=1.0): + """Symmetric rank-k update: C = alpha * op(A) @ op(A).T + beta * C""" + with ctx.task(stf.exec_place.device(c.devid()), a.handle().read(), c.handle().rw()) as t: + d_a, d_c = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + op_a = d_a.T if trans != 'N' else d_a + + if beta == 0.0: + d_c[:] = alpha * (op_a @ op_a.T) + elif beta == 1.0: + d_c[:] += alpha * (op_a @ op_a.T) + else: + d_c[:] = alpha * (op_a @ op_a.T) + beta * d_c + + +def DTRMM(ctx, a, b, side='L', uplo='L', transa='N', diag='N', alpha=1.0): + """Triangular matrix multiplication: B = alpha * op(A) * B (side='L') or B = alpha * B * op(A) (side='R')""" + with ctx.task(stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw()) as t: + d_a, d_b = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + lower = (uplo == 'L') + trans = (transa != 'N') + + # Extract triangle from A + if lower: + tri_a = cp.tril(d_a) + else: + tri_a = cp.triu(d_a) + + if trans: + tri_a = tri_a.T + + if side == 'L': + d_b[:] = alpha * (tri_a @ d_b) + else: # side == 'R' + d_b[:] = alpha * (d_b @ tri_a) + + +def DSYMM(ctx, a, b, c, side='L', uplo='L', alpha=1.0, beta=1.0): + """Symmetric matrix multiplication: C = alpha * A * B + beta * C (side='L') or C = alpha * B * A + beta * C (side='R') + where A is symmetric.""" + with ctx.task(stf.exec_place.device(c.devid()), a.handle().read(), b.handle().read(), c.handle().rw()) as t: + d_a, d_b, d_c = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + # Reconstruct full symmetric matrix from lower/upper triangle + if uplo == 'L': + # Lower triangle is stored + sym_a = cp.tril(d_a) + cp.tril(d_a, -1).T + else: + # Upper triangle is stored + sym_a = cp.triu(d_a) + cp.triu(d_a, 1).T + + if side == 'L': + result = alpha * (sym_a @ d_b) + else: # side == 'R' + result = alpha * (d_b @ sym_a) + + if beta == 0.0: + d_c[:] = result + elif beta == 1.0: + d_c[:] += result + else: + d_c[:] = result + beta * d_c + + +# ============================================================================ +# Tiled operations +# ============================================================================ + +def PDPOTRF(ctx, A, uplo='L'): + """Parallel tiled Cholesky factorization""" + print(f"\n[PDPOTRF] Starting Cholesky factorization...") + assert uplo == 'L', "Only lower triangular factorization supported" + + for k in range(A.nt): + # Factorize diagonal block + DPOTRF(ctx, A.block(k, k)) + + # Update column below diagonal + for m in range(k + 1, A.mt): + DTRSM(ctx, A.block(k, k), A.block(m, k), side='R', uplo='L', transa='T', diag='N', alpha=1.0) + + # Update trailing submatrix + for n in range(k + 1, A.nt): + DSYRK(ctx, A.block(n, k), A.block(n, n), uplo='L', trans='N', alpha=-1.0, beta=1.0) + + for m in range(n + 1, A.mt): + DGEMM(ctx, A.block(m, k), A.block(n, k), A.block(m, n), transa='N', transb='T', alpha=-1.0, beta=1.0) + + print(f"[PDPOTRF] Completed") + + +def PDTRTRI(ctx, A, uplo='L', diag='N'): + """Parallel tiled triangular matrix inversion""" + print(f"\n[PDTRTRI] Starting triangular inversion...") + assert uplo == 'L', "Only lower triangular inversion supported" + + for k in range(A.nt): + # Step 1: Update A[m,k] for m > k + for m in range(k + 1, A.mt): + DTRSM(ctx, A.block(k, k), A.block(m, k), side='R', uplo='L', transa='N', diag=diag, alpha=-1.0) + + # Step 2: Update A[m,n] for m > k, n < k + for m in range(k + 1, A.mt): + for n in range(k): + DGEMM(ctx, 
A.block(m, k), A.block(k, n), A.block(m, n), transa='N', transb='N', alpha=1.0, beta=1.0) + + # Step 3: Update A[k,n] for n < k + for n in range(k): + DTRSM(ctx, A.block(k, k), A.block(k, n), side='L', uplo='L', transa='N', diag=diag, alpha=1.0) + + # Step 4: Invert diagonal block A[k,k] + DTRTRI(ctx, A.block(k, k), uplo=uplo, diag=diag) + + print(f"[PDTRTRI] Completed") + + +def DLAAUM(ctx, a, uplo='L'): + """Compute A^T * A for a triangular block (lauum operation)""" + with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t: + d_block = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + # lauum: compute L * L^T for lower triangular L + if uplo == 'L': + L = cp.tril(d_block) + d_block[:] = L @ L.T + else: + U = cp.triu(d_block) + d_block[:] = U.T @ U + + +def PDLAUUM(ctx, A, uplo='L'): + """Parallel tiled computation of A^T * A for lower triangular A""" + print(f"\n[PDLAUUM] Starting LAUUM (A^T * A)...") + assert uplo == 'L', "Only lower triangular LAUUM supported" + + for k in range(A.mt): + # Step 1: Update off-diagonal blocks + for n in range(k): + # Update A[n,n] with A[k,n]^T * A[k,n] + DSYRK(ctx, A.block(k, n), A.block(n, n), uplo='L', trans='T', alpha=1.0, beta=1.0) + + # Update A[m,n] with A[k,m]^T * A[k,n] + for m in range(n + 1, k): + DGEMM(ctx, A.block(k, m), A.block(k, n), A.block(m, n), transa='T', transb='N', alpha=1.0, beta=1.0) + + # Step 2: Update A[k,n] = A[k,k]^T * A[k,n] + for n in range(k): + DTRMM(ctx, A.block(k, k), A.block(k, n), side='L', uplo='L', transa='T', diag='N', alpha=1.0) + + # Step 3: Update diagonal block A[k,k] = A[k,k]^T * A[k,k] + DLAAUM(ctx, A.block(k, k), uplo=uplo) + + print(f"[PDLAUUM] Completed") + + +def PDGEMM(ctx, A, B, C, transa='N', transb='N', alpha=1.0, beta=1.0): + """Parallel tiled matrix multiplication""" + print(f"\n[PDGEMM] Starting matrix multiplication...") + + for m in range(C.mt): + for n in range(C.nt): + inner_k = A.nt if transa == 'N' else A.mt + + if alpha == 0.0 or inner_k == 0: + # Just scale C + DGEMM(ctx, A.block(0, 0), B.block(0, 0), C.block(m, n), transa=transa, transb=transb, alpha=0.0, beta=beta) + elif transa == 'N': + if transb == 'N': + for k in range(A.nt): + zbeta = beta if k == 0 else 1.0 + DGEMM(ctx, A.block(m, k), B.block(k, n), C.block(m, n), transa='N', transb='N', alpha=alpha, beta=zbeta) + else: + for k in range(A.nt): + zbeta = beta if k == 0 else 1.0 + DGEMM(ctx, A.block(m, k), B.block(n, k), C.block(m, n), transa='N', transb='T', alpha=alpha, beta=zbeta) + else: # transa in ['T', 'C'] + if transb == 'N': + for k in range(A.mt): + zbeta = beta if k == 0 else 1.0 + DGEMM(ctx, A.block(k, m), B.block(k, n), C.block(m, n), transa='T', transb='N', alpha=alpha, beta=zbeta) + else: + for k in range(A.mt): + zbeta = beta if k == 0 else 1.0 + DGEMM(ctx, A.block(k, m), B.block(n, k), C.block(m, n), transa='T', transb='T', alpha=alpha, beta=zbeta) + + print(f"[PDGEMM] Completed") + + +def PDSYMM(ctx, A, B, C, side='L', uplo='L', alpha=1.0, beta=1.0): + """Parallel tiled symmetric matrix multiplication""" + print(f"\n[PDSYMM] Starting symmetric matrix multiplication...") + + for m in range(C.mt): + for n in range(C.nt): + if side == 'L': + if uplo == 'L': + for k in range(C.mt): + zbeta = beta if k == 0 else 1.0 + if k < m: + DGEMM(ctx, A.block(m, k), B.block(k, n), C.block(m, n), transa='N', transb='N', alpha=alpha, beta=zbeta) + else: + if k == m: + DSYMM(ctx, A.block(k, k), B.block(k, n), C.block(m, n), side=side, uplo=uplo, alpha=alpha, beta=zbeta) + else: + DGEMM(ctx, A.block(k, 
m), B.block(k, n), C.block(m, n), transa='T', transb='N', alpha=alpha, beta=zbeta) + else: # side == 'R' + # Similar logic for right multiplication + pass + + print(f"[PDSYMM] Completed") + + +def compute_norm(ctx, matrix): + """Compute Frobenius norm of matrix using host tasks""" + norm_sq = 0.0 + + for colb in range(matrix.nt): + low_rowb = colb if matrix.sym_matrix else 0 + for rowb in range(low_rowb, matrix.mt): + handle = matrix.handle(rowb, colb) + + # Host task to read the block and compute norm + def compute_block_norm(h_block): + nonlocal norm_sq + norm_sq += np.sum(h_block * h_block) + + with ctx.task(stf.exec_place.host(), handle.read()) as t: + # Synchronize the stream before reading data + cp.cuda.runtime.streamSynchronize(t.stream_ptr()) + + h_block = cai_to_numpy(t.get_arg_cai(0)) + compute_block_norm(h_block) + + return np.sqrt(norm_sq) + + +def main(): + import argparse + + parser = argparse.ArgumentParser(description='Tiled POTRI (matrix inversion via Cholesky) with CUDA STF') + parser.add_argument('N', type=int, nargs='?', default=512, help='Matrix size (default: 512)') + parser.add_argument('NB', type=int, nargs='?', default=128, help='Block size (default: 128)') + parser.add_argument('--check', action='store_true', help='Check result (slower)') + args = parser.parse_args() + + N = args.N + NB = args.NB + check_result = args.check + + assert N % NB == 0, f"Matrix size {N} must be divisible by block size {NB}" + + print("="*60) + print("Tiled POTRI (Matrix Inversion) with CUDA STF + CuPy") + print("="*60) + print(f"Matrix size: {N}x{N}") + print(f"Block size: {NB}x{NB}") + print(f"Number of blocks: {N//NB}x{N//NB}") + print(f"Check result: {check_result}") + print("="*60) + + # Create STF context + ctx = stf.context() + + # Create matrices + A = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=True, symbol="A") + + if check_result: + Aref = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=False, symbol="Aref") + + print("\n" + "="*60) + print("Initializing matrices...") + print("="*60) + + # Hilbert matrix + diagonal dominance for numerical stability + def hilbert(row, col): + return 1.0 / (col + row + 1.0) + 2.0 * N * (col == row) + + A.fill(hilbert) + if check_result: + Aref.fill(hilbert) + + # Measure performance + import time + start_time = time.time() + + print("\n" + "="*60) + print("Performing POTRI (inversion via Cholesky)...") + print("="*60) + + # Step 1: Cholesky factorization A = L*L^T + PDPOTRF(ctx, A, uplo='L') + + # Step 2: Triangular inversion L^(-1) + PDTRTRI(ctx, A, uplo='L', diag='N') + + # Step 3: Compute A^(-1) = L^(-T) * L^(-1) + PDLAUUM(ctx, A, uplo='L') + + if check_result: + print("\n" + "="*60) + print("Verifying result...") + print("="*60) + + # Create test vector B + B_potri = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="B_potri") + Bref_potri = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="Bref_potri") + + def rhs_vals(row, col): + return 1.0 * (row + 1) + + B_potri.fill(rhs_vals) + Bref_potri.fill(rhs_vals) + + # Compute norm of B + b_norm = compute_norm(ctx, Bref_potri) + + # Create temporary matrix for result + B_tmp = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="B_tmp") + + def zero_vals(row, col): + return 0.0 + + B_tmp.fill(zero_vals) + + # Compute B_tmp = A^(-1) * B + PDSYMM(ctx, A, B_potri, B_tmp, side='L', uplo='L', alpha=1.0, beta=0.0) + + # Compute residual: Bref = Aref * B_tmp - Bref + PDGEMM(ctx, Aref, B_tmp, Bref_potri, transa='N', transb='N', alpha=1.0, beta=-1.0) + + # Compute residual norm + 
res_norm = compute_norm(ctx, Bref_potri) + + print("\n" + "="*60) + print("Finalizing STF context...") + print("="*60) + ctx.finalize() + + end_time = time.time() + elapsed_ms = (end_time - start_time) * 1000.0 + + # Compute FLOPS for POTRI + # POTRF: (1/3) * N^3 + # TRTRI: (1/3) * N^3 + # LAUUM: (1/3) * N^3 + # Total: N^3 + flops = float(N) ** 3 + gflops = flops / (elapsed_ms / 1000.0) / 1e9 + + print("\n" + "="*60) + print("Results") + print("="*60) + print(f"[POTRI] Elapsed time: {elapsed_ms:.2f} ms") + print(f"[POTRI] Performance: {gflops:.2f} GFLOPS") + + if check_result: + residual = res_norm / b_norm + print(f"\n[POTRI] ||A * (A^(-1) * B) - B||: {res_norm:.6e}") + print(f"[POTRI] ||B||: {b_norm:.6e}") + print(f"[POTRI] Residual (||A * (A^(-1) * B) - B||/||B||): {residual:.6e}") + + if residual < 0.01: + print("\n✅ Algorithm converged successfully!") + return 0 + else: + print(f"\n❌ Algorithm did not converge (residual {residual:.6e} >= 0.01)") + return 1 + + print("="*60) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) + From 80e1085ce4f3507df4e2e5ef9dc250c87f0e399f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 9 Oct 2025 15:38:30 +0200 Subject: [PATCH 198/221] clang-format --- .../cuda_cccl/tests/stf/example_cholesky.py | 229 +++++++----------- 1 file changed, 82 insertions(+), 147 deletions(-) diff --git a/python/cuda_cccl/tests/stf/example_cholesky.py b/python/cuda_cccl/tests/stf/example_cholesky.py index faef64d1d73..1dac4b5a4fa 100755 --- a/python/cuda_cccl/tests/stf/example_cholesky.py +++ b/python/cuda_cccl/tests/stf/example_cholesky.py @@ -71,6 +71,28 @@ def cai_to_numpy(cai_dict): return arr +class BlockRef: + """Reference to a specific block in a tiled matrix.""" + + def __init__(self, matrix, row, col): + self.matrix = matrix + self.row = row + self.col = col + self._handle = matrix.handle(row, col) + self._devid = matrix.get_preferred_devid(row, col) + + def handle(self): + """Get the STF logical data handle for this block.""" + return self._handle + + def devid(self): + """Get the preferred device ID for this block.""" + return self._devid + + def __repr__(self): + return f"BlockRef({self.matrix.symbol}[{self.row},{self.col}])" + + class TiledMatrix: """ Tiled matrix class that splits a matrix into blocks for parallel processing. 
@@ -165,6 +187,10 @@ def handle(self, row, col): """Get the logical data handle for block (row, col)""" return self.handles[(row, col)] + def block(self, row, col): + """Get a BlockRef for block (row, col)""" + return BlockRef(self, row, col) + def _get_index(self, row, col): """Convert (row, col) to linear index in tiled storage""" # Find which tile contains this element @@ -213,37 +239,19 @@ def fill(self, func): # BLAS/LAPACK operations wrapped in STF tasks -def DPOTRF(ctx, A, row, col): - """Cholesky factorization of block (row, col) using CUSOLVER""" - handle = A.handle(row, col) - devid = A.get_preferred_devid(row, col) - - with ctx.task(stf.exec_place.device(devid), handle.rw()) as t: +def DPOTRF(ctx, a): + """Cholesky factorization of a diagonal block: A = L*L^T (lower triangular)""" + with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t: d_block = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): d_block[:] = cp.linalg.cholesky(d_block) -def DTRSM( - ctx, - A, - a_row, - a_col, - B, - b_row, - b_col, - side="L", - uplo="L", - transa="T", - diag="N", - alpha=1.0, -): +def DTRSM(ctx, a, b, side="L", uplo="L", transa="T", diag="N", alpha=1.0): """Triangular solve: B = alpha * op(A)^{-1} @ B or B = alpha * B @ op(A)^{-1}""" - handle_a = A.handle(a_row, a_col) - handle_b = B.handle(b_row, b_col) - devid = B.get_preferred_devid(b_row, b_col) - - with ctx.task(stf.exec_place.device(devid), handle_a.read(), handle_b.rw()) as t: + with ctx.task( + stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw() + ) as t: d_a, d_b = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): if side == "L": @@ -266,30 +274,13 @@ def DTRSM( d_b *= alpha -def DGEMM( - ctx, - A, - a_row, - a_col, - B, - b_row, - b_col, - C, - c_row, - c_col, - transa="N", - transb="N", - alpha=1.0, - beta=1.0, -): +def DGEMM(ctx, a, b, c, transa="N", transb="N", alpha=1.0, beta=1.0): """Matrix multiplication: C = alpha * op(A) @ op(B) + beta * C""" - handle_a = A.handle(a_row, a_col) - handle_b = B.handle(b_row, b_col) - handle_c = C.handle(c_row, c_col) - devid = C.get_preferred_devid(c_row, c_col) - with ctx.task( - stf.exec_place.device(devid), handle_a.read(), handle_b.read(), handle_c.rw() + stf.exec_place.device(c.devid()), + a.handle().read(), + b.handle().read(), + c.handle().rw(), ) as t: d_a, d_b, d_c = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): @@ -304,15 +295,11 @@ def DGEMM( d_c[:] = alpha * (op_a @ op_b) + beta * d_c -def DSYRK( - ctx, A, a_row, a_col, C, c_row, c_col, uplo="L", trans="N", alpha=1.0, beta=1.0 -): +def DSYRK(ctx, a, c, uplo="L", trans="N", alpha=1.0, beta=1.0): """Symmetric rank-k update: C = alpha * op(A) @ op(A).T + beta * C""" - handle_a = A.handle(a_row, a_col) - handle_c = C.handle(c_row, c_col) - devid = C.get_preferred_devid(c_row, c_col) - - with ctx.task(stf.exec_place.device(devid), handle_a.read(), handle_c.rw()) as t: + with ctx.task( + stf.exec_place.device(c.devid()), a.handle().read(), c.handle().rw() + ) as t: d_a, d_c = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): op_a = d_a.T if trans != "N" else d_a @@ -340,18 +327,14 @@ def PDPOTRF(ctx, A): for k in range(nblocks): # Factor diagonal block - DPOTRF(ctx, A, k, k) + DPOTRF(ctx, A.block(k, k)) # Solve triangular systems for blocks in column k for row in range(k + 1, nblocks): DTRSM( ctx, - A, - k, - k, - A, - row, - k, + A.block(k, k), + A.block(row, k), side="R", uplo="L", transa="T", @@ -363,15 +346,9 @@ def PDPOTRF(ctx, A): for col in 
range(k + 1, row): DGEMM( ctx, - A, - row, - k, - A, - col, - k, - A, - row, - col, + A.block(row, k), + A.block(col, k), + A.block(row, col), transa="N", transb="T", alpha=-1.0, @@ -380,7 +357,13 @@ def PDPOTRF(ctx, A): # Symmetric rank-k update of diagonal block DSYRK( - ctx, A, row, k, A, row, row, uplo="L", trans="N", alpha=-1.0, beta=1.0 + ctx, + A.block(row, k), + A.block(row, row), + uplo="L", + trans="N", + alpha=-1.0, + beta=1.0, ) print("[PDPOTRF] Completed") @@ -399,12 +382,8 @@ def PDTRSM(ctx, A, B, side="L", uplo="L", trans="N", diag="N", alpha=1.0): for n in range(B.nt): DTRSM( ctx, - A, - k, - k, - B, - k, - n, + A.block(k, k), + B.block(k, n), side="L", uplo="L", transa="N", @@ -415,15 +394,9 @@ def PDTRSM(ctx, A, B, side="L", uplo="L", trans="N", diag="N", alpha=1.0): for n in range(B.nt): DGEMM( ctx, - A, - m, - k, - B, - k, - n, - B, - m, - n, + A.block(m, k), + B.block(k, n), + B.block(m, n), transa="N", transb="N", alpha=-1.0, @@ -433,15 +406,12 @@ def PDTRSM(ctx, A, B, side="L", uplo="L", trans="N", diag="N", alpha=1.0): # Backward substitution for k in range(B.mt): lalpha = alpha if k == 0 else 1.0 + row_idx = B.mt - k - 1 for n in range(B.nt): DTRSM( ctx, - A, - B.mt - k - 1, - B.mt - k - 1, - B, - B.mt - k - 1, - n, + A.block(row_idx, row_idx), + B.block(row_idx, n), side="L", uplo="L", transa="T", @@ -449,18 +419,13 @@ def PDTRSM(ctx, A, B, side="L", uplo="L", trans="N", diag="N", alpha=1.0): alpha=lalpha, ) for m in range(k + 1, B.mt): + m_idx = B.mt - 1 - m for n in range(B.nt): DGEMM( ctx, - A, - B.mt - k - 1, - B.mt - 1 - m, - B, - B.mt - k - 1, - n, - B, - B.mt - 1 - m, - n, + A.block(row_idx, m_idx), + B.block(row_idx, n), + B.block(m_idx, n), transa="T", transb="N", alpha=-1.0, @@ -513,15 +478,9 @@ def PDGEMM(ctx, A, B, C, transa="N", transb="N", alpha=1.0, beta=1.0): # Just scale C DGEMM( ctx, - A, - 0, - 0, - B, - 0, - 0, - C, - m, - n, + A.block(0, 0), + B.block(0, 0), + C.block(m, n), transa=transa, transb=transb, alpha=0.0, @@ -533,15 +492,9 @@ def PDGEMM(ctx, A, B, C, transa="N", transb="N", alpha=1.0, beta=1.0): zbeta = beta if k == 0 else 1.0 DGEMM( ctx, - A, - m, - k, - B, - k, - n, - C, - m, - n, + A.block(m, k), + B.block(k, n), + C.block(m, n), transa="N", transb="N", alpha=alpha, @@ -552,15 +505,9 @@ def PDGEMM(ctx, A, B, C, transa="N", transb="N", alpha=1.0, beta=1.0): zbeta = beta if k == 0 else 1.0 DGEMM( ctx, - A, - m, - k, - B, - n, - k, - C, - m, - n, + A.block(m, k), + B.block(n, k), + C.block(m, n), transa="N", transb="T", alpha=alpha, @@ -572,15 +519,9 @@ def PDGEMM(ctx, A, B, C, transa="N", transb="N", alpha=1.0, beta=1.0): zbeta = beta if k == 0 else 1.0 DGEMM( ctx, - A, - k, - m, - B, - k, - n, - C, - m, - n, + A.block(k, m), + B.block(k, n), + C.block(m, n), transa="T", transb="N", alpha=alpha, @@ -591,15 +532,9 @@ def PDGEMM(ctx, A, B, C, transa="N", transb="N", alpha=1.0, beta=1.0): zbeta = beta if k == 0 else 1.0 DGEMM( ctx, - A, - k, - m, - B, - n, - k, - C, - m, - n, + A.block(k, m), + B.block(n, k), + C.block(m, n), transa="T", transb="T", alpha=alpha, From 4c1551ab80c84fc50895c500f6cb2b5334c4a402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 9 Oct 2025 21:45:21 +0200 Subject: [PATCH 199/221] how changes to numba-cuda have been merged --- python/cuda_cccl/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index 14561449098..f8dd51338d8 100644 --- a/python/cuda_cccl/pyproject.toml +++ 
b/python/cuda_cccl/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "numpy", "cuda-pathfinder>=1.2.3", "cuda-core", - "numba-cuda @ git+https://github.com/caugonnet/numba-cuda.git@cuda_graph_future_memory", + "numba-cuda @ git+https://github.com/NVIDIA/numba-cuda.git@main", ] dynamic = ["version"] From de333b28f2599deb8a61dd7a416633c3700da6c6 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Fri, 14 Nov 2025 13:48:06 -0500 Subject: [PATCH 200/221] Fix CI precommit --- python/cuda_cccl/CMakeLists.txt | 63 +- .../cuda_cccl/tests/stf/example_cholesky.py | 2 +- python/cuda_cccl/tests/stf/example_potri.py | 595 ++++++++++++------ 3 files changed, 442 insertions(+), 218 deletions(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 2c8cc0e43e4..e8ce363b6e7 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -25,10 +25,10 @@ cccl_build_compiler_targets() # Build and install C++ library first set(CCCL_ENABLE_C_PARALLEL ON) -set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) # Enable C experimental STF library (triggers c/ directory) -set(CCCL_C_PARALLEL_ENABLE_TESTING OFF) # Testing belongs in CI, not Python build -set(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING OFF) # Testing belongs in CI, not Python build -set(CCCL_ENABLE_UNSTABLE ON) # Enable unstable features +set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) # Enable C experimental STF library (triggers c/ directory) +set(CCCL_C_PARALLEL_ENABLE_TESTING OFF) # Testing belongs in CI, not Python build +set(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING OFF) # Testing belongs in CI, not Python build +set(CCCL_ENABLE_UNSTABLE ON) # Enable unstable features set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) @@ -60,13 +60,13 @@ file(MAKE_DIRECTORY "cuda/compute/${CUDA_VERSION_DIR}/cccl") # Install version-specific binaries install( - TARGETS cccl.c.experimental.stf - DESTINATION cuda/stf/${CUDA_VERSION_DIR}/cccl + TARGETS cccl.c.experimental.stf + DESTINATION cuda/stf/${CUDA_VERSION_DIR}/cccl ) install( - TARGETS cccl.c.parallel - DESTINATION cuda/compute/${CUDA_VERSION_DIR}/cccl + TARGETS cccl.c.parallel + DESTINATION cuda/compute/${CUDA_VERSION_DIR}/cccl ) # Build and install Cython extension @@ -138,20 +138,30 @@ add_custom_target( ) message(STATUS "STF Using Cython ${CYTHON_VERSION}") -set(stf_pyx_source_file "${cuda_cccl_SOURCE_DIR}/cuda/stf/_stf_bindings_impl.pyx") +set( + stf_pyx_source_file + "${cuda_cccl_SOURCE_DIR}/cuda/stf/_stf_bindings_impl.pyx" +) set(_stf_generated_extension_src "${cuda_cccl_BINARY_DIR}/_stf_bindings_impl.c") set(_stf_depfile "${cuda_cccl_BINARY_DIR}/_stf_bindings_impl.c.dep") add_custom_command( - OUTPUT "${_stf_generated_extension_src}" - COMMAND "${Python3_EXECUTABLE}" -m cython - ARGS ${CYTHON_FLAGS_LIST} "${stf_pyx_source_file}" --output-file ${_stf_generated_extension_src} - DEPENDS "${stf_pyx_source_file}" - DEPFILE "${_stf_depfile}" - COMMENT "Cythonizing ${pyx_source_file} for CUDA ${CUDA_VERSION_MAJOR}" + OUTPUT "${_stf_generated_extension_src}" + COMMAND "${Python3_EXECUTABLE}" -m cython + ARGS + ${CYTHON_FLAGS_LIST} "${stf_pyx_source_file}" --output-file + ${_stf_generated_extension_src} + DEPENDS "${stf_pyx_source_file}" + DEPFILE "${_stf_depfile}" + COMMENT "Cythonizing ${pyx_source_file} for CUDA ${CUDA_VERSION_MAJOR}" ) -set_source_files_properties("${_stf_generated_extension_src}" PROPERTIES GENERATED TRUE) -add_custom_target(cythonize_stf_bindings_impl ALL - 
DEPENDS "${_stf_generated_extension_src}" +set_source_files_properties( + "${_stf_generated_extension_src}" + PROPERTIES GENERATED TRUE +) +add_custom_target( + cythonize_stf_bindings_impl + ALL + DEPENDS "${_stf_generated_extension_src}" ) python3_add_library( @@ -170,10 +180,21 @@ target_link_libraries( ) set_target_properties(_bindings_impl PROPERTIES INSTALL_RPATH "$ORIGIN/cccl") -Python3_add_library(_stf_bindings_impl MODULE WITH_SOABI "${_stf_generated_extension_src}") +python3_add_library( + _stf_bindings_impl + MODULE + WITH_SOABI + "${_stf_generated_extension_src}" +) add_dependencies(_stf_bindings_impl cythonize_stf_bindings_impl) -target_link_libraries(_stf_bindings_impl PRIVATE cccl.c.experimental.stf CUDA::cuda_driver) -set_target_properties(_stf_bindings_impl PROPERTIES INSTALL_RPATH "$ORIGIN/cccl") +target_link_libraries( + _stf_bindings_impl + PRIVATE cccl.c.experimental.stf CUDA::cuda_driver +) +set_target_properties( + _stf_bindings_impl + PROPERTIES INSTALL_RPATH "$ORIGIN/cccl" +) install(TARGETS _stf_bindings_impl DESTINATION cuda/stf/${CUDA_VERSION_DIR}) install(TARGETS _bindings_impl DESTINATION cuda/compute/${CUDA_VERSION_DIR}) diff --git a/python/cuda_cccl/tests/stf/example_cholesky.py b/python/cuda_cccl/tests/stf/example_cholesky.py index 1dac4b5a4fa..7eded4a20b7 100755 --- a/python/cuda_cccl/tests/stf/example_cholesky.py +++ b/python/cuda_cccl/tests/stf/example_cholesky.py @@ -41,7 +41,7 @@ def get_cupy_arrays(task): try: arrays.append(cp.asarray(CAIWrapper(task.get_arg_cai(idx)))) idx += 1 - except: + except Exception: break return tuple(arrays) if len(arrays) > 1 else arrays[0] if arrays else None diff --git a/python/cuda_cccl/tests/stf/example_potri.py b/python/cuda_cccl/tests/stf/example_potri.py index 0be70fc9338..1e3c721a9c1 100644 --- a/python/cuda_cccl/tests/stf/example_potri.py +++ b/python/cuda_cccl/tests/stf/example_potri.py @@ -15,14 +15,17 @@ """ import sys -import numpy as np + import cupy as cp +import numpy as np from cupyx.scipy import linalg as cp_linalg + import cuda.stf as stf class CAIWrapper: """Wrapper to expose CUDA Array Interface dict as a proper CAI object.""" + def __init__(self, cai_dict): self.__cuda_array_interface__ = cai_dict @@ -30,7 +33,7 @@ def __init__(self, cai_dict): def get_cupy_arrays(task): """ Get all CuPy arrays from STF task arguments. 
- + Usage: d_a, d_b, d_c = get_cupy_arrays(t) """ @@ -40,7 +43,7 @@ def get_cupy_arrays(task): try: arrays.append(cp.asarray(CAIWrapper(task.get_arg_cai(idx)))) idx += 1 - except: + except Exception: break return tuple(arrays) if len(arrays) > 1 else arrays[0] if arrays else None @@ -48,45 +51,46 @@ def get_cupy_arrays(task): def cai_to_numpy(cai_dict): """Convert CUDA Array Interface dict to NumPy array (for host memory).""" import ctypes - + # Extract CAI fields - data_ptr, readonly = cai_dict['data'] - shape = cai_dict['shape'] - typestr = cai_dict['typestr'] - + data_ptr, readonly = cai_dict["data"] + shape = cai_dict["shape"] + typestr = cai_dict["typestr"] + # Convert typestr to NumPy dtype dtype = np.dtype(typestr) - + # Calculate total size in bytes itemsize = dtype.itemsize size = np.prod(shape) * itemsize - + # Create ctypes buffer from pointer buffer = (ctypes.c_byte * size).from_address(data_ptr) - + # Create NumPy array from buffer arr = np.frombuffer(buffer, dtype=dtype).reshape(shape) - + return arr class BlockRef: """Reference to a specific block in a tiled matrix.""" + def __init__(self, matrix, row, col): self.matrix = matrix self.row = row self.col = col self._handle = matrix.handle(row, col) self._devid = matrix.get_preferred_devid(row, col) - + def handle(self): """Get the STF logical data handle for this block.""" return self._handle - + def devid(self): """Get the preferred device ID for this block.""" return self._devid - + def __repr__(self): return f"BlockRef({self.matrix.symbol}[{self.row},{self.col}])" @@ -97,25 +101,39 @@ class TiledMatrix: Each block is managed as an STF logical data object. Uses tiled storage format for contiguous blocks. """ - def __init__(self, ctx, nrows, ncols, blocksize_rows, blocksize_cols, - is_symmetric=False, symbol="matrix", dtype=np.float64): + + def __init__( + self, + ctx, + nrows, + ncols, + blocksize_rows, + blocksize_cols, + is_symmetric=False, + symbol="matrix", + dtype=np.float64, + ): self.ctx = ctx self.symbol = symbol self.dtype = dtype self.sym_matrix = is_symmetric - + self.m = nrows self.n = ncols self.mb = blocksize_rows self.nb = blocksize_cols - - assert self.m % self.mb == 0, f"nrows {nrows} must be divisible by blocksize_rows {blocksize_rows}" - assert self.n % self.nb == 0, f"ncols {ncols} must be divisible by blocksize_cols {blocksize_cols}" - + + assert self.m % self.mb == 0, ( + f"nrows {nrows} must be divisible by blocksize_rows {blocksize_rows}" + ) + assert self.n % self.nb == 0, ( + f"ncols {ncols} must be divisible by blocksize_cols {blocksize_cols}" + ) + # Number of blocks self.mt = self.m // self.mb self.nt = self.n // self.nb - + # Allocate pinned host memory for faster transfers (in tiled format) self.h_array = cp.cuda.alloc_pinned_memory( self.m * self.n * np.dtype(dtype).itemsize @@ -123,17 +141,21 @@ def __init__(self, ctx, nrows, ncols, blocksize_rows, blocksize_cols, self.h_array_np = np.frombuffer(self.h_array, dtype=dtype).reshape( self.m, self.n ) - + # Dictionary to store logical data handles for each block self.handles = {} - + # Determine device layout self.ndevs = cp.cuda.runtime.getDeviceCount() self.grid_p, self.grid_q = self._compute_device_grid(self.ndevs) - - print(f"[{self.symbol}] {self.m}x{self.n} matrix, {self.mt}x{self.nt} blocks of {self.mb}x{self.nb}") - print(f"[{self.symbol}] Using {self.ndevs} devices in {self.grid_p}x{self.grid_q} grid") - + + print( + f"[{self.symbol}] {self.m}x{self.n} matrix, {self.mt}x{self.nt} blocks of {self.mb}x{self.nb}" + ) + print( + 
f"[{self.symbol}] Using {self.ndevs} devices in {self.grid_p}x{self.grid_q} grid" + ) + def _compute_device_grid(self, ndevs): """Compute 2D device grid dimensions (as close to square as possible)""" grid_p = 1 @@ -143,19 +165,19 @@ def _compute_device_grid(self, ndevs): grid_p = a grid_q = ndevs // a return grid_p, grid_q - + def get_preferred_devid(self, row, col): """Get preferred device ID for a given block using cyclic distribution""" return (row % self.grid_p) + (col % self.grid_q) * self.grid_p - + def handle(self, row, col): """Get the logical data handle for a block.""" return self.handles[(row, col)] - + def block(self, row, col): """Get a BlockRef for block (row, col)""" return BlockRef(self, row, col) - + def _get_index(self, row, col): """Convert (row, col) to linear index in tiled storage""" tile_row = row // self.mb @@ -164,7 +186,7 @@ def _get_index(self, row, col): tile_start = (tile_row + self.mt * tile_col) * tile_size offset = (row % self.mb) + (col % self.nb) * self.mb return tile_start + offset - + def _get_block_h(self, brow, bcol): """Get a view of the host data for block (brow, bcol)""" # For tiled storage, blocks are stored contiguously @@ -172,7 +194,7 @@ def _get_block_h(self, brow, bcol): end_idx = start_idx + self.mb * self.nb flat_view = self.h_array_np.ravel() return flat_view[start_idx:end_idx].reshape(self.mb, self.nb) - + def fill(self, func): """ Fill the matrix blocks using a function func(row, col) -> value. @@ -189,10 +211,10 @@ def fill(self, func): row = lrow + rowb * self.mb col = lcol + colb * self.nb h_block[lrow, lcol] = func(row, col) - + handle = self.ctx.logical_data(h_block) handle.set_symbol(f"{self.symbol}_{rowb}_{colb}") - + self.handles[(rowb, colb)] = handle @@ -200,6 +222,7 @@ def fill(self, func): # Block-level operations (BLAS/LAPACK) # ============================================================================ + def DPOTRF(ctx, a): """Cholesky factorization of a diagonal block: A = L*L^T (lower triangular)""" with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t: @@ -208,13 +231,15 @@ def DPOTRF(ctx, a): d_block[:] = cp.linalg.cholesky(d_block) -def DTRSM(ctx, a, b, side='L', uplo='L', transa='N', diag='N', alpha=1.0): +def DTRSM(ctx, a, b, side="L", uplo="L", transa="N", diag="N", alpha=1.0): """Triangular solve: B = alpha * op(A)^(-1) * B""" - with ctx.task(stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw()) as t: + with ctx.task( + stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw() + ) as t: d_a, d_b = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): - lower = (uplo == 'L') - trans = (transa != 'N') + lower = uplo == "L" + trans = transa != "N" result = cp_linalg.solve_triangular(d_a, d_b, lower=lower, trans=trans) if alpha != 1.0: d_b[:] = alpha * result @@ -222,27 +247,34 @@ def DTRSM(ctx, a, b, side='L', uplo='L', transa='N', diag='N', alpha=1.0): d_b[:] = result -def DTRTRI(ctx, a, uplo='L', diag='N'): +def DTRTRI(ctx, a, uplo="L", diag="N"): """Triangular matrix inversion: A = A^(-1)""" with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t: d_block = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): - lower = (uplo == 'L') - unit_diagonal = (diag == 'U') + lower = uplo == "L" + unit_diagonal = diag == "U" # CuPy doesn't have trtri directly, use solve with identity n = d_block.shape[0] identity = cp.eye(n, dtype=d_block.dtype) - d_block[:] = cp_linalg.solve_triangular(d_block, identity, lower=lower, unit_diagonal=unit_diagonal) + 
d_block[:] = cp_linalg.solve_triangular( + d_block, identity, lower=lower, unit_diagonal=unit_diagonal + ) -def DGEMM(ctx, a, b, c, transa='N', transb='N', alpha=1.0, beta=1.0): +def DGEMM(ctx, a, b, c, transa="N", transb="N", alpha=1.0, beta=1.0): """General matrix multiplication: C = alpha * op(A) * op(B) + beta * C""" - with ctx.task(stf.exec_place.device(c.devid()), a.handle().read(), b.handle().read(), c.handle().rw()) as t: + with ctx.task( + stf.exec_place.device(c.devid()), + a.handle().read(), + b.handle().read(), + c.handle().rw(), + ) as t: d_a, d_b, d_c = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): - op_a = d_a.T if transa != 'N' else d_a - op_b = d_b.T if transb != 'N' else d_b - + op_a = d_a.T if transa != "N" else d_a + op_b = d_b.T if transb != "N" else d_b + if beta == 0.0: d_c[:] = alpha * (op_a @ op_b) elif beta == 1.0: @@ -251,13 +283,15 @@ def DGEMM(ctx, a, b, c, transa='N', transb='N', alpha=1.0, beta=1.0): d_c[:] = alpha * (op_a @ op_b) + beta * d_c -def DSYRK(ctx, a, c, uplo='L', trans='N', alpha=1.0, beta=1.0): +def DSYRK(ctx, a, c, uplo="L", trans="N", alpha=1.0, beta=1.0): """Symmetric rank-k update: C = alpha * op(A) @ op(A).T + beta * C""" - with ctx.task(stf.exec_place.device(c.devid()), a.handle().read(), c.handle().rw()) as t: + with ctx.task( + stf.exec_place.device(c.devid()), a.handle().read(), c.handle().rw() + ) as t: d_a, d_c = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): - op_a = d_a.T if trans != 'N' else d_a - + op_a = d_a.T if trans != "N" else d_a + if beta == 0.0: d_c[:] = alpha * (op_a @ op_a.T) elif beta == 1.0: @@ -266,48 +300,55 @@ def DSYRK(ctx, a, c, uplo='L', trans='N', alpha=1.0, beta=1.0): d_c[:] = alpha * (op_a @ op_a.T) + beta * d_c -def DTRMM(ctx, a, b, side='L', uplo='L', transa='N', diag='N', alpha=1.0): +def DTRMM(ctx, a, b, side="L", uplo="L", transa="N", diag="N", alpha=1.0): """Triangular matrix multiplication: B = alpha * op(A) * B (side='L') or B = alpha * B * op(A) (side='R')""" - with ctx.task(stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw()) as t: + with ctx.task( + stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw() + ) as t: d_a, d_b = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): - lower = (uplo == 'L') - trans = (transa != 'N') - + lower = uplo == "L" + trans = transa != "N" + # Extract triangle from A if lower: tri_a = cp.tril(d_a) else: tri_a = cp.triu(d_a) - + if trans: tri_a = tri_a.T - - if side == 'L': + + if side == "L": d_b[:] = alpha * (tri_a @ d_b) else: # side == 'R' d_b[:] = alpha * (d_b @ tri_a) -def DSYMM(ctx, a, b, c, side='L', uplo='L', alpha=1.0, beta=1.0): +def DSYMM(ctx, a, b, c, side="L", uplo="L", alpha=1.0, beta=1.0): """Symmetric matrix multiplication: C = alpha * A * B + beta * C (side='L') or C = alpha * B * A + beta * C (side='R') where A is symmetric.""" - with ctx.task(stf.exec_place.device(c.devid()), a.handle().read(), b.handle().read(), c.handle().rw()) as t: + with ctx.task( + stf.exec_place.device(c.devid()), + a.handle().read(), + b.handle().read(), + c.handle().rw(), + ) as t: d_a, d_b, d_c = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): # Reconstruct full symmetric matrix from lower/upper triangle - if uplo == 'L': + if uplo == "L": # Lower triangle is stored sym_a = cp.tril(d_a) + cp.tril(d_a, -1).T else: # Upper triangle is stored sym_a = cp.triu(d_a) + cp.triu(d_a, 1).T - - if side == 'L': + + if side == "L": result = alpha * (sym_a @ d_b) else: # side == 'R' result = 
alpha * (d_b @ sym_a) - + if beta == 0.0: d_c[:] = result elif beta == 1.0: @@ -320,61 +361,115 @@ def DSYMM(ctx, a, b, c, side='L', uplo='L', alpha=1.0, beta=1.0): # Tiled operations # ============================================================================ -def PDPOTRF(ctx, A, uplo='L'): + +def PDPOTRF(ctx, A, uplo="L"): """Parallel tiled Cholesky factorization""" - print(f"\n[PDPOTRF] Starting Cholesky factorization...") - assert uplo == 'L', "Only lower triangular factorization supported" - + print("\n[PDPOTRF] Starting Cholesky factorization...") + assert uplo == "L", "Only lower triangular factorization supported" + for k in range(A.nt): # Factorize diagonal block DPOTRF(ctx, A.block(k, k)) - + # Update column below diagonal for m in range(k + 1, A.mt): - DTRSM(ctx, A.block(k, k), A.block(m, k), side='R', uplo='L', transa='T', diag='N', alpha=1.0) - + DTRSM( + ctx, + A.block(k, k), + A.block(m, k), + side="R", + uplo="L", + transa="T", + diag="N", + alpha=1.0, + ) + # Update trailing submatrix for n in range(k + 1, A.nt): - DSYRK(ctx, A.block(n, k), A.block(n, n), uplo='L', trans='N', alpha=-1.0, beta=1.0) - + DSYRK( + ctx, + A.block(n, k), + A.block(n, n), + uplo="L", + trans="N", + alpha=-1.0, + beta=1.0, + ) + for m in range(n + 1, A.mt): - DGEMM(ctx, A.block(m, k), A.block(n, k), A.block(m, n), transa='N', transb='T', alpha=-1.0, beta=1.0) - - print(f"[PDPOTRF] Completed") + DGEMM( + ctx, + A.block(m, k), + A.block(n, k), + A.block(m, n), + transa="N", + transb="T", + alpha=-1.0, + beta=1.0, + ) + + print("[PDPOTRF] Completed") -def PDTRTRI(ctx, A, uplo='L', diag='N'): +def PDTRTRI(ctx, A, uplo="L", diag="N"): """Parallel tiled triangular matrix inversion""" - print(f"\n[PDTRTRI] Starting triangular inversion...") - assert uplo == 'L', "Only lower triangular inversion supported" - + print("\n[PDTRTRI] Starting triangular inversion...") + assert uplo == "L", "Only lower triangular inversion supported" + for k in range(A.nt): # Step 1: Update A[m,k] for m > k for m in range(k + 1, A.mt): - DTRSM(ctx, A.block(k, k), A.block(m, k), side='R', uplo='L', transa='N', diag=diag, alpha=-1.0) - + DTRSM( + ctx, + A.block(k, k), + A.block(m, k), + side="R", + uplo="L", + transa="N", + diag=diag, + alpha=-1.0, + ) + # Step 2: Update A[m,n] for m > k, n < k for m in range(k + 1, A.mt): for n in range(k): - DGEMM(ctx, A.block(m, k), A.block(k, n), A.block(m, n), transa='N', transb='N', alpha=1.0, beta=1.0) - + DGEMM( + ctx, + A.block(m, k), + A.block(k, n), + A.block(m, n), + transa="N", + transb="N", + alpha=1.0, + beta=1.0, + ) + # Step 3: Update A[k,n] for n < k for n in range(k): - DTRSM(ctx, A.block(k, k), A.block(k, n), side='L', uplo='L', transa='N', diag=diag, alpha=1.0) - + DTRSM( + ctx, + A.block(k, k), + A.block(k, n), + side="L", + uplo="L", + transa="N", + diag=diag, + alpha=1.0, + ) + # Step 4: Invert diagonal block A[k,k] DTRTRI(ctx, A.block(k, k), uplo=uplo, diag=diag) - - print(f"[PDTRTRI] Completed") + + print("[PDTRTRI] Completed") -def DLAAUM(ctx, a, uplo='L'): +def DLAAUM(ctx, a, uplo="L"): """Compute A^T * A for a triangular block (lauum operation)""" with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t: d_block = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): # lauum: compute L * L^T for lower triangular L - if uplo == 'L': + if uplo == "L": L = cp.tril(d_block) d_block[:] = L @ L.T else: @@ -382,248 +477,356 @@ def DLAAUM(ctx, a, uplo='L'): d_block[:] = U.T @ U -def PDLAUUM(ctx, A, uplo='L'): +def PDLAUUM(ctx, A, uplo="L"): 
"""Parallel tiled computation of A^T * A for lower triangular A""" - print(f"\n[PDLAUUM] Starting LAUUM (A^T * A)...") - assert uplo == 'L', "Only lower triangular LAUUM supported" - + print("\n[PDLAUUM] Starting LAUUM (A^T * A)...") + assert uplo == "L", "Only lower triangular LAUUM supported" + for k in range(A.mt): # Step 1: Update off-diagonal blocks for n in range(k): # Update A[n,n] with A[k,n]^T * A[k,n] - DSYRK(ctx, A.block(k, n), A.block(n, n), uplo='L', trans='T', alpha=1.0, beta=1.0) - + DSYRK( + ctx, + A.block(k, n), + A.block(n, n), + uplo="L", + trans="T", + alpha=1.0, + beta=1.0, + ) + # Update A[m,n] with A[k,m]^T * A[k,n] for m in range(n + 1, k): - DGEMM(ctx, A.block(k, m), A.block(k, n), A.block(m, n), transa='T', transb='N', alpha=1.0, beta=1.0) - + DGEMM( + ctx, + A.block(k, m), + A.block(k, n), + A.block(m, n), + transa="T", + transb="N", + alpha=1.0, + beta=1.0, + ) + # Step 2: Update A[k,n] = A[k,k]^T * A[k,n] for n in range(k): - DTRMM(ctx, A.block(k, k), A.block(k, n), side='L', uplo='L', transa='T', diag='N', alpha=1.0) - + DTRMM( + ctx, + A.block(k, k), + A.block(k, n), + side="L", + uplo="L", + transa="T", + diag="N", + alpha=1.0, + ) + # Step 3: Update diagonal block A[k,k] = A[k,k]^T * A[k,k] DLAAUM(ctx, A.block(k, k), uplo=uplo) - - print(f"[PDLAUUM] Completed") + + print("[PDLAUUM] Completed") -def PDGEMM(ctx, A, B, C, transa='N', transb='N', alpha=1.0, beta=1.0): +def PDGEMM(ctx, A, B, C, transa="N", transb="N", alpha=1.0, beta=1.0): """Parallel tiled matrix multiplication""" - print(f"\n[PDGEMM] Starting matrix multiplication...") - + print("\n[PDGEMM] Starting matrix multiplication...") + for m in range(C.mt): for n in range(C.nt): - inner_k = A.nt if transa == 'N' else A.mt - + inner_k = A.nt if transa == "N" else A.mt + if alpha == 0.0 or inner_k == 0: # Just scale C - DGEMM(ctx, A.block(0, 0), B.block(0, 0), C.block(m, n), transa=transa, transb=transb, alpha=0.0, beta=beta) - elif transa == 'N': - if transb == 'N': + DGEMM( + ctx, + A.block(0, 0), + B.block(0, 0), + C.block(m, n), + transa=transa, + transb=transb, + alpha=0.0, + beta=beta, + ) + elif transa == "N": + if transb == "N": for k in range(A.nt): zbeta = beta if k == 0 else 1.0 - DGEMM(ctx, A.block(m, k), B.block(k, n), C.block(m, n), transa='N', transb='N', alpha=alpha, beta=zbeta) + DGEMM( + ctx, + A.block(m, k), + B.block(k, n), + C.block(m, n), + transa="N", + transb="N", + alpha=alpha, + beta=zbeta, + ) else: for k in range(A.nt): zbeta = beta if k == 0 else 1.0 - DGEMM(ctx, A.block(m, k), B.block(n, k), C.block(m, n), transa='N', transb='T', alpha=alpha, beta=zbeta) + DGEMM( + ctx, + A.block(m, k), + B.block(n, k), + C.block(m, n), + transa="N", + transb="T", + alpha=alpha, + beta=zbeta, + ) else: # transa in ['T', 'C'] - if transb == 'N': + if transb == "N": for k in range(A.mt): zbeta = beta if k == 0 else 1.0 - DGEMM(ctx, A.block(k, m), B.block(k, n), C.block(m, n), transa='T', transb='N', alpha=alpha, beta=zbeta) + DGEMM( + ctx, + A.block(k, m), + B.block(k, n), + C.block(m, n), + transa="T", + transb="N", + alpha=alpha, + beta=zbeta, + ) else: for k in range(A.mt): zbeta = beta if k == 0 else 1.0 - DGEMM(ctx, A.block(k, m), B.block(n, k), C.block(m, n), transa='T', transb='T', alpha=alpha, beta=zbeta) - - print(f"[PDGEMM] Completed") + DGEMM( + ctx, + A.block(k, m), + B.block(n, k), + C.block(m, n), + transa="T", + transb="T", + alpha=alpha, + beta=zbeta, + ) + print("[PDGEMM] Completed") -def PDSYMM(ctx, A, B, C, side='L', uplo='L', alpha=1.0, beta=1.0): + +def PDSYMM(ctx, A, B, 
C, side="L", uplo="L", alpha=1.0, beta=1.0): """Parallel tiled symmetric matrix multiplication""" - print(f"\n[PDSYMM] Starting symmetric matrix multiplication...") - + print("\n[PDSYMM] Starting symmetric matrix multiplication...") + for m in range(C.mt): for n in range(C.nt): - if side == 'L': - if uplo == 'L': + if side == "L": + if uplo == "L": for k in range(C.mt): zbeta = beta if k == 0 else 1.0 if k < m: - DGEMM(ctx, A.block(m, k), B.block(k, n), C.block(m, n), transa='N', transb='N', alpha=alpha, beta=zbeta) + DGEMM( + ctx, + A.block(m, k), + B.block(k, n), + C.block(m, n), + transa="N", + transb="N", + alpha=alpha, + beta=zbeta, + ) else: if k == m: - DSYMM(ctx, A.block(k, k), B.block(k, n), C.block(m, n), side=side, uplo=uplo, alpha=alpha, beta=zbeta) + DSYMM( + ctx, + A.block(k, k), + B.block(k, n), + C.block(m, n), + side=side, + uplo=uplo, + alpha=alpha, + beta=zbeta, + ) else: - DGEMM(ctx, A.block(k, m), B.block(k, n), C.block(m, n), transa='T', transb='N', alpha=alpha, beta=zbeta) + DGEMM( + ctx, + A.block(k, m), + B.block(k, n), + C.block(m, n), + transa="T", + transb="N", + alpha=alpha, + beta=zbeta, + ) else: # side == 'R' # Similar logic for right multiplication pass - - print(f"[PDSYMM] Completed") + + print("[PDSYMM] Completed") def compute_norm(ctx, matrix): """Compute Frobenius norm of matrix using host tasks""" norm_sq = 0.0 - + for colb in range(matrix.nt): low_rowb = colb if matrix.sym_matrix else 0 for rowb in range(low_rowb, matrix.mt): handle = matrix.handle(rowb, colb) - + # Host task to read the block and compute norm def compute_block_norm(h_block): nonlocal norm_sq norm_sq += np.sum(h_block * h_block) - + with ctx.task(stf.exec_place.host(), handle.read()) as t: # Synchronize the stream before reading data cp.cuda.runtime.streamSynchronize(t.stream_ptr()) - + h_block = cai_to_numpy(t.get_arg_cai(0)) compute_block_norm(h_block) - + return np.sqrt(norm_sq) def main(): import argparse - - parser = argparse.ArgumentParser(description='Tiled POTRI (matrix inversion via Cholesky) with CUDA STF') - parser.add_argument('N', type=int, nargs='?', default=512, help='Matrix size (default: 512)') - parser.add_argument('NB', type=int, nargs='?', default=128, help='Block size (default: 128)') - parser.add_argument('--check', action='store_true', help='Check result (slower)') + + parser = argparse.ArgumentParser( + description="Tiled POTRI (matrix inversion via Cholesky) with CUDA STF" + ) + parser.add_argument( + "N", type=int, nargs="?", default=512, help="Matrix size (default: 512)" + ) + parser.add_argument( + "NB", type=int, nargs="?", default=128, help="Block size (default: 128)" + ) + parser.add_argument("--check", action="store_true", help="Check result (slower)") args = parser.parse_args() - + N = args.N NB = args.NB check_result = args.check - + assert N % NB == 0, f"Matrix size {N} must be divisible by block size {NB}" - - print("="*60) + + print("=" * 60) print("Tiled POTRI (Matrix Inversion) with CUDA STF + CuPy") - print("="*60) + print("=" * 60) print(f"Matrix size: {N}x{N}") print(f"Block size: {NB}x{NB}") - print(f"Number of blocks: {N//NB}x{N//NB}") + print(f"Number of blocks: {N // NB}x{N // NB}") print(f"Check result: {check_result}") - print("="*60) - + print("=" * 60) + # Create STF context ctx = stf.context() - + # Create matrices A = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=True, symbol="A") - + if check_result: Aref = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=False, symbol="Aref") - - print("\n" + "="*60) + + print("\n" + "=" * 60) 
print("Initializing matrices...") - print("="*60) - + print("=" * 60) + # Hilbert matrix + diagonal dominance for numerical stability def hilbert(row, col): return 1.0 / (col + row + 1.0) + 2.0 * N * (col == row) - + A.fill(hilbert) if check_result: Aref.fill(hilbert) - + # Measure performance import time + start_time = time.time() - - print("\n" + "="*60) + + print("\n" + "=" * 60) print("Performing POTRI (inversion via Cholesky)...") - print("="*60) - + print("=" * 60) + # Step 1: Cholesky factorization A = L*L^T - PDPOTRF(ctx, A, uplo='L') - + PDPOTRF(ctx, A, uplo="L") + # Step 2: Triangular inversion L^(-1) - PDTRTRI(ctx, A, uplo='L', diag='N') - + PDTRTRI(ctx, A, uplo="L", diag="N") + # Step 3: Compute A^(-1) = L^(-T) * L^(-1) - PDLAUUM(ctx, A, uplo='L') - + PDLAUUM(ctx, A, uplo="L") + if check_result: - print("\n" + "="*60) + print("\n" + "=" * 60) print("Verifying result...") - print("="*60) - + print("=" * 60) + # Create test vector B B_potri = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="B_potri") - Bref_potri = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="Bref_potri") - + Bref_potri = TiledMatrix( + ctx, N, 1, NB, 1, is_symmetric=False, symbol="Bref_potri" + ) + def rhs_vals(row, col): return 1.0 * (row + 1) - + B_potri.fill(rhs_vals) Bref_potri.fill(rhs_vals) - + # Compute norm of B b_norm = compute_norm(ctx, Bref_potri) - + # Create temporary matrix for result B_tmp = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="B_tmp") - + def zero_vals(row, col): return 0.0 - + B_tmp.fill(zero_vals) - + # Compute B_tmp = A^(-1) * B - PDSYMM(ctx, A, B_potri, B_tmp, side='L', uplo='L', alpha=1.0, beta=0.0) - + PDSYMM(ctx, A, B_potri, B_tmp, side="L", uplo="L", alpha=1.0, beta=0.0) + # Compute residual: Bref = Aref * B_tmp - Bref - PDGEMM(ctx, Aref, B_tmp, Bref_potri, transa='N', transb='N', alpha=1.0, beta=-1.0) - + PDGEMM( + ctx, Aref, B_tmp, Bref_potri, transa="N", transb="N", alpha=1.0, beta=-1.0 + ) + # Compute residual norm res_norm = compute_norm(ctx, Bref_potri) - - print("\n" + "="*60) + + print("\n" + "=" * 60) print("Finalizing STF context...") - print("="*60) + print("=" * 60) ctx.finalize() - + end_time = time.time() elapsed_ms = (end_time - start_time) * 1000.0 - + # Compute FLOPS for POTRI # POTRF: (1/3) * N^3 - # TRTRI: (1/3) * N^3 + # TRTRI: (1/3) * N^3 # LAUUM: (1/3) * N^3 # Total: N^3 flops = float(N) ** 3 gflops = flops / (elapsed_ms / 1000.0) / 1e9 - - print("\n" + "="*60) + + print("\n" + "=" * 60) print("Results") - print("="*60) + print("=" * 60) print(f"[POTRI] Elapsed time: {elapsed_ms:.2f} ms") print(f"[POTRI] Performance: {gflops:.2f} GFLOPS") - + if check_result: residual = res_norm / b_norm print(f"\n[POTRI] ||A * (A^(-1) * B) - B||: {res_norm:.6e}") print(f"[POTRI] ||B||: {b_norm:.6e}") print(f"[POTRI] Residual (||A * (A^(-1) * B) - B||/||B||): {residual:.6e}") - + if residual < 0.01: print("\n✅ Algorithm converged successfully!") return 0 else: print(f"\n❌ Algorithm did not converge (residual {residual:.6e} >= 0.01)") return 1 - - print("="*60) + + print("=" * 60) return 0 if __name__ == "__main__": sys.exit(main()) - From 9a5c265e80c09b518539a74e67c4a028525a5024 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 24 Nov 2025 21:22:32 +0100 Subject: [PATCH 201/221] no need for numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 anymore --- python/cuda_cccl/cuda/stf/decorator.py | 3 --- python/cuda_cccl/tests/stf/test_decorator.py | 1 - python/cuda_cccl/tests/stf/test_fhe.py | 1 - 
python/cuda_cccl/tests/stf/test_fhe_decorator.py | 1 - python/cuda_cccl/tests/stf/test_numba.py | 1 - python/cuda_cccl/tests/stf/test_pytorch.py | 1 - python/cuda_cccl/tests/stf/test_stencil_decorator.py | 1 - python/cuda_cccl/tests/stf/test_token.py | 1 - 8 files changed, 10 deletions(-) diff --git a/python/cuda_cccl/cuda/stf/decorator.py b/python/cuda_cccl/cuda/stf/decorator.py index 65af9734f44..41bf71c6316 100644 --- a/python/cuda_cccl/cuda/stf/decorator.py +++ b/python/cuda_cccl/cuda/stf/decorator.py @@ -1,10 +1,7 @@ -import numba from numba import cuda from cuda.stf import context, dep, exec_place -numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 - class stf_kernel_decorator: def __init__(self, pyfunc, jit_args, jit_kwargs): diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 16bc2539538..338c76f28a7 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -5,7 +5,6 @@ import cuda.stf as stf -numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index d0bbdd3d596..94aaa7210da 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -9,7 +9,6 @@ import cuda.stf as stf -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index 571ff8013ea..79560dd25cf 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -9,7 +9,6 @@ import cuda.stf as cudastf -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index d15ae639bda..a2d30a7eb38 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -10,7 +10,6 @@ import cuda.stf as stf -numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index 001a7002d08..c4b337e801d 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -9,7 +9,6 @@ torch = pytest.importorskip("torch") -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 from cuda.stf._stf_bindings import ( # noqa: E402 diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index b4155c8b46b..07fa270ee17 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -4,7 +4,6 @@ import cuda.stf as cudastf -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git a/python/cuda_cccl/tests/stf/test_token.py b/python/cuda_cccl/tests/stf/test_token.py index acef5e34f3e..04ceb920b6a 100644 --- a/python/cuda_cccl/tests/stf/test_token.py +++ b/python/cuda_cccl/tests/stf/test_token.py @@ -8,7 +8,6 @@ import cuda.stf as stf -numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 From e7e2adb2ace465f86db3b883a2da45d89b782995 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 24 Nov 
2025 21:44:53 +0100 Subject: [PATCH 202/221] Our numba-cuda fix is part of 0.21.0 --- python/cuda_cccl/pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index c5600f0da11..1ef761e0924 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -31,8 +31,7 @@ dependencies = [ "numpy", "cuda-pathfinder>=1.2.3", "cuda-core", - # FIXME - "numba-cuda @ git+https://github.com/NVIDIA/numba-cuda.git@main", + "numba-cuda>=0.21.0", "typing_extensions", ] From 39040a9e9e939f0e551d7092908feb94da36832d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 11:19:58 +0100 Subject: [PATCH 203/221] Minor doc fix --- c/experimental/stf/include/cccl/c/experimental/stf/stf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 49ae71098af..848f0f1d5db 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -487,7 +487,7 @@ cudaStream_t stf_fence(stf_ctx_handle ctx); void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); //! -//! \brief Create logical data handle from address with data place specification [PRIMARY API] +//! \brief Create logical data handle from address with data place specification //! //! Creates logical data handle from existing memory buffer, explicitly specifying where //! the memory is located (host, device, managed, etc.). This is the primary and recommended From 8f27fa2a278762007356b0f0f2feb9b1cc920fee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 11:42:09 +0100 Subject: [PATCH 204/221] Ensure matplotlib is only used if available --- .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 21 ++++++++++++++++--- .../tests/stf/test_fdtd_pytorch_simplified.py | 21 ++++++++++++++++--- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index a64845055ce..bfb2d7b3a56 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -1,7 +1,6 @@ import math from typing import Literal, Optional, Tuple -import matplotlib.pyplot as plt import numpy as np import torch import torch.cuda as tc @@ -10,10 +9,21 @@ context, ) +try: + import matplotlib.pyplot as plt + + has_matplotlib = True +except ImportError: + has_matplotlib = False + Plane = Literal["xy", "xz", "yz"] def show_slice(t3d, plane="xy", index=None): + """Display a 2D slice of a 3D tensor (requires matplotlib).""" + if not has_matplotlib: + return + # grab a 2D view if plane == "xy": idx = t3d.shape[2] // 2 if index is None else index @@ -201,7 +211,8 @@ def source(t: float, x: float, y: float, z: float) -> float: ): ez = t.tensor_arguments() print(f"{n}\t{ez[cx, cy, cz].item():.6e}") - show_slice(ez, plane="xy") + if has_matplotlib: + show_slice(ez, plane="xy") pass ctx.finalize() @@ -210,4 +221,8 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # Run FDTD simulation print("Running FDTD 3D PyTorch example...") - test_fdtd_3d_pytorch(timesteps=1000, output_freq=5) + output_freq = 5 if has_matplotlib else 0 + if not has_matplotlib and output_freq > 0: + print("Warning: matplotlib not available, 
running without visualization") + output_freq = 0 + test_fdtd_3d_pytorch(timesteps=1000, output_freq=output_freq) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py index 24af8361162..5910d0978cd 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py @@ -1,7 +1,6 @@ import math from typing import Literal, Optional, Tuple -import matplotlib.pyplot as plt import numpy as np import torch @@ -9,10 +8,21 @@ context, ) +try: + import matplotlib.pyplot as plt + + has_matplotlib = True +except ImportError: + has_matplotlib = False + Plane = Literal["xy", "xz", "yz"] def show_slice(t3d, plane="xy", index=None): + """Display a 2D slice of a 3D tensor (requires matplotlib).""" + if not has_matplotlib: + return + # grab a 2D view if plane == "xy": idx = t3d.shape[2] // 2 if index is None else index @@ -202,7 +212,8 @@ def source(t: float, x: float, y: float, z: float) -> float: if output_freq > 0 and (n % output_freq) == 0: with ctx.pytorch_task(lez.read()) as (ez,): print(f"{n}\t{ez[cx, cy, cz].item():.6e}") - show_slice(ez, plane="xy") + if has_matplotlib: + show_slice(ez, plane="xy") ctx.finalize() @@ -210,4 +221,8 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # Run simplified FDTD simulation using pytorch_task print("Running FDTD simulation with pytorch_task syntax...") - test_fdtd_3d_pytorch_simplified(timesteps=1000, output_freq=5) + output_freq = 5 if has_matplotlib else 0 + if not has_matplotlib and output_freq > 0: + print("Warning: matplotlib not available, running without visualization") + output_freq = 0 + test_fdtd_3d_pytorch_simplified(timesteps=1000, output_freq=output_freq) From 73ac963a7f58b97ae5a777edca1c3d6893002962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 12:08:49 +0100 Subject: [PATCH 205/221] Cleanup examples --- python/cuda_cccl/pyproject.toml | 2 +- python/cuda_cccl/tests/stf/test_context.py | 1 - python/cuda_cccl/tests/stf/test_decorator.py | 1 - .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 5 +- .../tests/stf/test_fdtd_pytorch_simplified.py | 5 +- python/cuda_cccl/tests/stf/test_fhe.py | 1 - .../cuda_cccl/tests/stf/test_fhe_decorator.py | 1 - python/cuda_cccl/tests/stf/test_numba.py | 49 ++------------ python/cuda_cccl/tests/stf/test_pytorch.py | 67 ++----------------- .../tests/stf/test_stencil_decorator.py | 4 +- python/cuda_cccl/tests/stf/test_token.py | 4 -- 11 files changed, 19 insertions(+), 121 deletions(-) diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index 1ef761e0924..4c90e1a3e99 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "numpy", "cuda-pathfinder>=1.2.3", "cuda-core", - "numba-cuda>=0.21.0", + "numba-cuda @ git+https://github.com/NVIDIA/numba-cuda.git@v0.21.0", "typing_extensions", ] diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py index f4a583de351..451c44aadb8 100644 --- a/python/cuda_cccl/tests/stf/test_context.py +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -72,5 +72,4 @@ def test_ctx3(): if __name__ == "__main__": - print("Running CUDASTF examples...") test_ctx3() diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 338c76f28a7..ce8fad1d69b 100644 --- 
a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -42,5 +42,4 @@ def test_decorator(use_graph): if __name__ == "__main__": - print("Running CUDASTF examples...") test_decorator(False) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index bfb2d7b3a56..d550caba060 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -2,7 +2,9 @@ from typing import Literal, Optional, Tuple import numpy as np -import torch +import pytest + +torch = pytest.importorskip("torch") import torch.cuda as tc from cuda.stf._stf_bindings import ( @@ -220,7 +222,6 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # Run FDTD simulation - print("Running FDTD 3D PyTorch example...") output_freq = 5 if has_matplotlib else 0 if not has_matplotlib and output_freq > 0: print("Warning: matplotlib not available, running without visualization") diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py index 5910d0978cd..b786552b6b3 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py @@ -2,7 +2,9 @@ from typing import Literal, Optional, Tuple import numpy as np -import torch +import pytest + +torch = pytest.importorskip("torch") from cuda.stf._stf_bindings import ( context, @@ -220,7 +222,6 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # Run simplified FDTD simulation using pytorch_task - print("Running FDTD simulation with pytorch_task syntax...") output_freq = 5 if has_matplotlib else 0 if not has_matplotlib and output_freq > 0: print("Warning: matplotlib not available, running without visualization") diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index 94aaa7210da..e613d37ea76 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -164,5 +164,4 @@ def test_fhe(): if __name__ == "__main__": - print("Running CUDASTF FHE example...") test_fhe() diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index 79560dd25cf..969cbe08668 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -146,5 +146,4 @@ def test_fhe_decorator(): if __name__ == "__main__": - print("Running CUDASTF FHE decorator example...") test_fhe_decorator() diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index a2d30a7eb38..c0ca54375e7 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -41,10 +41,7 @@ def test_numba_graph(): # Verify results after finalize (data written back to host) # Expected: scale(2.0, 1.0) = 2.0 - if np.allclose(X, 2.0): - print("✅ Graph test: X values correct: all 2.0") - else: - print(f"❌ Graph test: X values incorrect: expected 2.0, got {X[:5]}...") + assert np.allclose(X, 2.0) def test_numba(): @@ -66,7 +63,6 @@ def test_numba(): with ctx.task(lX.read(), lY.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - print(nb_stream) dX = t.get_arg_numba(0) dY = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dY) @@ -84,44 +80,13 @@ def test_numba(): ctx.finalize() # Verify results after finalize (data 
written back to host) - print("Verifying results after finalize:") - # Expected values: # X: scale(2.0, 1.0) = 2.0 # Y: axpy(2.0, X=2.0, Y=1.0) = 2.0*2.0 + 1.0 = 5.0 # Z: axpy(2.0, X=2.0, Z=1.0) = 5.0, then axpy(2.0, Y=5.0, Z=5.0) = 15.0 - expected_X = 2.0 - expected_Y = 5.0 - expected_Z = 15.0 - - # Check X values - if np.allclose(X, expected_X, rtol=1e-6, atol=1e-6): - print(f"✅ X values correct: all {expected_X}") - else: - actual_x = X[0] if len(X) > 0 else "N/A" - print( - f"❌ X values incorrect: expected {expected_X}, got {actual_x} (diff: {abs(actual_x - expected_X):.2e})" - ) - - # Check Y values - if np.allclose(Y, expected_Y, rtol=1e-6, atol=1e-6): - print(f"✅ Y values correct: all {expected_Y}") - else: - actual_y = Y[0] if len(Y) > 0 else "N/A" - print( - f"❌ Y values incorrect: expected {expected_Y}, got {actual_y} (diff: {abs(actual_y - expected_Y):.2e})" - ) - - # Check Z values - if np.allclose(Z, expected_Z, rtol=1e-6, atol=1e-6): - print(f"✅ Z values correct: all {expected_Z}") - else: - actual_z = Z[0] if len(Z) > 0 else "N/A" - print( - f"❌ Z values incorrect: expected {expected_Z}, got {actual_z} (diff: {abs(actual_z - expected_Z):.2e})" - ) - - print(f"Sample values: X[0]={X[0]}, Y[0]={Y[0]}, Z[0]={Z[0]}") + assert np.allclose(X, 2.0) + assert np.allclose(Y, 5.0) + assert np.allclose(Z, 15.0) @cuda.jit @@ -196,8 +161,7 @@ def test_numba2d(): u_out_ref[:, -1] = u[:, -1] # compare with the GPU result - max_abs_diff = np.abs(u_out - u_out_ref).max() - print(f"max(|gpu - ref|) = {max_abs_diff:.3e}") + assert np.allclose(u_out, u_out_ref, rtol=1e-6, atol=1e-6) def test_numba_exec_place(): @@ -218,7 +182,6 @@ def test_numba_exec_place(): with ctx.task(stf.exec_place.device(0), lX.read(), lY.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - print(nb_stream) dX = t.get_arg_numba(0) dY = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dY) @@ -261,7 +224,6 @@ def test_numba_places(): with ctx.task(lX.read(), lY.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - print(nb_stream) dX = t.get_arg_numba(0) dY = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dY) @@ -280,6 +242,5 @@ def test_numba_places(): if __name__ == "__main__": - print("Running CUDASTF examples...") test_numba_graph() # test_numba() diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index c4b337e801d..02a7bc1c1b3 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -58,40 +58,13 @@ def test_pytorch(): ctx.finalize() # Verify results on host after finalize - print("Verifying results...") - # Expected values: # X: 1.0 -> 2.0 (multiplied by 2) # Y: 1.0 -> 4.0 (X * 2 = 2.0 * 2 = 4.0) # Z: 1.0 -> 9.0 (X * 4 + 1 = 2.0 * 4 + 1 = 9.0) -> 5.0 (Y * 2 - 3 = 4.0 * 2 - 3 = 5.0) - - expected_X = 2.0 - expected_Y = 4.0 - expected_Z = 5.0 - - # Check a few values to verify correctness - assert np.allclose(X[:10], expected_X), ( - f"X mismatch: got {X[:10]}, expected {expected_X}" - ) - assert np.allclose(Y[:10], expected_Y), ( - f"Y mismatch: got {Y[:10]}, expected {expected_Y}" - ) - assert np.allclose(Z[:10], expected_Z), ( - f"Z mismatch: got {Z[:10]}, expected {expected_Z}" - ) - - # Check entire arrays - assert np.all(X == expected_X), ( - f"X array not uniform: min={X.min()}, max={X.max()}, expected={expected_X}" - ) - assert np.all(Y == expected_Y), ( - f"Y array not uniform: min={Y.min()}, max={Y.max()}, expected={expected_Y}" - ) - assert np.all(Z == expected_Z), ( - f"Z array not 
uniform: min={Z.min()}, max={Z.max()}, expected={expected_Z}" - ) - - print(f"✅ All checks passed! X={X[0]}, Y={Y[0]}, Z={Z[0]}") + assert np.allclose(X, 2.0) + assert np.allclose(Y, 4.0) + assert np.allclose(Z, 5.0) def test_pytorch_task(): @@ -132,42 +105,14 @@ def test_pytorch_task(): ctx.finalize() # Verify results on host after finalize (same as original test) - print("Verifying pytorch_task results...") - # Expected values: # X: 1.0 -> 2.0 (multiplied by 2) # Y: 1.0 -> 4.0 (X * 2 = 2.0 * 2 = 4.0) # Z: 1.0 -> 9.0 (X * 4 + 1 = 2.0 * 4 + 1 = 9.0) -> 5.0 (Y * 2 - 3 = 4.0 * 2 - 3 = 5.0) - - expected_X = 2.0 - expected_Y = 4.0 - expected_Z = 5.0 - - # Check a few values to verify correctness - assert np.allclose(X[:10], expected_X), ( - f"X mismatch: got {X[:10]}, expected {expected_X}" - ) - assert np.allclose(Y[:10], expected_Y), ( - f"Y mismatch: got {Y[:10]}, expected {expected_Y}" - ) - assert np.allclose(Z[:10], expected_Z), ( - f"Z mismatch: got {Z[:10]}, expected {expected_Z}" - ) - - # Check entire arrays - assert np.all(X == expected_X), ( - f"X array not uniform: min={X.min()}, max={X.max()}, expected={expected_X}" - ) - assert np.all(Y == expected_Y), ( - f"Y array not uniform: min={Y.min()}, max={Y.max()}, expected={expected_Y}" - ) - assert np.all(Z == expected_Z), ( - f"Z array not uniform: min={Z.min()}, max={Z.max()}, expected={expected_Z}" - ) - - print(f"✅ All pytorch_task checks passed! X={X[0]}, Y={Y[0]}, Z={Z[0]}") + assert np.allclose(X, 2.0) + assert np.allclose(Y, 4.0) + assert np.allclose(Z, 5.0) if __name__ == "__main__": - print("Running CUDASTF examples...") test_pytorch() diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index 07fa270ee17..e8571edeae3 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -76,10 +76,8 @@ def test_numba2d(): u_out_ref[:, -1] = u[:, -1] # compare with the GPU result - max_abs_diff = np.abs(u_out - u_out_ref).max() - print(f"max(|gpu - ref|) = {max_abs_diff:.3e}") + assert np.allclose(u_out, u_out_ref, rtol=1e-6, atol=1e-6) if __name__ == "__main__": - print("Running CUDASTF stencil decorator example...") test_numba2d() diff --git a/python/cuda_cccl/tests/stf/test_token.py b/python/cuda_cccl/tests/stf/test_token.py index 04ceb920b6a..abadab8305d 100644 --- a/python/cuda_cccl/tests/stf/test_token.py +++ b/python/cuda_cccl/tests/stf/test_token.py @@ -56,7 +56,6 @@ def test_numba_token(): with ctx.task(lX.read(), lY.rw(), token.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - print(nb_stream) dX = t.get_arg_numba(0) dY = t.get_arg_numba(1) axpy[blocks, threads_per_block, nb_stream](2.0, dX, dY) @@ -76,10 +75,7 @@ def test_numba_token(): assert np.allclose(Y, 5.0), ( f"Y should be 5.0 after two axpy operations, but got {Y[0]}" ) - print(f"✓ X = {X[0]} (expected 1.0)") - print(f"✓ Y = {Y[0]} (expected 5.0)") if __name__ == "__main__": - print("Running CUDASTF examples...") test_token() From d90ed649bde1b1377e951c1cc1c884f195e2c6ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 13:41:18 +0100 Subject: [PATCH 206/221] cmake fix --- python/cuda_cccl/CMakeLists.txt | 6 ------ 1 file changed, 6 deletions(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 0e4625b9f25..ae855cf60d4 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -18,12 +18,6 @@ message( # Build 
cccl.c.parallel and add CCCL's install rules set(_cccl_root ../..) -include(${_cccl_root}/cmake/AppendOptionIfAvailable.cmake) -include(${_cccl_root}/cmake/CCCLConfigureTarget.cmake) -include(${_cccl_root}/cmake/CCCLBuildCompilerTargets.cmake) -include(${_cccl_root}/cmake/CCCLGetDependencies.cmake) -cccl_build_compiler_targets() - # Build and install C++ library first set(CCCL_TOPLEVEL_PROJECT ON) # Enable the developer builds set(CCCL_ENABLE_C_PARALLEL ON) From eb77519bcfa33cbb966d9cad8686034b7e7ac031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 14:13:37 +0100 Subject: [PATCH 207/221] Cmake fixes (need extra cleanup) --- python/cuda_cccl/CMakeLists.txt | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index ae855cf60d4..ebabc475330 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -22,9 +22,19 @@ set(_cccl_root ../..) set(CCCL_TOPLEVEL_PROJECT ON) # Enable the developer builds set(CCCL_ENABLE_C_PARALLEL ON) set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) # Enable C experimental STF library (triggers c/ directory) -set(CCCL_C_PARALLEL_ENABLE_TESTING OFF) # Testing belongs in CI, not Python build -set(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING OFF) # Testing belongs in CI, not Python build set(CCCL_ENABLE_UNSTABLE ON) # Enable unstable features + +# Disable all testing, examples, and benchmarks - we only want the libraries +set(CCCL_ENABLE_TESTING OFF) +set(CCCL_ENABLE_EXAMPLES OFF) +set(CCCL_ENABLE_BENCHMARKS OFF) +set(CCCL_C_PARALLEL_ENABLE_TESTING OFF) +set(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING OFF) +# Note: CCCL_ENABLE_CUDAX must be ON because STF depends on it (via CCCL_ENABLE_UNSTABLE) +# But disable cudax tests, examples, and header testing +set(cudax_ENABLE_TESTING OFF) +set(cudax_ENABLE_EXAMPLES OFF) +set(cudax_ENABLE_HEADER_TESTING OFF) set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) @@ -42,6 +52,11 @@ add_subdirectory(${_cccl_root} _parent_cccl) set(CMAKE_INSTALL_LIBDIR "${old_libdir}") # pop set(CMAKE_INSTALL_INCLUDEDIR "${old_includedir}") # pop +# Create CCCL::cudax alias for STF (normally created by cccl-config.cmake) +if (TARGET cudax::cudax AND NOT TARGET CCCL::cudax) + add_library(CCCL::cudax ALIAS cudax::cudax) +endif() + # ensure the destination directory exists file(MAKE_DIRECTORY "cuda/stf/${CUDA_VERSION_DIR}/cccl") file(MAKE_DIRECTORY "cuda/compute/${CUDA_VERSION_DIR}/cccl") From b38ff802e4cc6cf15ac79dfa0518e42f8b8ef69d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 15:58:20 +0100 Subject: [PATCH 208/221] Work-around for lazy resource init during graph capture in cuda core --- python/cuda_cccl/tests/stf/test_decorator.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index ce8fad1d69b..2b4552e386b 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -4,6 +4,7 @@ from numba import cuda import cuda.stf as stf +from cuda.core.experimental import Device numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 @@ -26,6 +27,10 @@ def scale(a, x): def test_decorator(use_graph): X, Y, Z = (np.ones(16, np.float32) for _ in range(3)) + # XXX Work-around to force the initialization of CUDA devices in cuda.core and + # 
avoid lazy resource init during graph capture. + Device().set_current() + ctx = stf.context(use_graph=use_graph) lX = ctx.logical_data(X) lY = ctx.logical_data(Y) From 0a3e6671bc52837ce2b0a1e367d403e0ae4166d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 16:41:04 +0100 Subject: [PATCH 209/221] Use a relaxed capture mode --- cudax/include/cuda/experimental/__stf/graph/graph_task.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh index b3a24b8fbdd..a070fc446ff 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh @@ -104,7 +104,7 @@ public: { // Select a stream from the pool capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; - cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal)); + cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeRelaxed)); } auto& dot = *ctx.get_dot(); @@ -365,7 +365,7 @@ public: capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; cudaGraph_t childGraph = nullptr; - cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal)); + cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeRelaxed)); // Launch the user provided function f(capture_stream); @@ -625,7 +625,7 @@ public: cudaStream_t capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; cudaGraph_t childGraph = nullptr; - cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal)); + cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeRelaxed)); // Launch the user provided function if constexpr (fun_invocable_stream_deps) From 8642fdd92efdaa4d63eac3042f3f910a253d0f08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 16:46:50 +0100 Subject: [PATCH 210/221] This work-around is not needed anymore with a relaxed capture mode --- python/cuda_cccl/tests/stf/test_decorator.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 2b4552e386b..ce8fad1d69b 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -4,7 +4,6 @@ from numba import cuda import cuda.stf as stf -from cuda.core.experimental import Device numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 @@ -27,10 +26,6 @@ def scale(a, x): def test_decorator(use_graph): X, Y, Z = (np.ones(16, np.float32) for _ in range(3)) - # XXX Work-around to force the initialization of CUDA devices in cuda.core and - # avoid lazy resource init during graph capture. 
- Device().set_current() - ctx = stf.context(use_graph=use_graph) lX = ctx.logical_data(X) lY = ctx.logical_data(Y) From 0f9865d99866561db59aa5aadf3d7f24bec3b6f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 17:27:51 +0100 Subject: [PATCH 211/221] cleanup warp example --- .../cuda_cccl/tests/stf/example_fluid_warp.py | 86 +------------------ 1 file changed, 2 insertions(+), 84 deletions(-) diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py index c1d903b9be7..ab3fd406864 100644 --- a/python/cuda_cccl/tests/stf/example_fluid_warp.py +++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py @@ -29,53 +29,13 @@ import cuda.stf as cudastf +# Add a stf-specific decorator to the wp. namespace def stf_kernel(pyfunc): # let warp decorate normally kernel = wp.kernel(pyfunc) # attach an STF-aware call operator def _stf_call(*args, dim=None, stream=None, **kwargs): - print(f"[STF TRACE] {pyfunc.__name__}") - print(f" dim={dim}, stream={stream}") - - # Enhanced arg display with logical data detection - if args: - print(" args=[") - for i, arg in enumerate(args): - # Detect if argument is or contains STF logical data - is_logical_data = False - symbol = None - - # Check if arg is directly STF logical data - if hasattr(arg, "__class__") and "logical_data" in str(type(arg)): - is_logical_data = True - if hasattr(arg, "symbol") and arg.symbol: - symbol = arg.symbol - # Check if arg has attached STF logical data (Warp array) - elif hasattr(arg, "_stf_ld"): - is_logical_data = True - if hasattr(arg._stf_ld, "symbol") and arg._stf_ld.symbol: - symbol = arg._stf_ld.symbol - # Fallback to _name for Warp arrays - elif hasattr(arg, "_name") and arg._name: - symbol = arg._name - - if is_logical_data: - if symbol: - print(f" [{i}]: '{symbol}' [logical_data]") - else: - print(f" [{i}]: logical_data") - else: - # Regular arguments (scalars, etc.) - if hasattr(arg, "shape"): # Array-like but not logical data - print(f" [{i}]: {type(arg).__name__}") - else: # Scalar value - print(f" [{i}]: {arg}") - print(" ]") - else: - print(f" args={args}") - - print(f" kwargs={kwargs}") return wp.stf.launch(kernel, dim=dim, inputs=args, stream=stream, **kwargs) # monkey-patch a method onto the kernel object @@ -85,49 +45,6 @@ def _stf_call(*args, dim=None, stream=None, **kwargs): def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): - print(f"[STF TRACE] launching kernel: {getattr(kernel, '__name__', kernel)}") - print(f" dim = {dim}") - print(f" stream = {stream}") - - # Enhanced input display with logical data detection - if inputs: - print(" inputs = [") - for i, inp in enumerate(inputs): - # Detect if input is or contains STF logical data - is_logical_data = False - symbol = None - - # Check if inp is directly STF logical data - if hasattr(inp, "__class__") and "logical_data" in str(type(inp)): - is_logical_data = True - if hasattr(inp, "symbol") and inp.symbol: - symbol = inp.symbol - # Check if inp has attached STF logical data (Warp array) - elif hasattr(inp, "_stf_ld"): - is_logical_data = True - if hasattr(inp._stf_ld, "symbol") and inp._stf_ld.symbol: - symbol = inp._stf_ld.symbol - # Fallback to _name for Warp arrays - elif hasattr(inp, "_name") and inp._name: - symbol = inp._name - - if is_logical_data: - if symbol: - print(f" [{i}]: '{symbol}' [logical_data]") - else: - print(f" [{i}]: logical_data") - else: - # Regular arguments (scalars, etc.) 
- if hasattr(inp, "shape"): # Array-like but not logical data - print(f" [{i}]: {type(inp).__name__}") - else: # Scalar value - print(f" [{i}]: {inp}") - print(" ]") - else: - print(f" inputs = {inputs}") - - print(f" kwargs = {kwargs}") - # just forward to warp for now return wp.launch( kernel, @@ -387,6 +304,7 @@ def step(self): self.p0.zero_() self.p1.zero_() + # TODO experiment with explicit capture at Warp level # if self.use_cuda_graph: # wp.capture_launch(self.graph) # else: From 6466347ed68b0e369348029424d82e41018c429f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 17:41:10 +0100 Subject: [PATCH 212/221] Cleanups in the cython code for STF --- python/cuda_cccl/cuda/stf/_stf_bindings.py | 22 ++++++++++++-- .../cuda_cccl/cuda/stf/_stf_bindings_impl.pyx | 29 +++++-------------- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings.py b/python/cuda_cccl/cuda/stf/_stf_bindings.py index c61e908fc8d..169490739ee 100644 --- a/python/cuda_cccl/cuda/stf/_stf_bindings.py +++ b/python/cuda_cccl/cuda/stf/_stf_bindings.py @@ -27,9 +27,26 @@ def _load_cuda_libraries(): - # Load appropriate libraries for the detected CUDA version + """ + Preload CUDA libraries to ensure proper symbol resolution. + + These libraries are indirect dependencies pulled in via cccl.c.parallel. + Preloading ensures reliable symbol resolution regardless of dynamic linker behavior. + """ + import warnings + for libname in ("nvrtc", "nvJitLink"): - load_nvidia_dynamic_lib(libname) + try: + load_nvidia_dynamic_lib(libname) + except Exception as e: + # Log warning but don't fail - the extension might still work + # if the libraries are already loaded or available through other means + warnings.warn( + f"Failed to preload CUDA library '{libname}': {e}. " + f"STF bindings may fail to load if {libname} is not available.", + RuntimeWarning, + stacklevel=2, + ) _load_cuda_libraries() @@ -53,4 +70,5 @@ def _load_cuda_libraries(): except ImportError as e: raise ImportError( f"Failed to import CUDA STF bindings for CUDA {cuda_version}. 
" + f"Ensure cuda-cccl is properly installed with: pip install cuda-cccl[cu{cuda_version}]" ) from e diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx index 00b8dd39ed5..afc2f431192 100644 --- a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx @@ -6,24 +6,19 @@ # Make sure to update PYI with change to Python API to ensure that Python # static type checker tools like mypy green-lights cuda.cccl.parallel -from cpython.buffer cimport Py_buffer, PyObject_GetBuffer, PyBuffer_Release -from cpython.buffer cimport Py_buffer, PyBUF_FORMAT, PyBUF_ND, PyObject_GetBuffer, PyBuffer_Release -from cpython.bytes cimport PyBytes_FromStringAndSize -from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t -from libc.stdint cimport uintptr_t -from libc.string cimport memset, memcpy -import math # for math.prod - -# TODO remove that dependency -import numpy as np - from cpython.buffer cimport ( - Py_buffer, PyBUF_SIMPLE, PyBUF_ANY_CONTIGUOUS, - PyBuffer_Release, PyObject_CheckBuffer, PyObject_GetBuffer + Py_buffer, PyBUF_FORMAT, PyBUF_ND, PyBUF_SIMPLE, PyBUF_ANY_CONTIGUOUS, + PyObject_GetBuffer, PyBuffer_Release, PyObject_CheckBuffer ) +from cpython.bytes cimport PyBytes_FromStringAndSize from cpython.pycapsule cimport ( PyCapsule_CheckExact, PyCapsule_IsValid, PyCapsule_GetPointer ) +from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t +from libc.string cimport memset, memcpy +import math # for math.prod + +import numpy as np import ctypes from enum import IntFlag @@ -38,9 +33,6 @@ cdef extern from "": ctypedef OpaqueCUkernel_st *CUkernel ctypedef OpaqueCUlibrary_st *CUlibrary -#typedef struct CUstream_st* cudaStream_t; - - cdef extern from "cccl/c/experimental/stf/stf.h": # # Contexts @@ -132,7 +124,6 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_task_end(stf_task_handle t) void stf_task_enable_capture(stf_task_handle t) CUstream stf_task_get_custream(stf_task_handle t) - # cudaStream_t stf_task_get_stream(stf_task_handle t) void* stf_task_get(stf_task_handle t, int submitted_index) void stf_task_destroy(stf_task_handle t) @@ -222,8 +213,6 @@ cdef class logical_data: # Unknown vector type - treat as original self._shape = original_shape self._dtype = np.dtype(typestr) - - print(f"STF: Automatically flattened vector type {typestr} -> {self._dtype} with shape {self._shape}") else: # Regular scalar type self._shape = original_shape @@ -472,7 +461,6 @@ cdef class task: def __dealloc__(self): if self._t != NULL: stf_task_destroy(self._t) -# self._lds_args.clear() def start(self): # This is ignored if this is not a graph task @@ -648,7 +636,6 @@ cdef class context: raise RuntimeError("cannot call borrow_from_handle on this context") self._ctx = ctx_handle - # print(f"borrowing ... 
new ctx handle = {ctx_handle} self={self}") def __repr__(self): return f"context(handle={self._ctx}, borrowed={self._borrowed})" From cfb2930ca2118d1ca80e5ead273bbb36423fa269 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 26 Nov 2025 08:28:05 +0100 Subject: [PATCH 213/221] no need for math.prod for such a simple thing --- python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx index afc2f431192..4938dc2b970 100644 --- a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx @@ -16,7 +16,6 @@ from cpython.pycapsule cimport ( ) from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t from libc.string cimport memset, memcpy -import math # for math.prod import numpy as np @@ -333,7 +332,10 @@ cdef class logical_data: out._dtype = np.dtype(dtype) out._shape = shape out._ndim = len(shape) - out._len = math.prod(shape) * out._dtype.itemsize + cdef size_t total_items = 1 + for dim in shape: + total_items *= dim + out._len = total_items * out._dtype.itemsize out._symbol = None # New object has no symbol initially out._is_token = False stf_logical_data_empty(ctx._ctx, out._len, &out._ld) From 130ee2a5cfec2469be494501d15610d0df6abdbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 26 Nov 2025 08:33:39 +0100 Subject: [PATCH 214/221] Simpler code to handle vector types --- .../cuda_cccl/cuda/stf/_stf_bindings_impl.pyx | 27 +++++++------------ 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx index 4938dc2b970..31c499ecbe4 100644 --- a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx @@ -194,28 +194,21 @@ cdef class logical_data: original_shape = cai['shape'] typestr = cai['typestr'] - # Handle vector types automatically (e.g., wp.vec2, wp.vec3) - # STF treats these as flat scalar arrays with an additional dimension - if typestr.startswith('|V'): # Vector type (e.g., '|V8' for vec2, '|V12' for vec3) - vector_size = int(typestr[2:]) # Extract size from '|V8' -> 8 bytes - - if vector_size == 8: # vec2 (2 * 4 bytes float32) - self._shape = original_shape + (2,) - self._dtype = np.dtype(' Date: Wed, 26 Nov 2025 08:38:23 +0100 Subject: [PATCH 215/221] fix grid dimension --- python/cuda_cccl/tests/stf/test_numba.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index c0ca54375e7..bd818e13894 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -55,27 +55,29 @@ def test_numba(): lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) + threads_per_block = 256 + blocks = (n + threads_per_block - 1) // threads_per_block + with ctx.task(lX.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.numba_arguments() - # dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) - scale[32, 64, nb_stream](2.0, dX) + scale[blocks, threads_per_block, nb_stream](2.0, dX) with ctx.task(lX.read(), lY.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) dY = t.get_arg_numba(1) - axpy[32, 64, nb_stream](2.0, dX, dY) + axpy[blocks, threads_per_block, 
nb_stream](2.0, dX, dY) with ctx.task(lX.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX, dZ = t.numba_arguments() - axpy[32, 64, nb_stream](2.0, dX, dZ) + axpy[blocks, threads_per_block, nb_stream](2.0, dX, dZ) with ctx.task(lY.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dY, dZ = t.numba_arguments() - axpy[32, 64, nb_stream](2.0, dY, dZ) + axpy[blocks, threads_per_block, nb_stream](2.0, dY, dZ) ctx.finalize() From b8c745ea1db7f842b479ad91263388d9488c7809 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 26 Nov 2025 08:39:59 +0100 Subject: [PATCH 216/221] Use from_dlpack --- python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py b/python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py index 945597fb360..0ec74de3ed7 100644 --- a/python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py +++ b/python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py @@ -18,7 +18,7 @@ def cai_to_torch(cai: dict): from numba import cuda as _cuda dev_array = _cuda.from_cuda_array_interface(cai, owner=None, sync=False) - return torch.utils.dlpack.from_dlpack(dev_array.to_dlpack()) + return torch.from_dlpack(dev_array) except Exception: pass @@ -31,7 +31,7 @@ def __init__(self, d): self.__cuda_array_interface__ = d cp_arr = cp.asarray(_cai_wrapper(cai)) - return torch.utils.dlpack.from_dlpack(cp_arr.toDlpack()) + return torch.from_dlpack(cp_arr) except Exception as e: raise RuntimeError( "Could not convert __cuda_array_interface__ to torch.Tensor. " From fb2a3baad84202a45010f06369b640029a8210b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 26 Nov 2025 10:09:01 +0100 Subject: [PATCH 217/221] Change the mock-up FHE toy example to have operations that are homomorphic, not XOR --- python/cuda_cccl/tests/stf/test_fhe.py | 98 ++++++++----------- .../cuda_cccl/tests/stf/test_fhe_decorator.py | 89 ++++++++--------- 2 files changed, 86 insertions(+), 101 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index e613d37ea76..b2bb9961b84 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# A toy example to illustrate how we can compose logical operations +# Toy Fully Homomorphic Encryption (FHE) example with addition encryption import numba from numba import cuda @@ -13,9 +13,9 @@ class Plaintext: - # Initialize from actual values, or from a logical data - def __init__(self, ctx, values=None, ld=None): + def __init__(self, ctx, values=None, ld=None, key=0x42): self.ctx = ctx + self.key = key if ld is not None: self.l = ld if values is not None: @@ -28,8 +28,8 @@ def set_symbol(self, symbol: str): self.symbol = symbol def encrypt(self) -> "Ciphertext": - encrypted = bytearray([c ^ 0x42 for c in self.values]) # toy XOR - return Ciphertext(self.ctx, values=encrypted) + encrypted = bytearray([(c + self.key) & 0xFF for c in self.values]) + return Ciphertext(self.ctx, values=encrypted, key=self.key) def print_values(self): with ctx.task( @@ -42,36 +42,30 @@ def print_values(self): @cuda.jit -def and_kernel(a, b, out): +def add_kernel(a, b, out): i = cuda.grid(1) if i < out.size: - out[i] = a[i] & b[i] + out[i] = (a[i] + b[i]) & 0xFF @cuda.jit -def or_kernel(a, b, out): +def sub_kernel(a, b, out): i = cuda.grid(1) if i < out.size: - out[i] = 
a[i] | b[i] + out[i] = (a[i] - b[i]) & 0xFF @cuda.jit -def not_kernel(a, out): +def sub_scalar_kernel(a, out, v): i = cuda.grid(1) if i < out.size: - out[i] = ~a[i] - - -@cuda.jit -def xor_kernel(a, out, v): - i = cuda.grid(1) - if i < out.size: - out[i] = a[i] ^ v + out[i] = (a[i] - v) & 0xFF class Ciphertext: - def __init__(self, ctx, values=None, ld=None): + def __init__(self, ctx, values=None, ld=None, key=0x42): self.ctx = ctx + self.key = key if ld is not None: self.l = ld if values is not None: @@ -79,72 +73,52 @@ def __init__(self, ctx, values=None, ld=None): self.l = ctx.logical_data(self.values) self.symbol = None - # ~ operator - def __invert__(self): - result = self.like_empty() - - with ctx.task(self.l.read(), result.l.write()) as t: - nb_stream = cuda.external_stream(t.stream_ptr()) - da, dresult = t.numba_arguments() - not_kernel[32, 16, nb_stream](da, dresult) - - return result - - # | operator - def __or__(self, other): + def __add__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result = self.like_empty() - with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) da, db, dresult = t.numba_arguments() - or_kernel[32, 16, nb_stream](da, db, dresult) - + add_kernel[32, 16, nb_stream](da, db, dresult) return result - # & operator - def __and__(self, other): + def __sub__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result = self.like_empty() - with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - nb_stream.synchronize() da, db, dresult = t.numba_arguments() - and_kernel[32, 16, nb_stream](da, db, dresult) - + sub_kernel[32, 16, nb_stream](da, db, dresult) return result def set_symbol(self, symbol: str): self.l.set_symbol(symbol) self.symbol = symbol - def decrypt(self): + def decrypt(self, num_operands=2): + """Decrypt by subtracting num_operands * key""" result = self.like_empty() - + total_key = (num_operands * self.key) & 0xFF with ctx.task(self.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) da, dresult = t.numba_arguments() - # reverse the toy XOR "encryption" - xor_kernel[32, 16, nb_stream](da, dresult, 0x42) - - return Plaintext(self.ctx, ld=result.l) + sub_scalar_kernel[32, 16, nb_stream](da, dresult, total_key) + return Plaintext(self.ctx, ld=result.l, key=self.key) def like_empty(self): return Ciphertext(self.ctx, ld=self.l.like_empty()) -def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: - return ~((eA | ~eB) & (~eA | eB)) +def circuit(a, b): + """Circuit: (A + B) + (B - A) = 2*B""" + return (a + b) + (b - a) def test_fhe(): - """Test Fully Homomorphic Encryption (FHE) example with logical operations.""" - global ctx # Make ctx accessible to the classes + """Test FHE using manual task creation with addition encryption.""" + global ctx ctx = stf.context(use_graph=False) vA = [3, 3, 2, 2, 17] @@ -155,13 +129,27 @@ def test_fhe(): pB = Plaintext(ctx, vB) pB.set_symbol("B") + expected = [circuit(a, b) & 0xFF for a, b in zip(vA, vB)] + eA = pA.encrypt() eB = pB.encrypt() - out = circuit(eA, eB) + encrypted_out = circuit(eA, eB) + decrypted_out = encrypted_out.decrypt(num_operands=2) + + with ctx.task( + stf.exec_place.host(), decrypted_out.l.read(stf.data_place.managed()) + ) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + nb_stream.synchronize() + hvalues = t.numba_arguments() + actual = [int(v) for v in hvalues] - 
out.decrypt().print_values() ctx.finalize() + assert actual == expected, ( + f"Decrypted result {actual} doesn't match expected {expected}" + ) + if __name__ == "__main__": test_fhe() diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index 969cbe08668..980f7735ddc 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# A toy example to illustrate how we can compose logical operations +# Toy Fully Homomorphic Encryption (FHE) example with addition encryption import numba from numba import cuda @@ -13,9 +13,9 @@ class Plaintext: - # Initialize from actual values, or from a logical data - def __init__(self, ctx, values=None, ld=None): + def __init__(self, ctx, values=None, ld=None, key=0x42): self.ctx = ctx + self.key = key if ld is not None: self.l = ld if values is not None: @@ -28,8 +28,8 @@ def set_symbol(self, symbol: str): self.symbol = symbol def encrypt(self) -> "Ciphertext": - encrypted = bytearray([c ^ 0x42 for c in self.values]) # toy XOR - return Ciphertext(self.ctx, values=encrypted) + encrypted = bytearray([(c + self.key) & 0xFF for c in self.values]) + return Ciphertext(self.ctx, values=encrypted, key=self.key) def print_values(self): with ctx.task( @@ -42,36 +42,30 @@ def print_values(self): @cudastf.jit -def and_kernel(a, b, out): +def add_kernel(a, b, out): i = cuda.grid(1) if i < out.size: - out[i] = a[i] & b[i] + out[i] = (a[i] + b[i]) & 0xFF @cudastf.jit -def or_kernel(a, b, out): +def sub_kernel(a, b, out): i = cuda.grid(1) if i < out.size: - out[i] = a[i] | b[i] + out[i] = (a[i] - b[i]) & 0xFF @cudastf.jit -def not_kernel(a, out): +def sub_scalar_kernel(a, out, v): i = cuda.grid(1) if i < out.size: - out[i] = ~a[i] - - -@cudastf.jit -def xor_kernel(a, out, v): - i = cuda.grid(1) - if i < out.size: - out[i] = a[i] ^ v + out[i] = (a[i] - v) & 0xFF class Ciphertext: - def __init__(self, ctx, values=None, ld=None): + def __init__(self, ctx, values=None, ld=None, key=0x42): self.ctx = ctx + self.key = key if ld is not None: self.l = ld if values is not None: @@ -79,54 +73,43 @@ def __init__(self, ctx, values=None, ld=None): self.l = ctx.logical_data(self.values) self.symbol = None - # ~ operator - def __invert__(self): - result = self.like_empty() - not_kernel[32, 16](self.l.read(), result.l.write()) - - return result - - # | operator - def __or__(self, other): + def __add__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result = self.like_empty() - or_kernel[32, 16](self.l.read(), other.l.read(), result.l.write()) - + add_kernel[32, 16](self.l.read(), other.l.read(), result.l.write()) return result - # & operator - def __and__(self, other): + def __sub__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result = self.like_empty() - and_kernel[32, 16](self.l.read(), other.l.read(), result.l.write()) - + sub_kernel[32, 16](self.l.read(), other.l.read(), result.l.write()) return result def set_symbol(self, symbol: str): self.l.set_symbol(symbol) self.symbol = symbol - def decrypt(self): + def decrypt(self, num_operands=2): + """Decrypt by subtracting num_operands * key""" result = self.like_empty() - xor_kernel[32, 16](self.l.read(), result.l.write(), 0x42) - - return Plaintext(self.ctx, ld=result.l) + total_key = (num_operands * self.key) & 0xFF + sub_scalar_kernel[32, 16](self.l.read(), result.l.write(), total_key) + 
return Plaintext(self.ctx, ld=result.l, key=self.key) def like_empty(self): return Ciphertext(self.ctx, ld=self.l.like_empty()) -def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: - return ~((eA | ~eB) & (~eA | eB)) +def circuit(a, b): + """Circuit: (A + B) + (B - A) = 2*B""" + return (a + b) + (b - a) def test_fhe_decorator(): - """Test Fully Homomorphic Encryption (FHE) example using @cudastf.jit decorators.""" - global ctx # Make ctx accessible to the classes + """Test FHE using @cudastf.jit decorators with addition encryption.""" + global ctx ctx = cudastf.context(use_graph=False) vA = [3, 3, 2, 2, 17] @@ -137,13 +120,27 @@ def test_fhe_decorator(): pB = Plaintext(ctx, vB) pB.set_symbol("B") + expected = [circuit(a, b) & 0xFF for a, b in zip(vA, vB)] + eA = pA.encrypt() eB = pB.encrypt() - out = circuit(eA, eB) + encrypted_out = circuit(eA, eB) + decrypted_out = encrypted_out.decrypt(num_operands=2) + + with ctx.task( + cudastf.exec_place.host(), decrypted_out.l.read(cudastf.data_place.managed()) + ) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + nb_stream.synchronize() + hvalues = t.numba_arguments() + actual = [int(v) for v in hvalues] - out.decrypt().print_values() ctx.finalize() + assert actual == expected, ( + f"Decrypted result {actual} doesn't match expected {expected}" + ) + if __name__ == "__main__": test_fhe_decorator() From da2e1aa7c6b5fd4a3b9ce6296648147df837c685 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 26 Nov 2025 10:16:22 +0100 Subject: [PATCH 218/221] Add some explanation for the use of a relaxed capture mode --- cudax/include/cuda/experimental/__stf/graph/graph_task.cuh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh index a070fc446ff..ee8c5435822 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh @@ -104,6 +104,8 @@ public: { // Select a stream from the pool capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; + // Use relaxed capture mode to allow capturing workloads that lazily initialize + // resources (e.g., set up memory pools) cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeRelaxed)); } @@ -365,6 +367,8 @@ public: capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; cudaGraph_t childGraph = nullptr; + // Use relaxed capture mode to allow capturing workloads that lazily initialize + // resources (e.g., set up memory pools) cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeRelaxed)); // Launch the user provided function @@ -625,6 +629,8 @@ public: cudaStream_t capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; cudaGraph_t childGraph = nullptr; + // Use relaxed capture mode to allow capturing workloads that lazily initialize + // resources (e.g., set up memory pools) cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeRelaxed)); // Launch the user provided function From 852b4005c9bbc006744a63e24d4687d6c55d4a2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 26 Nov 2025 12:52:34 +0100 Subject: [PATCH 219/221] cleaner pytorch adapter --- .../cuda/stf/_adapters/torch_bridge.py | 33 +++---------------- 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py 
b/python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py index 0ec74de3ed7..0e7686ea363 100644 --- a/python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py +++ b/python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py @@ -6,34 +6,11 @@ def cai_to_torch(cai: dict): Convert a __cuda_array_interface__ dict to a torch.Tensor without making PyTorch a hard dependency of the core extension. - Strategy (in order): - 1) Try Numba -> DLPack -> torch (fast & common). - 2) Try CuPy -> DLPack -> torch (common on CUDA setups). - 3) Otherwise, error with a clear message. + Uses Numba (a required dependency) to create a DeviceNDArray, + which torch.as_tensor can consume directly via __cuda_array_interface__. """ import torch + from numba import cuda as _cuda - # 1) Numba bridge - try: - from numba import cuda as _cuda - - dev_array = _cuda.from_cuda_array_interface(cai, owner=None, sync=False) - return torch.from_dlpack(dev_array) - except Exception: - pass - - # 2) CuPy bridge - try: - import cupy as cp - - class _cai_wrapper: - def __init__(self, d): - self.__cuda_array_interface__ = d - - cp_arr = cp.asarray(_cai_wrapper(cai)) - return torch.from_dlpack(cp_arr) - except Exception as e: - raise RuntimeError( - "Could not convert __cuda_array_interface__ to torch.Tensor. " - "Install numba or cupy (or expose a DLPack capsule natively)." - ) from e + dev_array = _cuda.from_cuda_array_interface(cai, owner=None, sync=False) + return torch.as_tensor(dev_array) From 09913dc3f8f706919d7c0dda8c66fd639ef3b8e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 26 Nov 2025 13:06:55 +0100 Subject: [PATCH 220/221] Code simplification --- .../cuda_cccl/cuda/stf/_stf_bindings_impl.pyx | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx index 31c499ecbe4..c5b1e5222b1 100644 --- a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx @@ -196,19 +196,15 @@ cdef class logical_data: # Handle vector types (e.g., wp.vec2, wp.vec3) # Use structured dtype from descr if available - if typestr.startswith('|V'): - # Vector/structured type - use descr field if available - if 'descr' in cai: - self._dtype = np.dtype(cai['descr']) - self._shape = original_shape - else: - # No descr field - treat as opaque bytes - self._dtype = np.dtype(typestr) - self._shape = original_shape + if typestr.startswith('|V') and 'descr' in cai: + # Vector/structured type - use descr field + self._dtype = np.dtype(cai['descr']) else: - # Regular scalar type + # Regular scalar type or vector without descr - use typestr self._dtype = np.dtype(typestr) - self._shape = original_shape + + # Shape is always the same regardless of type + self._shape = original_shape self._ndim = len(self._shape) @@ -624,7 +620,7 @@ cdef class context: stf_ctx_create(&self._ctx) cdef borrow_from_handle(self, stf_ctx_handle ctx_handle): - if not self._ctx == NULL: + if self._ctx != NULL: raise RuntimeError("context already initialized") if not self._borrowed: From 237b2c1380aae1e9d3cd25ca8e265f9d2652a2b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 16 Dec 2025 15:36:47 +0100 Subject: [PATCH 221/221] minor fixes --- python/cuda_cccl/CMakeLists.txt | 3 +-- python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 
ebabc475330..bcfb3b084f9 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -108,7 +108,6 @@ endif() set(CYTHON_FLAGS "-3 -M -t -w \"${cuda_cccl_SOURCE_DIR}\"") string(REGEX REPLACE " " ";" CYTHON_FLAGS_LIST "${CYTHON_FLAGS}") -# Only building STF bindings - parallel bindings not needed message(STATUS "Using Cython ${CYTHON_VERSION}") set(pyx_source_file "${cuda_cccl_SOURCE_DIR}/cuda/compute/_bindings_impl.pyx") @@ -199,5 +198,5 @@ set_target_properties( PROPERTIES INSTALL_RPATH "$ORIGIN/cccl" ) -install(TARGETS _stf_bindings_impl DESTINATION cuda/stf/${CUDA_VERSION_DIR}) install(TARGETS _bindings_impl DESTINATION cuda/compute/${CUDA_VERSION_DIR}) +install(TARGETS _stf_bindings_impl DESTINATION cuda/stf/${CUDA_VERSION_DIR}) diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx index c5b1e5222b1..12f8fba3114 100644 --- a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx @@ -506,7 +506,7 @@ cdef class task: def get_arg_cai(self, index): ptr = self.get_arg(index) - return stf_arg_cai(ptr, self._lds_args[index].shape, self._lds_args[index].dtype, stream=0).__cuda_array_interface__ + return stf_arg_cai(ptr, self._lds_args[index].shape, self._lds_args[index].dtype, stream=self.stream_ptr()).__cuda_array_interface__ def get_arg_numba(self, index): cai = self.get_arg_cai(index)
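
With get_arg_cai() now tagging the __cuda_array_interface__ dict with the task's own stream, the simplified torch bridge can consume task arguments directly. The sketch below is illustrative only: it assumes torch and numba are installed and that an STF task `t` is already open; `task_arg_to_torch` is a hypothetical helper written for this sketch, while the two conversion calls are the same ones used in torch_bridge.py after this series.

    import torch
    from numba import cuda

    def task_arg_to_torch(t, index):
        # __cuda_array_interface__ dict produced by the Cython binding,
        # now tagged with the task's stream
        cai = t.get_arg_cai(index)
        # Zero-copy view over STF-managed device memory; the resulting
        # tensor should not be used after the task has ended.
        dev_array = cuda.from_cuda_array_interface(cai, owner=None, sync=False)
        return torch.as_tensor(dev_array)

Inside a `with ctx.task(lX.rw()) as t:` block this yields a CUDA tensor aliasing the logical data instance, without copying it back to the host.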
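
The grid-dimension fix in test_numba.py replaces the hard-coded 32 x 64 launch configuration with a grid derived from the problem size. Below is a condensed, self-contained version of that pattern, assuming the cuda.stf bindings and numba are installed as in the tests; the axpy kernel and the context/task/numba_arguments calls mirror the test, while the array size is arbitrary.

    import numpy as np
    from numba import cuda
    import cuda.stf as stf

    @cuda.jit
    def axpy(alpha, x, y):
        i = cuda.grid(1)
        if i < y.size:
            y[i] += alpha * x[i]

    n = 1 << 20
    X = np.ones(n, np.float32)
    Y = np.ones(n, np.float32)

    ctx = stf.context(use_graph=False)
    lX = ctx.logical_data(X)
    lY = ctx.logical_data(Y)

    # Size the grid from n instead of hard-coding the block count
    threads_per_block = 256
    blocks = (n + threads_per_block - 1) // threads_per_block

    with ctx.task(lX.read(), lY.rw()) as t:
        nb_stream = cuda.external_stream(t.stream_ptr())
        dX, dY = t.numba_arguments()
        axpy[blocks, threads_per_block, nb_stream](2.0, dX, dY)

    ctx.finalize()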
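
The reworked FHE toy example relies on additive masking modulo 256 commuting with the circuit's additions and subtractions: encryption adds the key once per operand, (eA + eB) carries two copies of the key while (eB - eA) carries none, so the ciphertext result of (A + B) + (B - A) = 2*B is recovered by subtracting num_operands * key. The host-only sketch below checks that identity with plain Python byte arithmetic; enc, dec and circuit_ct are local names for this sketch, and the actual tests perform the same operations inside numba CUDA kernels.

    KEY = 0x42  # same toy key as Plaintext/Ciphertext in the tests

    def enc(pt, key=KEY):
        return [(c + key) & 0xFF for c in pt]

    def dec(ct, num_operands, key=KEY):
        # The circuit output carries num_operands copies of the key
        return [(c - num_operands * key) & 0xFF for c in ct]

    def circuit_ct(ea, eb):
        # (A + B) + (B - A), element-wise on ciphertext bytes
        add = [(x + y) & 0xFF for x, y in zip(ea, eb)]
        sub = [(y - x) & 0xFF for x, y in zip(ea, eb)]
        return [(x + y) & 0xFF for x, y in zip(add, sub)]

    vA = [3, 3, 2, 2, 17]
    vB = [17, 3, 2, 2, 17]
    expected = [(2 * b) & 0xFF for b in vB]   # (a + b) + (b - a) == 2*b
    assert dec(circuit_ct(enc(vA), enc(vB)), num_operands=2) == expected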