From f2a377757631e3c02ce507abb1ad3e44e0d566e5 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Jul 2025 13:54:51 +0200 Subject: [PATCH 001/221] Allow CUfunction (driver API) in the cuda_kernel(_chain) API --- .../__stf/internal/cuda_kernel_scope.cuh | 130 ++++++++++++++---- 1 file changed, 100 insertions(+), 30 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index 79beb07dc03..3e5018062cb 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -47,7 +47,7 @@ struct cuda_kernel_desc { template cuda_kernel_desc(Fun func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, Args... args) - : func((const void*) func) + : func_variant(store_func(mv(func))) , gridDim(gridDim_) , blockDim(blockDim_) , sharedMem(sharedMem_) @@ -57,16 +57,19 @@ struct cuda_kernel_desc // We first copy all arguments into a tuple because the kernel // implementation needs pointers to the argument, so we cannot use // directly those passed in the pack of arguments - auto arg_tuple = ::std::make_shared(std::forward(args)...); + auto arg_tuple = ::std::make_shared(mv(args)...); - // Ensure we are packing arguments of the proper types to call func - static_assert(::std::is_invocable_v); + // Ensure we are packing arguments of the proper types to call func (only + // valid with the runtime API) + if constexpr (!::std::is_same_v) { + static_assert(::std::is_invocable_v); + } // Get the address of every tuple entry ::std::apply( [this](auto&... elems) { // Push back the addresses of each tuple element into the args vector - ((args_ptr.push_back(static_cast(&elems))), ...); + ((args_ptr.push_back(&elems)), ...); }, *arg_tuple); @@ -74,17 +77,99 @@ struct cuda_kernel_desc arg_tuple_type_erased = mv(arg_tuple); } - /* __global__ function */ - const void* func; + /* CUfunction (CUDA driver API) or __global__ function (CUDA runtime API) */ + using func_variant_t = ::std::variant; + func_variant_t func_variant; dim3 gridDim; dim3 blockDim; - size_t sharedMem; + size_t sharedMem = 0; // Vector of pointers to the arg_tuple which saves arguments in a typed-erased way - ::std::vector args_ptr; + // Mutable so that launch can be const + mutable ::std::vector args_ptr; + + // Helper to launch the kernel using CUDA stream based API + void launch(cudaStream_t stream) const + { + ::std::visit( + [&](auto&& kernel_func) { + using T = ::std::decay_t; + if constexpr (::std::is_same_v) + { + cuda_safe_call(cudaLaunchKernel(kernel_func, gridDim, blockDim, args_ptr.data(), sharedMem, stream)); + } + else + { + static_assert(::std::is_same_v, "Unsupported function type in func_variant"); + cuda_safe_call(cuLaunchKernel( + kernel_func, + gridDim.x, + gridDim.y, + gridDim.z, + blockDim.x, + blockDim.y, + blockDim.z, + sharedMem, + stream, + args_ptr.data(), + nullptr)); + } + }, + func_variant); + } + + void launch_in_graph(cudaGraphNode_t& node, cudaGraph_t& graph) const + { + ::std::visit( + [&](auto&& kernel_func) { + using T = ::std::decay_t; + + if constexpr (::std::is_same_v) + { + CUDA_KERNEL_NODE_PARAMS params{ + .func = kernel_func, + .gridDimX = gridDim.x, + .gridDimY = gridDim.y, + .gridDimZ = gridDim.z, + .blockDimX = blockDim.x, + .blockDimY = blockDim.y, + .blockDimZ = blockDim.z, + .sharedMemBytes = static_cast(sharedMem), + .kernelParams = const_cast(args_ptr.data()), + .extra = nullptr, + .kern = 
nullptr, + .ctx = nullptr}; + cuda_safe_call(cuGraphAddKernelNode(&node, graph, nullptr, 0, ¶ms)); + } + else + { + static_assert(::std::is_same_v, "Unsupported kernel function type"); + cudaKernelNodeParams params{ + .func = const_cast(kernel_func), + .gridDim = gridDim, + .blockDim = blockDim, + .sharedMemBytes = static_cast(sharedMem), + .kernelParams = args_ptr.data(), + .extra = nullptr}; + cuda_safe_call(cudaGraphAddKernelNode(&node, graph, nullptr, 0, ¶ms)); + } + }, + func_variant); + } private: ::std::shared_ptr arg_tuple_type_erased; + + static func_variant_t store_func(CUfunction f) + { + return f; + } + + template + static func_variant_t store_func(T* f) + { + return reinterpret_cast(f); + } }; namespace reserved @@ -252,7 +337,7 @@ public: // graph, or we rely on a child graph if (res.size() == 1) { - insert_one_kernel(res[0], t.get_node(), g); + res[0].launch_in_graph(t.get_node(), g); } else { @@ -262,7 +347,7 @@ public: // Create a chain of kernels for (size_t i = 0; i < res.size(); i++) { - insert_one_kernel(res[i], chain[i], g); + res[i].launch_in_graph(chain[i], g); if (i > 0) { cuda_safe_call(cudaGraphAddDependencies(g, &chain[i - 1], &chain[i], 1)); @@ -275,8 +360,7 @@ public: // Rely on stream semantic to have a dependency between the kernels for (auto& k : res) { - cuda_safe_call( - cudaLaunchKernel(k.func, k.gridDim, k.blockDim, k.args_ptr.data(), k.sharedMem, t.get_stream())); + k.launch(t.get_stream()); } } } @@ -287,35 +371,21 @@ public: // descriptor, not a vector static_assert(!chained); - cuda_kernel_desc res = ::std::apply(f, deps.instance(t)); + cuda_kernel_desc res = ::cuda::std::apply(f, deps.instance(t)); if constexpr (::std::is_same_v) { auto lock = t.lock_ctx_graph(); - insert_one_kernel(res, t.get_node(), t.get_ctx_graph()); + res.launch_in_graph(t.get_node(), t.get_ctx_graph()); } else { - cuda_safe_call( - cudaLaunchKernel(res.func, res.gridDim, res.blockDim, res.args_ptr.data(), res.sharedMem, t.get_stream())); + res.launch(t.get_stream()); } } } private: - /* Add a kernel to a CUDA graph given its description */ - auto insert_one_kernel(cuda_kernel_desc& k, cudaGraphNode_t& n, cudaGraph_t& g) const - { - cudaKernelNodeParams kconfig; - kconfig.blockDim = k.blockDim; - kconfig.extra = nullptr; - kconfig.func = const_cast(k.func); - kconfig.gridDim = k.gridDim; - kconfig.kernelParams = k.args_ptr.data(); - kconfig.sharedMemBytes = k.sharedMem; - cuda_safe_call(cudaGraphAddKernelNode(&n, g, nullptr, 0, &kconfig)); - } - ::std::string symbol; Ctx& ctx; // Statically defined deps From b3304a1865338f5ceccb0c78b03b078853bfe474 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Jul 2025 13:57:23 +0200 Subject: [PATCH 002/221] clang-format --- .../cuda/experimental/__stf/internal/cuda_kernel_scope.cuh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index 3e5018062cb..9abfeb47e31 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -61,8 +61,9 @@ struct cuda_kernel_desc // Ensure we are packing arguments of the proper types to call func (only // valid with the runtime API) - if constexpr (!::std::is_same_v) { - static_assert(::std::is_invocable_v); + if constexpr (!::std::is_same_v) + { + static_assert(::std::is_invocable_v); } // Get the address of every tuple entry From 
8651e9fa530b4e3a7e8afccde2a4127a7385a73e Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Jul 2025 14:15:42 +0200 Subject: [PATCH 003/221] We have a std::tuple not a cuda::std::tuple (yet) --- .../cuda/experimental/__stf/internal/cuda_kernel_scope.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index 9abfeb47e31..7bbe6e12228 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -372,7 +372,7 @@ public: // descriptor, not a vector static_assert(!chained); - cuda_kernel_desc res = ::cuda::std::apply(f, deps.instance(t)); + cuda_kernel_desc res = ::std::apply(f, deps.instance(t)); if constexpr (::std::is_same_v) { From fd42c70cdfd3ec6019c959dfe7747130a0bf36ad Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Jul 2025 16:07:58 +0200 Subject: [PATCH 004/221] If CUDASTF_CUDA_KERNEL_DEBUG is set, we display the number of registers used by kernels --- .../__stf/internal/cuda_kernel_scope.cuh | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index 7bbe6e12228..9a8b26804e4 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -158,6 +158,27 @@ struct cuda_kernel_desc func_variant); } + // Utility to query the number of registers used by this kernel + int get_num_registers() const + { + return ::std::visit( + [](auto&& kernel_func) { + using T = ::std::decay_t; + if constexpr (::std::is_same_v) + { + return cuda_try(CU_FUNC_ATTRIBUTE_NUM_REGS, kernel_func); + } + else + { + static_assert(::std::is_same_v, "Unsupported kernel function type"); + cudaFuncAttributes func_attr{}; + cuda_safe_call(cudaFuncGetAttributes(&func_attr, kernel_func)); + return func_attr.numRegs; + } + }, + func_variant); + } + private: ::std::shared_ptr arg_tuple_type_erased; @@ -323,12 +344,28 @@ public: dot.template add_vertex(t); } + // If CUDASTF_CUDA_KERNEL_DEBUG is set, we display the number of registers + // used by the kernel(s) + static bool display_register_cnt = [] { + const char* env = ::std::getenv("CUDASTF_CUDA_KERNEL_DEBUG"); + return env && (atoi(env) != 0); + }(); + // When chained is enable, we expect a vector of kernel description which should be executed one after the other if constexpr (chained) { ::std::vector res = ::std::apply(f, deps.instance(t)); assert(!res.empty()); + if (display_register_cnt) + { + fprintf(stderr, "cuda_kernel_chain (%s):\n", symbol.c_str()); + for (size_t i = 0; i < res.size(); i++) + { + fprintf(stderr, "- kernel %ld uses %d register(s)\n", i, res[i].get_num_registers()); + } + } + if constexpr (::std::is_same_v) { auto lock = t.lock_ctx_graph(); @@ -373,6 +410,10 @@ public: static_assert(!chained); cuda_kernel_desc res = ::std::apply(f, deps.instance(t)); + if (display_register_cnt) + { + fprintf(stderr, "cuda_kernel (%s): uses %d register(s)\n", symbol.c_str(), res.get_num_registers()); + } if constexpr (::std::is_same_v) { From f36fcd07fa726c5150ab62528cbe145a3627c824 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 18 Jul 2025 16:27:53 +0200 Subject: [PATCH 005/221] Support CUkernel in addition to CUfunction --- 
.../__stf/internal/cuda_kernel_scope.cuh | 54 +++++++++++++++---- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index 9a8b26804e4..1b3dfcd322a 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -37,6 +37,28 @@ namespace cuda::experimental::stf class graph_ctx; class stream_ctx; +namespace reserved +{ + +template +struct is_function_or_kernel : ::std::false_type +{}; + +template <> +struct is_function_or_kernel : ::std::true_type +{}; + +#if CUDA_VERSION >= 12000 +template <> +struct is_function_or_kernel : ::std::true_type +{}; +#endif + +template +inline constexpr bool is_function_or_kernel_v = is_function_or_kernel::value; + +} // end namespace reserved + /** * @brief Description of a CUDA kernel * @@ -61,7 +83,7 @@ struct cuda_kernel_desc // Ensure we are packing arguments of the proper types to call func (only // valid with the runtime API) - if constexpr (!::std::is_same_v) + if constexpr (!reserved::is_function_or_kernel_v) { static_assert(::std::is_invocable_v); } @@ -78,8 +100,13 @@ struct cuda_kernel_desc arg_tuple_type_erased = mv(arg_tuple); } - /* CUfunction (CUDA driver API) or __global__ function (CUDA runtime API) */ - using func_variant_t = ::std::variant; + /* CUfunction/CUkernel (CUDA driver API) or __global__ function (CUDA runtime API) */ + using func_variant_t = + ::std::variant= 12000 + CUkernel, +#endif + const void*>; func_variant_t func_variant; dim3 gridDim; dim3 blockDim; @@ -101,9 +128,11 @@ struct cuda_kernel_desc } else { - static_assert(::std::is_same_v, "Unsupported function type in func_variant"); + static_assert(reserved::is_function_or_kernel_v, "Unsupported function type in func_variant"); + + // If this is a CUkernel, the cast to a CUfunction is sufficient cuda_safe_call(cuLaunchKernel( - kernel_func, + (CUfunction) kernel_func, gridDim.x, gridDim.y, gridDim.z, @@ -125,10 +154,10 @@ struct cuda_kernel_desc [&](auto&& kernel_func) { using T = ::std::decay_t; - if constexpr (::std::is_same_v) + if constexpr (reserved::is_function_or_kernel_v) { CUDA_KERNEL_NODE_PARAMS params{ - .func = kernel_func, + .func = (CUfunction) kernel_func, .gridDimX = gridDim.x, .gridDimY = gridDim.y, .gridDimZ = gridDim.z, @@ -164,9 +193,9 @@ struct cuda_kernel_desc return ::std::visit( [](auto&& kernel_func) { using T = ::std::decay_t; - if constexpr (::std::is_same_v) + if constexpr (reserved::is_function_or_kernel_v) { - return cuda_try(CU_FUNC_ATTRIBUTE_NUM_REGS, kernel_func); + return cuda_try(CU_FUNC_ATTRIBUTE_NUM_REGS, (CUfunction) kernel_func); } else { @@ -187,6 +216,13 @@ private: return f; } +#if CUDA_VERSION >= 12000 + static func_variant_t store_func(CUkernel k) + { + return k; + } +#endif + template static func_variant_t store_func(T* f) { From b2002d865663a07322ef6954ed9b2e9cfbc9dbe6 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 18 Jul 2025 16:54:52 +0200 Subject: [PATCH 006/221] Add a test with CUfunction and CUkernel --- cudax/test/stf/CMakeLists.txt | 1 + .../test/stf/examples/cuda_kernels_driver.cu | 90 +++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 cudax/test/stf/examples/cuda_kernels_driver.cu diff --git a/cudax/test/stf/CMakeLists.txt b/cudax/test/stf/CMakeLists.txt index 75b271b5865..e594f686e05 100644 --- a/cudax/test/stf/CMakeLists.txt +++ 
b/cudax/test/stf/CMakeLists.txt @@ -20,6 +20,7 @@ set(stf_test_sources error_checks/non_managed_data.cu error_checks/uninitialized_data.cu error_checks/write_frozen.cu + examples/cuda_kernels_driver.cu examples/05-stencil-no-copy.cu examples/05-stencil-places.cu examples/05-stencil.cu diff --git a/cudax/test/stf/examples/cuda_kernels_driver.cu b/cudax/test/stf/examples/cuda_kernels_driver.cu new file mode 100644 index 00000000000..be7ae30f629 --- /dev/null +++ b/cudax/test/stf/examples/cuda_kernels_driver.cu @@ -0,0 +1,90 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Test that the cuda_kernel construct works with global kernels, CUfunction and CUkernel entries. + * + */ + +#include + +using namespace cuda::experimental::stf; + +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < x.size(); i += nthreads) + { + y(i) += a * x(i); + } +} + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +int main() +{ + context ctx; + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + + double alpha = 3.14; + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + CUfunction axpy_fun; + cuda_safe_call(cudaGetFuncBySymbol(&axpy_fun, (void *)axpy)); + + // TODO ifdef + CUkernel axpy_kernel; + cuda_safe_call(cudaGetKernel(&axpy_kernel, (void *)axpy)); + + // runtime global kernel + ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) { + // axpy<<<16, 128, 0, ...>>>(alpha, dX, dY) + return cuda_kernel_desc{axpy, 16, 128, 0, alpha, dX, dY}; + }; + + // CUfunction driver API + ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) { + return cuda_kernel_desc{axpy_fun, 16, 128, 0, alpha, dX, dY}; + }; + + // CUkernel driver API + ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) { + return cuda_kernel_desc{axpy_kernel, 16, 128, 0, alpha, dX, dY}; + }; + + ctx.finalize(); + + for (size_t i = 0; i < N; i++) + { + assert(fabs(Y[i] - (Y0(i) + 3.0*alpha * X0(i))) < 0.0001); + assert(fabs(X[i] - X0(i)) < 0.0001); + } +} From 22c758a4ef0c2561c87553b17a7c236fedca4b03 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 18 Jul 2025 17:00:55 +0200 Subject: [PATCH 007/221] Check whether CUkernel is supported --- cudax/test/stf/examples/cuda_kernels_driver.cu | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/cudax/test/stf/examples/cuda_kernels_driver.cu b/cudax/test/stf/examples/cuda_kernels_driver.cu index be7ae30f629..ec2ff2ee681 100644 --- a/cudax/test/stf/examples/cuda_kernels_driver.cu +++ b/cudax/test/stf/examples/cuda_kernels_driver.cu @@ -52,39 +52,47 @@ int main() Y[i] = Y0(i); } + // Number of times we have applied the axpy kernel + int num_axpy = 0; + double alpha = 3.14; auto lX = ctx.logical_data(X); auto lY = ctx.logical_data(Y); CUfunction axpy_fun; - cuda_safe_call(cudaGetFuncBySymbol(&axpy_fun, (void *)axpy)); + 
cuda_safe_call(cudaGetFuncBySymbol(&axpy_fun, (void*) axpy)); // TODO ifdef CUkernel axpy_kernel; - cuda_safe_call(cudaGetKernel(&axpy_kernel, (void *)axpy)); + cuda_safe_call(cudaGetKernel(&axpy_kernel, (void*) axpy)); // runtime global kernel ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) { // axpy<<<16, 128, 0, ...>>>(alpha, dX, dY) return cuda_kernel_desc{axpy, 16, 128, 0, alpha, dX, dY}; }; + num_axpy++; // CUfunction driver API ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) { return cuda_kernel_desc{axpy_fun, 16, 128, 0, alpha, dX, dY}; }; + num_axpy++; +#if CUDA_VERSION >= 12000 // CUkernel driver API ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) { return cuda_kernel_desc{axpy_kernel, 16, 128, 0, alpha, dX, dY}; }; + num_axpy++; +#endif ctx.finalize(); for (size_t i = 0; i < N; i++) { - assert(fabs(Y[i] - (Y0(i) + 3.0*alpha * X0(i))) < 0.0001); + assert(fabs(Y[i] - (Y0(i) + num_axpy * alpha * X0(i))) < 0.0001); assert(fabs(X[i] - X0(i)) < 0.0001); } } From 02ded8b60d01357da12444a43be7b7e4f5324352 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 18 Jul 2025 18:03:13 +0200 Subject: [PATCH 008/221] use _CCCL_ASSERT instead of assert to avoid an unused variable error --- cudax/test/stf/examples/cuda_kernels_driver.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cudax/test/stf/examples/cuda_kernels_driver.cu b/cudax/test/stf/examples/cuda_kernels_driver.cu index ec2ff2ee681..4100541f482 100644 --- a/cudax/test/stf/examples/cuda_kernels_driver.cu +++ b/cudax/test/stf/examples/cuda_kernels_driver.cu @@ -92,7 +92,7 @@ int main() for (size_t i = 0; i < N; i++) { - assert(fabs(Y[i] - (Y0(i) + num_axpy * alpha * X0(i))) < 0.0001); - assert(fabs(X[i] - X0(i)) < 0.0001); + _CCCL_ASSERT(fabs(Y[i] - (Y0(i) + num_axpy * alpha * X0(i))) < 0.0001, "Invalid result"); + _CCCL_ASSERT(fabs(X[i] - X0(i)) < 0.0001, "Invalid result"); } } From ebeb7031cdba7498a3e1df467accd8e115998dfb Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 18 Jul 2025 18:32:16 +0200 Subject: [PATCH 009/221] cudaGetKernel was added in CUDA 12.1 --- cudax/test/stf/examples/cuda_kernels_driver.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cudax/test/stf/examples/cuda_kernels_driver.cu b/cudax/test/stf/examples/cuda_kernels_driver.cu index 4100541f482..76ae872f71b 100644 --- a/cudax/test/stf/examples/cuda_kernels_driver.cu +++ b/cudax/test/stf/examples/cuda_kernels_driver.cu @@ -60,13 +60,6 @@ int main() auto lX = ctx.logical_data(X); auto lY = ctx.logical_data(Y); - CUfunction axpy_fun; - cuda_safe_call(cudaGetFuncBySymbol(&axpy_fun, (void*) axpy)); - - // TODO ifdef - CUkernel axpy_kernel; - cuda_safe_call(cudaGetKernel(&axpy_kernel, (void*) axpy)); - // runtime global kernel ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) { // axpy<<<16, 128, 0, ...>>>(alpha, dX, dY) @@ -74,14 +67,21 @@ int main() }; num_axpy++; + // CUfunction driver API + CUfunction axpy_fun; + cuda_safe_call(cudaGetFuncBySymbol(&axpy_fun, (void*) axpy)); + ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) { return cuda_kernel_desc{axpy_fun, 16, 128, 0, alpha, dX, dY}; }; num_axpy++; -#if CUDA_VERSION >= 12000 +#if CUDA_VERSION >= 12010 // CUkernel driver API + CUkernel axpy_kernel; + cuda_safe_call(cudaGetKernel(&axpy_kernel, (void*) axpy)); + ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) { return cuda_kernel_desc{axpy_kernel, 16, 128, 0, alpha, dX, dY}; }; From 2f298b7beb6fc1aecb5e0ee9401ae35c13ecfe36 Mon 
Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 18 Jul 2025 18:33:44 +0200 Subject: [PATCH 010/221] clang-format --- cudax/test/stf/examples/cuda_kernels_driver.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/cudax/test/stf/examples/cuda_kernels_driver.cu b/cudax/test/stf/examples/cuda_kernels_driver.cu index 76ae872f71b..6443cbdd867 100644 --- a/cudax/test/stf/examples/cuda_kernels_driver.cu +++ b/cudax/test/stf/examples/cuda_kernels_driver.cu @@ -67,7 +67,6 @@ int main() }; num_axpy++; - // CUfunction driver API CUfunction axpy_fun; cuda_safe_call(cudaGetFuncBySymbol(&axpy_fun, (void*) axpy)); From 3d9b7a5bafecafe0cee41a1c282fc76461798d85 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 20 Jul 2025 08:42:25 +0200 Subject: [PATCH 011/221] Extract the start and end phase of the ->* operator --- .../__stf/internal/cuda_kernel_scope.cuh | 132 +++++++++++------- 1 file changed, 83 insertions(+), 49 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index 1b3dfcd322a..d34955ab50c 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -209,6 +209,8 @@ struct cuda_kernel_desc } private: + // This type-erased smart pointer keeps the argument tuple valid until the + // object is destroyed, so that the pointer to these arguments remain valid ::std::shared_ptr arg_tuple_type_erased; static func_variant_t store_func(CUfunction f) @@ -292,17 +294,13 @@ public: return *this; } - /** - * @brief Takes a lambda function and executes it on the host in a graph callback node. - * - * @tparam Fun type of lambda function - * @param f Lambda function to execute - */ - template - void operator->*(Fun&& f) + auto& start() { // If a place is specified, use it - auto t = e_place ? ctx.task(e_place.value()) : ctx.task(); + support_task = e_place ? ctx.task(e_place.value()) : ctx.task(); + + // Short-hand for more readable code + auto& t = *support_task; // So that we can use get to retrieve dynamic dependencies untyped_t = t; @@ -321,53 +319,18 @@ public: t.set_symbol(symbol); } - auto& dot = *ctx.get_dot(); - auto& statistics = reserved::task_statistics::instance(); - - cudaEvent_t start_event, end_event; - const bool record_time = t.schedule_task() || statistics.is_calibrating_to_file(); + // Do we need to measure the duration of the kernel(s) ? 
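+    // (timings are recorded when the task is actually scheduled, or when we are calibrating statistics to a file)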
+ auto& statistics = reserved::task_statistics::instance(); + record_time = t.schedule_task() || statistics.is_calibrating_to_file(); + record_time_device = -1; t.start(); - int device = -1; - - SCOPE(exit) - { - t.end_uncleared(); - - if constexpr (::std::is_same_v) - { - if (record_time) - { - cuda_safe_call(cudaEventRecord(end_event, t.get_stream())); - cuda_safe_call(cudaEventSynchronize(end_event)); - - float milliseconds = 0; - cuda_safe_call(cudaEventElapsedTime(&milliseconds, start_event, end_event)); - - if (dot.is_tracing()) - { - dot.template add_vertex_timing(t, milliseconds, device); - } - - if (statistics.is_calibrating()) - { - statistics.log_task_time(t, milliseconds); - } - } - } - - t.clear(); - - // Now that we have executed 'f', we do not need to access it anymore - untyped_t.reset(); - }; - if constexpr (::std::is_same_v) { if (record_time) { - cuda_safe_call(cudaGetDevice(&device)); // We will use this to force it during the next run + cuda_safe_call(cudaGetDevice(&record_time_device)); // We will use this to force it during the next run // Events must be created here to avoid issues with multi-gpu cuda_safe_call(cudaEventCreate(&start_event)); cuda_safe_call(cudaEventCreate(&end_event)); @@ -375,11 +338,71 @@ public: } } + auto& dot = *ctx.get_dot(); if (dot.is_tracing()) { dot.template add_vertex(t); } + return *this; + } + + auto& end() + { + auto& t = *support_task; + + // We need to access the task structures (eg. to get the stream) so we do + // not clear all its resources yet. + t.end_uncleared(); + + if constexpr (::std::is_same_v) + { + if (record_time) + { + cuda_safe_call(cudaEventRecord(end_event, t.get_stream())); + cuda_safe_call(cudaEventSynchronize(end_event)); + + float milliseconds = 0; + cuda_safe_call(cudaEventElapsedTime(&milliseconds, start_event, end_event)); + + auto& dot = *ctx.get_dot(); + if (dot.is_tracing()) + { + dot.template add_vertex_timing(t, milliseconds, record_time_device); + } + + auto& statistics = reserved::task_statistics::instance(); + if (statistics.is_calibrating()) + { + statistics.log_task_time(t, milliseconds); + } + } + } + + t.clear(); + + // Now that we have executed 'f', we do not need to access it anymore + untyped_t.reset(); + + return *this; + } + + /** + * @brief Takes a lambda function and executes it on the host in a graph callback node. + * + * @tparam Fun type of lambda function + * @param f Lambda function to execute + */ + template + void operator->*(Fun&& f) + { + start(); + + SCOPE(exit) + { + end(); + }; + // If CUDASTF_CUDA_KERNEL_DEBUG is set, we display the number of registers // used by the kernel(s) static bool display_register_cnt = [] { @@ -387,6 +410,8 @@ public: return env && (atoi(env) != 0); }(); + auto& t = *support_task; + // When chained is enable, we expect a vector of kernel description which should be executed one after the other if constexpr (chained) { @@ -469,12 +494,21 @@ private: // Statically defined deps task_dep_vector deps; + // To store a task that implements cuda_kernel(_chain) + using underlying_task_type = decltype(::std::declval().task()); + ::std::optional support_task; + // Dependencies added with add_deps ::std::vector dynamic_deps; // Used to retrieve deps with t.get<>(...) ::std::optional untyped_t; ::std::optional e_place; + + // Are we making some measurements ? 
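+  // If so, start/end CUDA events are recorded around the kernel submission and the elapsed time is logged in end()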
+ bool record_time; + int record_time_device; + cudaEvent_t start_event, end_event; }; } // end namespace reserved From ac92c82e33e13fa1887317d99a86700338c2a847 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 20 Jul 2025 08:54:51 +0200 Subject: [PATCH 012/221] There is no need to store untyped_t as we now store the task with its type --- .../__stf/internal/cuda_kernel_scope.cuh | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index d34955ab50c..7f9267b0aa3 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -275,8 +275,8 @@ public: template decltype(auto) get(size_t submitted_index) const { - _CCCL_ASSERT(untyped_t.has_value(), "uninitialized task"); - return untyped_t->template get(submitted_index); + _CCCL_ASSERT(support_task.has_value(), "uninitialized task"); + return support_task->template get(submitted_index); } /** @@ -299,12 +299,8 @@ public: // If a place is specified, use it support_task = e_place ? ctx.task(e_place.value()) : ctx.task(); - // Short-hand for more readable code auto& t = *support_task; - // So that we can use get to retrieve dynamic dependencies - untyped_t = t; - t.add_deps(deps); // Append all dynamic deps @@ -381,8 +377,9 @@ public: t.clear(); - // Now that we have executed 'f', we do not need to access it anymore - untyped_t.reset(); + // Do release to the task structure as we don't need to reference it when + // we have called end() + support_task.reset(); return *this; } @@ -494,14 +491,14 @@ private: // Statically defined deps task_dep_vector deps; - // To store a task that implements cuda_kernel(_chain) + // To store a task that implements cuda_kernel(_chain). Note that we do not + // store the task with Deps... but a "dynamic" task where all dependencies + // are added using add_deps. using underlying_task_type = decltype(::std::declval().task()); ::std::optional support_task; // Dependencies added with add_deps ::std::vector dynamic_deps; - // Used to retrieve deps with t.get<>(...) - ::std::optional untyped_t; ::std::optional e_place; From 5017413eff7de4a40a3956c8b97fdb41d4488d95 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 20 Jul 2025 09:38:22 +0200 Subject: [PATCH 013/221] Implement the low level interface for cuda_kernel(_chain) with a way to avoid using the ->* operator --- .../experimental/__stf/internal/context.cuh | 25 ++++ .../__stf/internal/cuda_kernel_scope.cuh | 139 +++++++++++------- cudax/test/stf/CMakeLists.txt | 1 + .../cuda_kernel_chain-add_deps_low_level.cu | 86 +++++++++++ 4 files changed, 195 insertions(+), 56 deletions(-) create mode 100644 cudax/test/stf/interface/cuda_kernel_chain-add_deps_low_level.cu diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index e04b57ade57..00362b67a1d 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -127,6 +127,15 @@ class context return *this; } + template + auto& add_kernel_desc(Args&&... 
args) + { + payload->*[&](auto& self) { + self.add_kernel_desc(::std::forward(args)...); + }; + return *this; + } + template decltype(auto) get(size_t submitted_index) const { @@ -135,6 +144,22 @@ class context }; } + auto& start() + { + payload->*[&](auto& self) { + self.start(); + }; + return *this; + } + + auto& end() + { + payload->*[&](auto& self) { + self.end(); + }; + return *this; + } + private: ::std::variant payload; }; diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index 7f9267b0aa3..a8fdffb2e8a 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -347,6 +347,9 @@ public: { auto& t = *support_task; + // Do submit kernels + launch_kernels(); + // We need to access the task structures (eg. to get the stream) so we do // not clear all its resources yet. t.end_uncleared(); @@ -400,6 +403,49 @@ public: end(); }; + auto& t = *support_task; + + // Get the vector of kernel(s) to perform + // When chained is enable, we expect a vector of kernel description which + // should be executed one after the other. + if constexpr (chained) + { + kernel_descs = ::std::apply(f, deps.instance(t)); + assert(!kernel_descs.empty()); + } + else + { + // We have an unchained cuda_kernel, which means there is a single + // CUDA kernel described, and the function should return a single + // descriptor, not a vector + static_assert(!chained); + + cuda_kernel_desc res = ::std::apply(f, deps.instance(t)); + kernel_descs.push_back(res); + } + } + + // Manually add one kernel + auto& add_kernel_desc(cuda_kernel_desc d) + { + kernel_descs.push_back(mv(d)); + return *this; + } + + // Manually add a vector of kernels + auto& add_kernel_desc(const ::std::vector& descs) + { + for (const auto& d : descs) + { + add_kernel_desc(d); + } + return *this; + } + +private: + // This does submit all kernels and print statistics if needed + void launch_kernels() + { // If CUDASTF_CUDA_KERNEL_DEBUG is set, we display the number of registers // used by the kernel(s) static bool display_register_cnt = [] { @@ -407,85 +453,62 @@ public: return env && (atoi(env) != 0); }(); - auto& t = *support_task; - - // When chained is enable, we expect a vector of kernel description which should be executed one after the other - if constexpr (chained) + // Print some statistics if needed + if (display_register_cnt) { - ::std::vector res = ::std::apply(f, deps.instance(t)); - assert(!res.empty()); - - if (display_register_cnt) + if (kernel_descs.size() > 1) { fprintf(stderr, "cuda_kernel_chain (%s):\n", symbol.c_str()); - for (size_t i = 0; i < res.size(); i++) + for (size_t i = 0; i < kernel_descs.size(); i++) { - fprintf(stderr, "- kernel %ld uses %d register(s)\n", i, res[i].get_num_registers()); + fprintf(stderr, "- kernel %ld uses %d register(s)\n", i, kernel_descs[i].get_num_registers()); } } - - if constexpr (::std::is_same_v) + else { - auto lock = t.lock_ctx_graph(); - auto& g = t.get_ctx_graph(); + fprintf(stderr, "cuda_kernel (%s): uses %d register(s)\n", symbol.c_str(), kernel_descs[0].get_num_registers()); + } + } - // We have two situations : either there is a single kernel and we put the kernel in the context's - // graph, or we rely on a child graph - if (res.size() == 1) - { - res[0].launch_in_graph(t.get_node(), g); - } - else - { - ::std::vector& chain = t.get_node_chain(); - chain.resize(res.size()); + auto& t = 
*support_task; - // Create a chain of kernels - for (size_t i = 0; i < res.size(); i++) - { - res[i].launch_in_graph(chain[i], g); - if (i > 0) - { - cuda_safe_call(cudaGraphAddDependencies(g, &chain[i - 1], &chain[i], 1)); - } - } - } + if constexpr (::std::is_same_v) + { + auto lock = t.lock_ctx_graph(); + auto& g = t.get_ctx_graph(); + + // We have two situations : either there is a single kernel and we put the kernel in the context's + // graph, or we rely on a child graph + if (kernel_descs.size() == 1) + { + kernel_descs[0].launch_in_graph(t.get_node(), g); } else { - // Rely on stream semantic to have a dependency between the kernels - for (auto& k : res) + ::std::vector& chain = t.get_node_chain(); + chain.resize(kernel_descs.size()); + + // Create a chain of kernels + for (size_t i = 0; i < kernel_descs.size(); i++) { - k.launch(t.get_stream()); + kernel_descs[i].launch_in_graph(chain[i], g); + if (i > 0) + { + cuda_safe_call(cudaGraphAddDependencies(g, &chain[i - 1], &chain[i], 1)); + } } } } else { - // We have an unchained cuda_kernel, which means there is a single - // CUDA kernel described, and the function should return a single - // descriptor, not a vector - static_assert(!chained); - - cuda_kernel_desc res = ::std::apply(f, deps.instance(t)); - if (display_register_cnt) + // Rely on stream semantic to have a dependency between the kernels + for (auto& k : kernel_descs) { - fprintf(stderr, "cuda_kernel (%s): uses %d register(s)\n", symbol.c_str(), res.get_num_registers()); - } - - if constexpr (::std::is_same_v) - { - auto lock = t.lock_ctx_graph(); - res.launch_in_graph(t.get_node(), t.get_ctx_graph()); - } - else - { - res.launch(t.get_stream()); + k.launch(t.get_stream()); } } } -private: ::std::string symbol; Ctx& ctx; // Statically defined deps @@ -502,6 +525,10 @@ private: ::std::optional e_place; + // What kernel(s) must be done ? We also store this in a vector if there is a + // single kernel (with the cuda_kernel construct) + ::std::vector kernel_descs; + // Are we making some measurements ? bool record_time; int record_time_device; diff --git a/cudax/test/stf/CMakeLists.txt b/cudax/test/stf/CMakeLists.txt index e594f686e05..dfa935f8862 100644 --- a/cudax/test/stf/CMakeLists.txt +++ b/cudax/test/stf/CMakeLists.txt @@ -34,6 +34,7 @@ set(stf_test_sources graph/static_graph_ctx.cu hashtable/test.cu interface/cuda_kernel_chain-add_deps.cu + interface/cuda_kernel_chain-add_deps_low_level.cu interface/data_from_device_async.cu interface/move_operator.cu local_stf/legacy_to_stf.cu diff --git a/cudax/test/stf/interface/cuda_kernel_chain-add_deps_low_level.cu b/cudax/test/stf/interface/cuda_kernel_chain-add_deps_low_level.cu new file mode 100644 index 00000000000..95a50c151ea --- /dev/null +++ b/cudax/test/stf/interface/cuda_kernel_chain-add_deps_low_level.cu @@ -0,0 +1,86 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Example of task implementing a chain of CUDA kernels with dynamic dependencies (add_deps) + * + */ + +#include + +using namespace cuda::experimental::stf; + +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < x.size(); i += nthreads) + { + y(i) += a * x(i); + } +} + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +int main() +{ + context ctx = graph_ctx(); + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + + double alpha = 3.14; + double beta = 4.5; + double gamma = -4.1; + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + /* Compute Y = Y + alpha X, Y = Y + beta X and then Y = Y + gamma X */ + auto t = ctx.cuda_kernel_chain(); + t.add_deps(lX.read()); + t.add_deps(lY.rw()); + t.start(); + auto dX = t.template get>(0); + auto dY = t.template get>(1); + // clang-format off + auto descs = ::std::vector { + { axpy, 16, 128, 0, alpha, dX, dY }, + { axpy, 16, 128, 0, beta, dX, dY }, + { axpy, 16, 128, 0, gamma, dX, dY } + }; + // clang-format on + t.add_kernel_desc(descs); + t.end(); + + ctx.finalize(); + + for (size_t i = 0; i < N; i++) + { + assert(fabs(Y[i] - (Y0(i) + (alpha + beta + gamma) * X0(i))) < 0.0001); + assert(fabs(X[i] - X0(i)) < 0.0001); + } +} From 1dd1ff50b76d4a1edc300070324cf1343e8df166 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 20 Jul 2025 12:48:26 +0200 Subject: [PATCH 014/221] - Add a test to ensure we can put no arguments in the cuda_kernel_desc constructor - Implement a low level API to describe cuda_kernel_desc with an array of pointers rather than a variadic interface (and use it in a test) --- .../__stf/internal/cuda_kernel_scope.cuh | 46 ++++++++++++++-- cudax/test/stf/CMakeLists.txt | 1 + .../cuda_kernel_chain-add_deps_low_level.cu | 17 +++--- .../stf/interface/cuda_kernel_empty_args.cu | 55 +++++++++++++++++++ 4 files changed, 108 insertions(+), 11 deletions(-) create mode 100644 cudax/test/stf/interface/cuda_kernel_empty_args.cu diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index a8fdffb2e8a..f22ec6a5da0 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -67,15 +67,26 @@ inline constexpr bool is_function_or_kernel_v = is_function_or_kernel::value; */ struct cuda_kernel_desc { + cuda_kernel_desc() = default; + template cuda_kernel_desc(Fun func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, Args... args) - : func_variant(store_func(mv(func))) - , gridDim(gridDim_) - , blockDim(blockDim_) - , sharedMem(sharedMem_) + { + configure(mv(func), gridDim_, blockDim_, sharedMem_, ::std::forward(args)...); + } + + template + void configure(Fun func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, Args... 
args) { using TupleType = ::std::tuple<::std::decay_t...>; + _CCCL_ASSERT(!configured, "cuda_kernel_desc was already configured"); + + func_variant = store_func(mv(func)); + gridDim = gridDim_; + blockDim = blockDim_; + sharedMem = sharedMem_; + // We first copy all arguments into a tuple because the kernel // implementation needs pointers to the argument, so we cannot use // directly those passed in the pack of arguments @@ -98,6 +109,30 @@ struct cuda_kernel_desc // Save the tuple in a typed erased value arg_tuple_type_erased = mv(arg_tuple); + + configured = true; + } + + // It is the responsibility of the caller to unsure arguments are valid until + // the CUDA kernel construct ends + template + void configure_raw(Fun func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, int arg_cnt, const void** args) + { + _CCCL_ASSERT(!configured, "cuda_kernel_desc was already configured"); + + func_variant = store_func(mv(func)); + gridDim = gridDim_; + blockDim = blockDim_; + sharedMem = sharedMem_; + + for (int i = 0; i < arg_cnt; i++) + { + // We can safely forget the const here because CUDA will not modify the + // argument + args_ptr.push_back(const_cast(args[i])); + } + + configured = true; } /* CUfunction/CUkernel (CUDA driver API) or __global__ function (CUDA runtime API) */ @@ -230,6 +265,9 @@ private: { return reinterpret_cast(f); } + + // We can only configure the kernel descriptor once + bool configured = false; }; namespace reserved diff --git a/cudax/test/stf/CMakeLists.txt b/cudax/test/stf/CMakeLists.txt index dfa935f8862..eab95ea5e82 100644 --- a/cudax/test/stf/CMakeLists.txt +++ b/cudax/test/stf/CMakeLists.txt @@ -35,6 +35,7 @@ set(stf_test_sources hashtable/test.cu interface/cuda_kernel_chain-add_deps.cu interface/cuda_kernel_chain-add_deps_low_level.cu + interface/cuda_kernel_empty_args.cu interface/data_from_device_async.cu interface/move_operator.cu local_stf/legacy_to_stf.cu diff --git a/cudax/test/stf/interface/cuda_kernel_chain-add_deps_low_level.cu b/cudax/test/stf/interface/cuda_kernel_chain-add_deps_low_level.cu index 95a50c151ea..e95820b9e60 100644 --- a/cudax/test/stf/interface/cuda_kernel_chain-add_deps_low_level.cu +++ b/cudax/test/stf/interface/cuda_kernel_chain-add_deps_low_level.cu @@ -66,13 +66,16 @@ int main() t.start(); auto dX = t.template get>(0); auto dY = t.template get>(1); - // clang-format off - auto descs = ::std::vector { - { axpy, 16, 128, 0, alpha, dX, dY }, - { axpy, 16, 128, 0, beta, dX, dY }, - { axpy, 16, 128, 0, gamma, dX, dY } - }; - // clang-format on + ::std::vector descs; + descs.resize(3); + // Configure with types + descs[0].configure(axpy, 16, 128, 0, alpha, dX, dY); + descs[1].configure(axpy, 16, 128, 0, beta, dX, dY); + + // Configure with low level API + const void* args[3] = {&gamma, &dX, &dY}; + descs[2].configure_raw(axpy, 16, 128, 0, 3, args); + t.add_kernel_desc(descs); t.end(); diff --git a/cudax/test/stf/interface/cuda_kernel_empty_args.cu b/cudax/test/stf/interface/cuda_kernel_empty_args.cu new file mode 100644 index 00000000000..c9aeb9740cb --- /dev/null +++ b/cudax/test/stf/interface/cuda_kernel_empty_args.cu @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Make sure we can compile with no arguments in CUDA kernel + * + */ + +#include + +using namespace cuda::experimental::stf; + +__global__ void dummy() {} + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +int main() +{ + context ctx = graph_ctx(); + const size_t N = 16; + double X[N], Y[N]; + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + + auto lX = ctx.logical_data(X); + auto lY = ctx.logical_data(Y); + + // Ensure this works without arguments in the kernel + ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto, auto) { + return cuda_kernel_desc{dummy, 16, 128, 0}; + }; + + ctx.finalize(); +} From acb2b371813f8c9b1bf5b4e40fbb8afe065f769f Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 18 Jul 2025 23:21:43 +0200 Subject: [PATCH 015/221] Start to implement C bindings for CUDASTF (this will later be used in conjunction with cython to generate python support) --- c/CMakeLists.txt | 1 + c/experimental/stf/CMakeLists.txt | 60 +++++++++++++++++++ .../stf/include/cccl/c/experimental/stf/stf.h | 22 +++++++ c/experimental/stf/src/stf.cu | 26 ++++++++ 4 files changed, 109 insertions(+) create mode 100644 c/experimental/stf/CMakeLists.txt create mode 100644 c/experimental/stf/include/cccl/c/experimental/stf/stf.h create mode 100644 c/experimental/stf/src/stf.cu diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index 7f1dbf4507b..364494da7a0 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -1 +1,2 @@ add_subdirectory(parallel) +add_subdirectory(experimental/stf/) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt new file mode 100644 index 00000000000..da4985491c7 --- /dev/null +++ b/c/experimental/stf/CMakeLists.txt @@ -0,0 +1,60 @@ +cmake_minimum_required(VERSION 3.21) + +project(CCCL_C_EXPERIMENTAL_STF LANGUAGES CUDA CXX C) + +option(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING "Build cccl.experimental.c.stf tests." OFF) + +# FIXME Ideally this would be handled by presets and install rules, but for now +# consumers may override this to control the target location of cccl.c.experimental.stf. 
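+# When this variable is left empty (the default), the usual CMake output directories are used.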
+set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY "" CACHE PATH "Override output directory for the cccl.c.experimental.stf library") +mark_as_advanced(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY) + +file(GLOB_RECURSE srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + "src/*.cu" "src/*.cuh" +) + +add_library(cccl.c.experimental.stf SHARED ${srcs}) +set_property(TARGET cccl.c.experimental.stf PROPERTY POSITION_INDEPENDENT_CODE ON) +cccl_configure_target(cccl.c.experimental.stf DIALECT 17) + +# Override the properties set by cccl_configure_target: +if (CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY) + set_target_properties(cccl.c.parallel PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY}" + ARCHIVE_OUTPUT_DIRECTORY "${CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY}" + ) +endif() + +find_package(CUDAToolkit REQUIRED) +set_target_properties(cccl.c.experimental.stf PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) +target_link_libraries(cccl.c.experimental.stf PRIVATE + CUDA::cudart_static + CUDA::nvrtc + CUDA::nvJitLink + CUDA::cuda_driver + cccl.compiler_interface_cpp20 + cccl.c.parallel.jit_template + CUB::CUB + Thrust::Thrust + nlohmann_json::nlohmann_json +) +# target_compile_definitions(cccl.c.experimental.stf PUBLIC CCCL_C_EXPERIMENTAL=1) +# target_compile_definitions(cccl.c.experimental.stf PRIVATE +# NVRTC_GET_TYPE_NAME=1 +# CUB_DISABLE_CDP=1 +# CUB_DEFINE_RUNTIME_POLICIES +# ) +target_compile_options(cccl.c.experimental.stf PRIVATE $<$:--extended-lambda>) + +target_include_directories(cccl.c.experimental.stf PUBLIC "include") +target_include_directories(cccl.c.experimental.stf PRIVATE "src") + +if (CCCL_C_Parallel_ENABLE_TESTING) + add_subdirectory(test) +endif() + +if (CCCL_C_Parallel_ENABLE_HEADER_TESTING) + include(cmake/CParallelHeaderTesting.cmake) +endif() diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h new file mode 100644 index 00000000000..126cc9424c7 --- /dev/null +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -0,0 +1,22 @@ +// TODO use CCCL_C_EXTERN_C_BEGIN/CCCL_C_EXTERN_C_END +#ifdef __cplusplus +extern "C" +{ +#endif + +typedef struct stf_ctx_handle stf_ctx_handle; + +void stf_ctx_create(stf_ctx_handle *handle); +void stf_ctx_finalize(stf_ctx_handle *handle); + +struct stf_task_handle { + void *handle; +}; + +struct stf_logical_data_handle { + void *handle; +}; + +#ifdef __cplusplus +} +#endif diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu new file mode 100644 index 00000000000..44a609b25fb --- /dev/null +++ b/c/experimental/stf/src/stf.cu @@ -0,0 +1,26 @@ +#include +//#include +#include + +using namespace cuda::experimental::stf; + +extern "C" +{ + +struct stf_ctx_handle { + context *ctx; +}; + +void stf_ctx_create(stf_ctx_handle *handle) +{ + return new context{}; +} + +void stf_ctx_finalize(stf_ctx_handle *handle) +{ + if (handle) { + handle->finalize(); + } +} + +} From acdbd2c3254312a70b9ec3070be5598c7bb85020 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 18 Jul 2025 23:38:54 +0200 Subject: [PATCH 016/221] clang-format --- .../stf/include/cccl/c/experimental/stf/stf.h | 17 ++++++------- c/experimental/stf/src/stf.cu | 24 +++++++++---------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 126cc9424c7..c4e2b3321a2 100644 --- 
a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -1,20 +1,21 @@ // TODO use CCCL_C_EXTERN_C_BEGIN/CCCL_C_EXTERN_C_END #ifdef __cplusplus -extern "C" -{ +extern "C" { #endif typedef struct stf_ctx_handle stf_ctx_handle; -void stf_ctx_create(stf_ctx_handle *handle); -void stf_ctx_finalize(stf_ctx_handle *handle); +void stf_ctx_create(stf_ctx_handle* handle); +void stf_ctx_finalize(stf_ctx_handle* handle); -struct stf_task_handle { - void *handle; +struct stf_task_handle +{ + void* handle; }; -struct stf_logical_data_handle { - void *handle; +struct stf_logical_data_handle +{ + void* handle; }; #ifdef __cplusplus diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 44a609b25fb..ea07fe9e173 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -1,26 +1,26 @@ #include -//#include +// #include #include using namespace cuda::experimental::stf; -extern "C" -{ +extern "C" { -struct stf_ctx_handle { - context *ctx; +struct stf_ctx_handle +{ + context* ctx; }; -void stf_ctx_create(stf_ctx_handle *handle) +void stf_ctx_create(stf_ctx_handle* handle) { - return new context{}; + return new context{}; } -void stf_ctx_finalize(stf_ctx_handle *handle) +void stf_ctx_finalize(stf_ctx_handle* handle) { - if (handle) { - handle->finalize(); - } + if (handle) + { + handle->finalize(); + } } - } From 6a0bec1ed3c168264a4a0ddec23ebe2e6c4ee8f2 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 18 Jul 2025 23:45:34 +0200 Subject: [PATCH 017/221] we do not have these tests yet --- c/experimental/stf/CMakeLists.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index da4985491c7..c692f4da2a1 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -51,10 +51,10 @@ target_compile_options(cccl.c.experimental.stf PRIVATE $<$ Date: Sat, 19 Jul 2025 00:01:01 +0200 Subject: [PATCH 018/221] Misc fixes for cccl.c.experimental.stf --- c/experimental/stf/CMakeLists.txt | 8 +++----- c/experimental/stf/src/stf.cu | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index c692f4da2a1..4d9d5dafcb0 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -34,11 +34,7 @@ target_link_libraries(cccl.c.experimental.stf PRIVATE CUDA::nvrtc CUDA::nvJitLink CUDA::cuda_driver - cccl.compiler_interface_cpp20 - cccl.c.parallel.jit_template - CUB::CUB - Thrust::Thrust - nlohmann_json::nlohmann_json + CCCL::cudax ) # target_compile_definitions(cccl.c.experimental.stf PUBLIC CCCL_C_EXPERIMENTAL=1) # target_compile_definitions(cccl.c.experimental.stf PRIVATE @@ -46,6 +42,8 @@ target_link_libraries(cccl.c.experimental.stf PRIVATE # CUB_DISABLE_CDP=1 # CUB_DEFINE_RUNTIME_POLICIES # ) + +target_compile_options(cccl.c.experimental.stf PRIVATE $<$:--expt-relaxed-constexpr>) target_compile_options(cccl.c.experimental.stf PRIVATE $<$:--extended-lambda>) target_include_directories(cccl.c.experimental.stf PUBLIC "include") diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index ea07fe9e173..f827481f4e5 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -13,14 +13,14 @@ struct stf_ctx_handle void stf_ctx_create(stf_ctx_handle* handle) { - return new context{}; + handle->ctx = new context{}; } void 
stf_ctx_finalize(stf_ctx_handle* handle) { if (handle) { - handle->finalize(); + handle->ctx->finalize(); } } } From e2a0d6164fdf7cfd17e0b664600701122dd5a1de Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sat, 19 Jul 2025 00:35:35 +0200 Subject: [PATCH 019/221] Add one test and redefine the ctx handle --- c/experimental/stf/CMakeLists.txt | 8 ++-- .../stf/include/cccl/c/experimental/stf/stf.h | 4 +- c/experimental/stf/src/stf.cu | 15 ++++--- c/experimental/stf/test/CMakeLists.txt | 39 +++++++++++++++++++ c/experimental/stf/test/test_ctx.cpp | 21 ++++++++++ 5 files changed, 73 insertions(+), 14 deletions(-) create mode 100644 c/experimental/stf/test/CMakeLists.txt create mode 100644 c/experimental/stf/test/test_ctx.cpp diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 4d9d5dafcb0..5e8aa8e9f50 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -49,10 +49,10 @@ target_compile_options(cccl.c.experimental.stf PRIVATE $<$ctx = new context{}; + if (handle) { + *handle = new stf_ctx_handle_t{context{}}; + } } -void stf_ctx_finalize(stf_ctx_handle* handle) +void stf_ctx_finalize(stf_ctx_handle handle) { - if (handle) - { - handle->ctx->finalize(); - } + delete handle; } } diff --git a/c/experimental/stf/test/CMakeLists.txt b/c/experimental/stf/test/CMakeLists.txt new file mode 100644 index 00000000000..f5613253a81 --- /dev/null +++ b/c/experimental/stf/test/CMakeLists.txt @@ -0,0 +1,39 @@ +cccl_get_c2h() + +function(cccl_c_experimental_stf_add_test target_name_var source) + string(REGEX REPLACE "test_([^.]*)" "cccl.c.experimental.stf.test.\\1" target_name "${source}") + set(target_name_var ${target_name} PARENT_SCOPE) + + add_executable(${target_name} "${source}") + cccl_configure_target(${target_name} DIALECT 20) + + set_target_properties(${target_name} PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) + target_link_libraries(${target_name} PRIVATE + cccl.c.experimental.stf + CUDA::cudart_static + CUDA::nvrtc + cccl.c2h.main + cccl.compiler_interface_cpp20 + CUDA::cuda_driver + CCCL::cudax + ) + + target_compile_definitions(${target_name} PRIVATE + TEST_CUB_PATH="-I${CCCL_SOURCE_DIR}/cub" + TEST_THRUST_PATH="-I${CCCL_SOURCE_DIR}/thrust" + TEST_LIBCUDACXX_PATH="-I${CCCL_SOURCE_DIR}/libcudacxx/include" + TEST_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}" + ) + + add_test(NAME ${target_name} COMMAND ${target_name}) +endfunction() + +file(GLOB test_srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + *.cu *.cpp +) + +foreach(test_src IN LISTS test_srcs) + cccl_c_experimental_stf_add_test(test_target "${test_src}") +endforeach() diff --git a/c/experimental/stf/test/test_ctx.cpp b/c/experimental/stf/test/test_ctx.cpp new file mode 100644 index 00000000000..6bbfc3d1e46 --- /dev/null +++ b/c/experimental/stf/test/test_ctx.cpp @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include + +#include + +C2H_TEST("basic stf context", "[context]") +{ + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + stf_ctx_finalize(ctx); +} From 7a6ea62e49f2367c21993ee921b7e46cffd52160 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sat, 19 Jul 2025 08:50:43 +0200 Subject: [PATCH 020/221] logical_data wrapper --- .../stf/include/cccl/c/experimental/stf/stf.h | 17 ++++---- c/experimental/stf/src/stf.cu | 34 +++++++++++++--- c/experimental/stf/test/test_logical_data.cpp | 39 +++++++++++++++++++ 3 files changed, 75 insertions(+), 15 deletions(-) create mode 100644 c/experimental/stf/test/test_logical_data.cpp diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index f19a9e75b91..d21cea1563a 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -5,18 +5,15 @@ extern "C" { typedef struct stf_ctx_handle_t* stf_ctx_handle; -void stf_ctx_create(stf_ctx_handle* handle); -void stf_ctx_finalize(stf_ctx_handle handle); +void stf_ctx_create(stf_ctx_handle* ctx); +void stf_ctx_finalize(stf_ctx_handle ctx); -struct stf_task_handle -{ - void* handle; -}; +typedef struct stf_logical_data_handle_t* stf_logical_data_handle; -struct stf_logical_data_handle -{ - void* handle; -}; +void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle *ld, void *addr, size_t sz); +void stf_logical_data_destroy(stf_ctx_handle ctx, stf_logical_data_handle ld); + +typedef struct stf_task_handle_t* stf_task_handle; #ifdef __cplusplus } diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 578757fc13e..f18ad9f22a0 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -11,15 +11,39 @@ struct stf_ctx_handle_t context ctx; }; -void stf_ctx_create(stf_ctx_handle* handle) +void stf_ctx_create(stf_ctx_handle* ctx) { - if (handle) { - *handle = new stf_ctx_handle_t{context{}}; + if (ctx) { + *ctx = new stf_ctx_handle_t{context{}}; } } -void stf_ctx_finalize(stf_ctx_handle handle) +void stf_ctx_finalize(stf_ctx_handle ctx) { - delete handle; + delete ctx; } + +struct stf_logical_data_handle_t +{ + // XXX should we always store a logical_data> instead ? + logical_data_untyped ld; +}; + +void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle *ld, void *addr, size_t sz) +{ + assert(ld); + assert(ctx); + + // Create a slice logical data + auto ld_typed = ctx->ctx.logical_data(make_slice((char *)addr, sz)); + + // Stored in its untyped version + *ld = new stf_logical_data_handle_t{ld_typed}; +} + +void stf_logical_data_destroy(stf_ctx_handle /* ctx */, stf_logical_data_handle ld) +{ + delete ld; +} + } diff --git a/c/experimental/stf/test/test_logical_data.cpp b/c/experimental/stf/test/test_logical_data.cpp new file mode 100644 index 00000000000..e91509708fb --- /dev/null +++ b/c/experimental/stf/test/test_logical_data.cpp @@ -0,0 +1,39 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include + +#include + +C2H_TEST("basic stf logical_data", "[logical_data]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lA, lB; + + float *A, *B; + A = (float *)malloc(N*sizeof(float)); + B = (float *)malloc(N*sizeof(float)); + + stf_logical_data(ctx, &lA, A, N*sizeof(float)); + stf_logical_data(ctx, &lB, B, N*sizeof(float)); + + stf_logical_data_destroy(ctx, lA); + stf_logical_data_destroy(ctx, lB); + + stf_ctx_finalize(ctx); + + free(A); + free(B); +} From 5fa13d37e0fc82352919fb88e76a7c0abd6e648c Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sat, 19 Jul 2025 09:41:32 +0200 Subject: [PATCH 021/221] More bindings --- .../stf/include/cccl/c/experimental/stf/stf.h | 25 +++++- c/experimental/stf/src/stf.cu | 83 +++++++++++++++---- c/experimental/stf/test/test_ctx.cpp | 2 +- c/experimental/stf/test/test_logical_data.cpp | 14 ++-- c/experimental/stf/test/test_task.cpp | 78 +++++++++++++++++ 5 files changed, 178 insertions(+), 24 deletions(-) create mode 100644 c/experimental/stf/test/test_task.cpp diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index d21cea1563a..50aca91b29d 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -1,8 +1,18 @@ +#include + // TODO use CCCL_C_EXTERN_C_BEGIN/CCCL_C_EXTERN_C_END #ifdef __cplusplus extern "C" { #endif +typedef enum stf_access_mode +{ + STF_NONE = 0, + STF_READ = 1 << 0, + STF_WRITE = 1 << 1, + STF_RW = STF_READ | STF_WRITE +} stf_access_mode; + typedef struct stf_ctx_handle_t* stf_ctx_handle; void stf_ctx_create(stf_ctx_handle* ctx); @@ -10,11 +20,22 @@ void stf_ctx_finalize(stf_ctx_handle ctx); typedef struct stf_logical_data_handle_t* stf_logical_data_handle; -void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle *ld, void *addr, size_t sz); -void stf_logical_data_destroy(stf_ctx_handle ctx, stf_logical_data_handle ld); +void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); +void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); +void stf_logical_data_destroy(stf_logical_data_handle ld); + +// TODO token typedef struct stf_task_handle_t* stf_task_handle; +void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t); +void stf_task_set_symbol(stf_task_handle t, const char* symbol); +void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m); +void stf_task_start(stf_task_handle t); +void stf_task_end(stf_task_handle t); +cudaStream_t stf_task_get_stream(stf_task_handle t); +void stf_task_destroy(stf_task_handle t); + #ifdef __cplusplus } #endif diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index f18ad9f22a0..8b10af298a6 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -11,39 +11,94 @@ struct stf_ctx_handle_t context ctx; }; +struct stf_logical_data_handle_t +{ + // XXX should we always store a logical_data> instead ? 
+ logical_data_untyped ld; +}; + +struct stf_task_handle_t +{ + context::unified_task<> t; +}; + void stf_ctx_create(stf_ctx_handle* ctx) { - if (ctx) { + if (ctx) + { *ctx = new stf_ctx_handle_t{context{}}; } } void stf_ctx_finalize(stf_ctx_handle ctx) { + assert(ctx); delete ctx; } -struct stf_logical_data_handle_t +void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) { - // XXX should we always store a logical_data> instead ? - logical_data_untyped ld; -}; + assert(ld); + assert(ctx); + + // Create a slice logical data + auto ld_typed = ctx->ctx.logical_data(make_slice((char*) addr, sz)); + + // Stored in its untyped version + *ld = new stf_logical_data_handle_t{ld_typed}; +} + +void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) +{ + assert(ld); + ld->ld.set_symbol(symbol); +} -void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle *ld, void *addr, size_t sz) +void stf_logical_data_destroy(stf_logical_data_handle ld) { - assert(ld); - assert(ctx); + assert(ld); + delete ld; +} - // Create a slice logical data - auto ld_typed = ctx->ctx.logical_data(make_slice((char *)addr, sz)); +void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) +{ + assert(t); + assert(ctx); - // Stored in its untyped version - *ld = new stf_logical_data_handle_t{ld_typed}; + *t = new stf_task_handle_t{ctx->ctx.task()}; } -void stf_logical_data_destroy(stf_ctx_handle /* ctx */, stf_logical_data_handle ld) +void stf_task_set_symbol(stf_task_handle t, const char* symbol) { - delete ld; + assert(t); + assert(symbol); + + t->t.set_symbol(symbol); +} + +void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m) +{ + assert(t); + assert(ld); + + t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m))); } +void stf_task_start(stf_task_handle t) +{ + assert(t); + t->t.start(); +} + +void stf_task_end(stf_task_handle t) +{ + assert(t); + t->t.end(); +} + +void stf_task_destroy(stf_task_handle t) +{ + assert(t); + delete t; +} } diff --git a/c/experimental/stf/test/test_ctx.cpp b/c/experimental/stf/test/test_ctx.cpp index 6bbfc3d1e46..86225ad91c7 100644 --- a/c/experimental/stf/test/test_ctx.cpp +++ b/c/experimental/stf/test/test_ctx.cpp @@ -9,8 +9,8 @@ //===----------------------------------------------------------------------===// #include -#include +#include #include C2H_TEST("basic stf context", "[context]") diff --git a/c/experimental/stf/test/test_logical_data.cpp b/c/experimental/stf/test/test_logical_data.cpp index e91509708fb..168ca8dabbc 100644 --- a/c/experimental/stf/test/test_logical_data.cpp +++ b/c/experimental/stf/test/test_logical_data.cpp @@ -9,8 +9,8 @@ //===----------------------------------------------------------------------===// #include -#include +#include #include C2H_TEST("basic stf logical_data", "[logical_data]") @@ -23,14 +23,14 @@ C2H_TEST("basic stf logical_data", "[logical_data]") stf_logical_data_handle lA, lB; float *A, *B; - A = (float *)malloc(N*sizeof(float)); - B = (float *)malloc(N*sizeof(float)); + A = (float*) malloc(N * sizeof(float)); + B = (float*) malloc(N * sizeof(float)); - stf_logical_data(ctx, &lA, A, N*sizeof(float)); - stf_logical_data(ctx, &lB, B, N*sizeof(float)); + stf_logical_data(ctx, &lA, A, N * sizeof(float)); + stf_logical_data(ctx, &lB, B, N * sizeof(float)); - stf_logical_data_destroy(ctx, lA); - stf_logical_data_destroy(ctx, lB); + stf_logical_data_destroy(lA); + stf_logical_data_destroy(lB); stf_ctx_finalize(ctx); diff --git 
a/c/experimental/stf/test/test_task.cpp b/c/experimental/stf/test/test_task.cpp new file mode 100644 index 00000000000..693773e42d3 --- /dev/null +++ b/c/experimental/stf/test/test_task.cpp @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include + +C2H_TEST("basic stf task", "[task]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lX, lY, lZ; + + float *X, *Y, *Z; + X = (float *)malloc(N*sizeof(float)); + Y = (float *)malloc(N*sizeof(float)); + Z = (float *)malloc(N*sizeof(float)); + + stf_logical_data(ctx, &lX, X, N*sizeof(float)); + stf_logical_data(ctx, &lY, Y, N*sizeof(float)); + stf_logical_data(ctx, &lZ, Z, N*sizeof(float)); + + stf_logical_data_set_symbol(lX, "X"); + stf_logical_data_set_symbol(lY, "Y"); + stf_logical_data_set_symbol(lZ, "Z"); + + stf_task_handle t1; + stf_task_create(ctx, &t1); + stf_task_set_symbol(t1, "T1"); + stf_task_add_dep(t1, lX, STF_RW); + stf_task_start(t1); + stf_task_end(t1); + + stf_task_handle t2; + stf_task_create(ctx, &t2); + stf_task_set_symbol(t2, "T2"); + stf_task_add_dep(t2, lX, STF_READ); + stf_task_add_dep(t2, lY, STF_RW); + stf_task_start(t2); + stf_task_end(t2); + + stf_task_handle t3; + stf_task_create(ctx, &t3); + stf_task_set_symbol(t3, "T3"); + stf_task_add_dep(t3, lX, STF_READ); + stf_task_add_dep(t3, lZ, STF_RW); + stf_task_start(t3); + stf_task_end(t3); + + stf_task_handle t4; + stf_task_create(ctx, &t4); + stf_task_set_symbol(t4, "T4"); + stf_task_add_dep(t4, lY, STF_READ); + stf_task_add_dep(t4, lZ, STF_RW); + stf_task_start(t4); + stf_task_end(t4); + + stf_logical_data_destroy(lX); + stf_logical_data_destroy(lY); + stf_logical_data_destroy(lZ); + + stf_ctx_finalize(ctx); + + free(X); + free(Y); + free(Z); +} From 968995bba0f0a1afd8faafd9c12c3d1f74f32fb5 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sat, 19 Jul 2025 09:41:52 +0200 Subject: [PATCH 022/221] expose start/end and task type in context --- .../experimental/__stf/internal/context.cuh | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index 00362b67a1d..eb003679794 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -164,6 +164,7 @@ class context ::std::variant payload; }; +public: /* * A task that can be either a stream task or a graph task. */ @@ -194,6 +195,22 @@ class context return mv(*this); } + auto& start() + { + payload->*[&](auto& self) { + self.start(); + }; + return *this; + } + + auto& end() + { + payload->*[&](auto& self) { + self.end(); + }; + return *this; + } + /** * @brief Add dependencies to this task. * @@ -238,7 +255,6 @@ class context ::std::variant, graph_task> payload; }; -public: /** * @brief Default constructor for the context class. 
*/ From 8cc6a3c6897f778521a9cbe15c24f2a7820b7602 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 20 Jul 2025 00:02:45 +0200 Subject: [PATCH 023/221] Save some WIP --- c/experimental/stf/CMakeLists.txt | 2 +- .../stf/include/cccl/c/experimental/stf/stf.h | 22 ++++++++++++++++++- c/experimental/stf/src/stf.cu | 6 +++++ c/experimental/stf/test/test_task.cpp | 2 +- 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 5e8aa8e9f50..9d8c7130c68 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -29,12 +29,12 @@ endif() find_package(CUDAToolkit REQUIRED) set_target_properties(cccl.c.experimental.stf PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) +target_link_libraries(cccl.c.experimental.stf PUBLIC CCCL::cudax) target_link_libraries(cccl.c.experimental.stf PRIVATE CUDA::cudart_static CUDA::nvrtc CUDA::nvJitLink CUDA::cuda_driver - CCCL::cudax ) # target_compile_definitions(cccl.c.experimental.stf PUBLIC CCCL_C_EXPERIMENTAL=1) # target_compile_definitions(cccl.c.experimental.stf PRIVATE diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 50aca91b29d..ab0568d4bf4 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -18,13 +18,20 @@ typedef struct stf_ctx_handle_t* stf_ctx_handle; void stf_ctx_create(stf_ctx_handle* ctx); void stf_ctx_finalize(stf_ctx_handle ctx); +// TODO stf_ctx_set_mode() + define enum with GRAPH, STREAM, ... +// TODO stf_ctx_is_graph() + +cudaStream_t stf_fence(stf_ctx_handle ctx); + typedef struct stf_logical_data_handle_t* stf_logical_data_handle; void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); void stf_logical_data_destroy(stf_logical_data_handle ld); -// TODO token +// TODO +// void stf_logical_data_wait(stf_logical_data_handle ld); +// void stf_token(stf_ctx_handle ctx); typedef struct stf_task_handle_t* stf_task_handle; @@ -34,8 +41,21 @@ void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_ void stf_task_start(stf_task_handle t); void stf_task_end(stf_task_handle t); cudaStream_t stf_task_get_stream(stf_task_handle t); +void *stf_task_get(stf_task_handle t, size_t submitted_index); void stf_task_destroy(stf_task_handle t); +typedef struct stf_kernel_desc_handle_t *stf_kernel_desc_handle; + +void stf_kernel_create(stf_kernel_desc_handle *d); +void stf_kernel_destroy(stf_kernel_desc_handle d); +// TODO stf_cuda_kernel_desc : symbol, deps, args... ? 
+// void stf_kernel_set_symbol((stf_kernel_handle k, const char* symbol) +// void stf_kernel_add_dep(stf_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); +// void stf_kernel_start(stf_kernel_handle k); +// void stf_kernel_set_args(stf_kernel_handle k, size_t cnt, void **args); +// void stf_kernel_end(stf_kernel_handle k); +// void stf_kernel_destroy(stf_kernel_handle k); + #ifdef __cplusplus } #endif diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 8b10af298a6..9636d2b1e4a 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -36,6 +36,12 @@ void stf_ctx_finalize(stf_ctx_handle ctx) delete ctx; } +cudaStream_t stf_fence(stf_ctx_handle ctx) +{ + assert(ctx); + return ctx->ctx.fence(); +} + void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) { assert(ld); diff --git a/c/experimental/stf/test/test_task.cpp b/c/experimental/stf/test/test_task.cpp index 693773e42d3..4cf11e31846 100644 --- a/c/experimental/stf/test/test_task.cpp +++ b/c/experimental/stf/test/test_task.cpp @@ -13,7 +13,7 @@ #include -C2H_TEST("basic stf task", "[task]") +C2H_TEST("empty stf tasks", "[task]") { size_t N = 1000000; From 49064eea864596cdc091815c8ff3ed74450dd2de Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 20 Jul 2025 08:03:43 +0200 Subject: [PATCH 024/221] Save some WIP --- .../stf/include/cccl/c/experimental/stf/stf.h | 6 +- c/experimental/stf/src/stf.cu | 70 ++++++++++++++++++- c/experimental/stf/test/test_task.cpp | 14 ++-- 3 files changed, 78 insertions(+), 12 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index ab0568d4bf4..f83bf1e5f40 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -41,12 +41,12 @@ void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_ void stf_task_start(stf_task_handle t); void stf_task_end(stf_task_handle t); cudaStream_t stf_task_get_stream(stf_task_handle t); -void *stf_task_get(stf_task_handle t, size_t submitted_index); +void* stf_task_get(stf_task_handle t, size_t submitted_index); void stf_task_destroy(stf_task_handle t); -typedef struct stf_kernel_desc_handle_t *stf_kernel_desc_handle; +typedef struct stf_kernel_desc_handle_t* stf_kernel_desc_handle; -void stf_kernel_create(stf_kernel_desc_handle *d); +void stf_kernel_create(stf_kernel_desc_handle* d); void stf_kernel_destroy(stf_kernel_desc_handle d); // TODO stf_cuda_kernel_desc : symbol, deps, args... ? 
// void stf_kernel_set_symbol((stf_kernel_handle k, const char* symbol) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 9636d2b1e4a..9f40efb86dc 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -38,8 +38,8 @@ void stf_ctx_finalize(stf_ctx_handle ctx) cudaStream_t stf_fence(stf_ctx_handle ctx) { - assert(ctx); - return ctx->ctx.fence(); + assert(ctx); + return ctx->ctx.fence(); } void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) @@ -107,4 +107,70 @@ void stf_task_destroy(stf_task_handle t) assert(t); delete t; } + +/** + * Low level example of cuda_kernel(_chain) + * auto t = ctx.cuda_kernel_chain(); + t.add_deps(lX.read()); + t.add_deps(lY.rw()); + t->*[&]() { + auto dX = t.template get>(0); + auto dY = t.template get>(1); + return std::vector { + { axpy, 16, 128, 0, alpha, dX, dY }, + { axpy, 16, 128, 0, beta, dX, dY }, + { axpy, 16, 128, 0, gamma, dX, dY } + }; + }; + + * + */ +struct stf_cuda_kernel_handle_t +{ + // return type of ctx.cuda_kernel() + using kernel_type = decltype(::std::declval().cuda_kernel()); + kernel_type k; +}; + +void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k) +{ + assert(k); + assert(ctx); + + *k = new stf_cuda_kernel_handle_t{ctx->ctx.cuda_kernel()}; +} + +void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol) +{ + assert(k); + assert(symbol); + + k->k.set_symbol(symbol); +} + +void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m) +{ + assert(k); + assert(ld); + + k->k.add_deps(cuda_kernel_dep_untyped(ld->ld, access_mode(m))); +} + +// void stf_cuda_kernel_start(stf_cuda_kernel_handle k) +// { +// assert(k); +// k->k.start(); +// } +// +// void stf_cuda_kernel_end(stf_cuda_kernel_handle k) +// { +// assert(k); +// k->k.end(); +// } + +void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t) +{ + assert(t); + delete t; +} } diff --git a/c/experimental/stf/test/test_task.cpp b/c/experimental/stf/test/test_task.cpp index 4cf11e31846..80266f6b381 100644 --- a/c/experimental/stf/test/test_task.cpp +++ b/c/experimental/stf/test/test_task.cpp @@ -9,8 +9,8 @@ //===----------------------------------------------------------------------===// #include -#include +#include #include C2H_TEST("empty stf tasks", "[task]") @@ -23,13 +23,13 @@ C2H_TEST("empty stf tasks", "[task]") stf_logical_data_handle lX, lY, lZ; float *X, *Y, *Z; - X = (float *)malloc(N*sizeof(float)); - Y = (float *)malloc(N*sizeof(float)); - Z = (float *)malloc(N*sizeof(float)); + X = (float*) malloc(N * sizeof(float)); + Y = (float*) malloc(N * sizeof(float)); + Z = (float*) malloc(N * sizeof(float)); - stf_logical_data(ctx, &lX, X, N*sizeof(float)); - stf_logical_data(ctx, &lY, Y, N*sizeof(float)); - stf_logical_data(ctx, &lZ, Z, N*sizeof(float)); + stf_logical_data(ctx, &lX, X, N * sizeof(float)); + stf_logical_data(ctx, &lY, Y, N * sizeof(float)); + stf_logical_data(ctx, &lZ, Z, N * sizeof(float)); stf_logical_data_set_symbol(lX, "X"); stf_logical_data_set_symbol(lY, "Y"); From f06f72e6a70e9814a85732090851abfe99d495f3 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 20 Jul 2025 10:31:35 +0200 Subject: [PATCH 025/221] Start to experiment with bindings for cuda_kernel --- .../stf/include/cccl/c/experimental/stf/stf.h | 19 +++--- c/experimental/stf/src/stf.cu | 28 ++++---- c/experimental/stf/test/test_cuda_kernel.cpp | 65 +++++++++++++++++++ 3 files changed, 87 insertions(+), 25 deletions(-) 
create mode 100644 c/experimental/stf/test/test_cuda_kernel.cpp diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index f83bf1e5f40..5e4b9ac05ff 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -44,17 +44,14 @@ cudaStream_t stf_task_get_stream(stf_task_handle t); void* stf_task_get(stf_task_handle t, size_t submitted_index); void stf_task_destroy(stf_task_handle t); -typedef struct stf_kernel_desc_handle_t* stf_kernel_desc_handle; - -void stf_kernel_create(stf_kernel_desc_handle* d); -void stf_kernel_destroy(stf_kernel_desc_handle d); -// TODO stf_cuda_kernel_desc : symbol, deps, args... ? -// void stf_kernel_set_symbol((stf_kernel_handle k, const char* symbol) -// void stf_kernel_add_dep(stf_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); -// void stf_kernel_start(stf_kernel_handle k); -// void stf_kernel_set_args(stf_kernel_handle k, size_t cnt, void **args); -// void stf_kernel_end(stf_kernel_handle k); -// void stf_kernel_destroy(stf_kernel_handle k); +typedef struct stf_cuda_kernel_handle_t* stf_cuda_kernel_handle; + +void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k); +void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol); +void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); +void stf_cuda_kernel_start(stf_cuda_kernel_handle k); +void stf_cuda_kernel_end(stf_cuda_kernel_handle k); +void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t); #ifdef __cplusplus } diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 9f40efb86dc..c897afaa66a 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -153,20 +153,20 @@ void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle l assert(k); assert(ld); - k->k.add_deps(cuda_kernel_dep_untyped(ld->ld, access_mode(m))); -} - -// void stf_cuda_kernel_start(stf_cuda_kernel_handle k) -// { -// assert(k); -// k->k.start(); -// } -// -// void stf_cuda_kernel_end(stf_cuda_kernel_handle k) -// { -// assert(k); -// k->k.end(); -// } + k->k.add_deps(task_dep_untyped(ld->ld, access_mode(m))); +} + +void stf_cuda_kernel_start(stf_cuda_kernel_handle k) +{ + assert(k); + k->k.start(); +} + +void stf_cuda_kernel_end(stf_cuda_kernel_handle k) +{ + assert(k); + k->k.end(); +} void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t) { diff --git a/c/experimental/stf/test/test_cuda_kernel.cpp b/c/experimental/stf/test/test_cuda_kernel.cpp new file mode 100644 index 00000000000..3cb3606fd3a --- /dev/null +++ b/c/experimental/stf/test/test_cuda_kernel.cpp @@ -0,0 +1,65 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +using namespace cuda::experimental::stf; + +__global__ void axpy(int cnt, double a, const double *x, double *y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < cnt; i += nthreads) + { + y[i] += a * x[i]; + } +} + +C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lX, lY; + + float *X, *Y; + X = (float*) malloc(N * sizeof(float)); + Y = (float*) malloc(N * sizeof(float)); + + stf_logical_data(ctx, &lX, X, N * sizeof(float)); + stf_logical_data(ctx, &lY, Y, N * sizeof(float)); + + stf_logical_data_set_symbol(lX, "X"); + stf_logical_data_set_symbol(lY, "Y"); + + stf_cuda_kernel_handle k; + stf_cuda_kernel_create(ctx, &k); + stf_cuda_kernel_set_symbol(k, "axpy"); + stf_cuda_kernel_add_dep(k, lX, STF_READ); + stf_cuda_kernel_add_dep(k, lY, STF_RW); + stf_cuda_kernel_start(k); + // TODO add descs + stf_cuda_kernel_end(k); + stf_cuda_kernel_destroy(k); + + stf_logical_data_destroy(lX); + stf_logical_data_destroy(lY); + + stf_ctx_finalize(ctx); + + free(X); + free(Y); +} From 08a22c9d08803ac4f1eef13de13f9d0374d87979 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Mon, 21 Jul 2025 11:48:40 +0200 Subject: [PATCH 026/221] Save WIP: we cannot directly use the pointer to a global function and pass it to a shared library, so we convert it to a CUfunction prior to calling a function in the shared library (so we do it in the header) --- c/experimental/stf/CMakeLists.txt | 2 +- .../stf/include/cccl/c/experimental/stf/stf.h | 31 ++++++++++++++ c/experimental/stf/src/stf.cu | 42 +++++++++++++++++++ ...st_cuda_kernel.cpp => test_cuda_kernel.cu} | 21 ++++++---- 4 files changed, 88 insertions(+), 8 deletions(-) rename c/experimental/stf/test/{test_cuda_kernel.cpp => test_cuda_kernel.cu} (82%) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 9d8c7130c68..5e8aa8e9f50 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -29,12 +29,12 @@ endif() find_package(CUDAToolkit REQUIRED) set_target_properties(cccl.c.experimental.stf PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) -target_link_libraries(cccl.c.experimental.stf PUBLIC CCCL::cudax) target_link_libraries(cccl.c.experimental.stf PRIVATE CUDA::cudart_static CUDA::nvrtc CUDA::nvJitLink CUDA::cuda_driver + CCCL::cudax ) # target_compile_definitions(cccl.c.experimental.stf PUBLIC CCCL_C_EXPERIMENTAL=1) # target_compile_definitions(cccl.c.experimental.stf PRIVATE diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 5e4b9ac05ff..3f118af4685 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -1,3 +1,5 @@ +#include +#include #include // TODO use CCCL_C_EXTERN_C_BEGIN/CCCL_C_EXTERN_C_END @@ -50,6 +52,35 @@ void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k); void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol); void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); void stf_cuda_kernel_start(stf_cuda_kernel_handle k); + +void stf_cuda_kernel_add_desc_cufunc( + stf_cuda_kernel_handle k, + CUfunction cufunc, + dim3 gridDim_, + dim3 blockDim_, + 
size_t sharedMem_, + int arg_cnt, + const void** args); + +/* Convert CUDA kernel address to CUfunction because we may use them from a + * shared library where this would be invalid in the runtime API. */ +static inline void stf_cuda_kernel_add_desc( + stf_cuda_kernel_handle k, + const void* func, + dim3 gridDim_, + dim3 blockDim_, + size_t sharedMem_, + int arg_cnt, + const void** args) +{ + CUfunction cufunc; + cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); + assert(res == cudaSuccess); + + stf_cuda_kernel_add_desc_cufunc(k, cufunc, gridDim_, blockDim_, sharedMem_, arg_cnt, args); +} + +void* stf_cuda_kernel_get_arg(stf_cuda_kernel_handle k, int index); void stf_cuda_kernel_end(stf_cuda_kernel_handle k); void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index c897afaa66a..63eb470b1ef 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -162,6 +162,48 @@ void stf_cuda_kernel_start(stf_cuda_kernel_handle k) k->k.start(); } +#if 0 +// +// template +// void configure_raw(Fun func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, int arg_cnt, const void** args) +void stf_cuda_kernel_add_desc(stf_cuda_kernel_handle k, const void *func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, int arg_cnt, const void** args) +{ + /* We convert the function to a CUfunction because this code is a shared + * library which cannot launch kernels using cudaLaunchKernel directly, or we + * will get invalid device function. */ + //CUfunction cufunc; + //cudaGetFuncBySymbol(&cufunc, (void *)func); + CUkernel cukernel; + cudaGetKernel(&cukernel, (void *)func); + + cuda_kernel_desc desc; + desc.configure_raw(cukernel, gridDim_, blockDim_, sharedMem_, arg_cnt, args); + + k->k.add_kernel_desc(mv(desc)); +} +#endif + +void stf_cuda_kernel_add_desc_cufunc( + stf_cuda_kernel_handle k, + CUfunction cufunc, + dim3 gridDim_, + dim3 blockDim_, + size_t sharedMem_, + int arg_cnt, + const void** args) +{ + cuda_kernel_desc desc; + desc.configure_raw(cufunc, gridDim_, blockDim_, sharedMem_, arg_cnt, args); + + k->k.add_kernel_desc(mv(desc)); +} + +void* stf_cuda_kernel_get_arg(stf_cuda_kernel_handle k, int index) +{ + auto s = k->k.template get>(index); + return (void*) s.data_handle(); +} + void stf_cuda_kernel_end(stf_cuda_kernel_handle k) { assert(k); diff --git a/c/experimental/stf/test/test_cuda_kernel.cpp b/c/experimental/stf/test/test_cuda_kernel.cu similarity index 82% rename from c/experimental/stf/test/test_cuda_kernel.cpp rename to c/experimental/stf/test/test_cuda_kernel.cu index 3cb3606fd3a..8ba6c0e90da 100644 --- a/c/experimental/stf/test/test_cuda_kernel.cpp +++ b/c/experimental/stf/test/test_cuda_kernel.cu @@ -13,17 +13,22 @@ #include #include -using namespace cuda::experimental::stf; - +#if 0 __global__ void axpy(int cnt, double a, const double *x, double *y) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int nthreads = gridDim.x * blockDim.x; - for (int i = tid; i < cnt; i += nthreads) - { - y[i] += a * x[i]; - } +// for (int i = tid; i < cnt; i += nthreads) +// { +// y[i] += a * x[i]; +// } +} +#endif + +extern "C" __global__ void axpy(int, double, const double*, double*) +{ + printf("hello.\n"); } C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") @@ -51,7 +56,9 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") stf_cuda_kernel_add_dep(k, lX, STF_READ); stf_cuda_kernel_add_dep(k, lY, STF_RW); stf_cuda_kernel_start(k); - // TODO add descs + void* dummy = nullptr; + const 
void* args[4] = {&N, &alpha, &dummy, &dummy}; + stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); stf_cuda_kernel_end(k); stf_cuda_kernel_destroy(k); From 1482bbc3a9d9311dc3212336acc8c6ff618b8fcf Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Mon, 21 Jul 2025 11:55:16 +0200 Subject: [PATCH 027/221] add missing variable --- c/experimental/stf/test/test_cuda_kernel.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/c/experimental/stf/test/test_cuda_kernel.cu b/c/experimental/stf/test/test_cuda_kernel.cu index 8ba6c0e90da..90eb7e668c5 100644 --- a/c/experimental/stf/test/test_cuda_kernel.cu +++ b/c/experimental/stf/test/test_cuda_kernel.cu @@ -44,6 +44,8 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") X = (float*) malloc(N * sizeof(float)); Y = (float*) malloc(N * sizeof(float)); + const double alpha = 3.14; + stf_logical_data(ctx, &lX, X, N * sizeof(float)); stf_logical_data(ctx, &lY, Y, N * sizeof(float)); From f45d8dabcb422668807971edf68715809e291f66 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Mon, 21 Jul 2025 11:58:07 +0200 Subject: [PATCH 028/221] Add missing finalize() call --- c/experimental/stf/src/stf.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 63eb470b1ef..70e9a972b2f 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -32,6 +32,7 @@ void stf_ctx_create(stf_ctx_handle* ctx) void stf_ctx_finalize(stf_ctx_handle ctx) { + ctx->ctx.finalize(); assert(ctx); delete ctx; } From 8b03c29b9e272aedcea89df93c700c7e9aa6042c Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Mon, 21 Jul 2025 14:46:20 +0200 Subject: [PATCH 029/221] axpy example works with cuda_kernel in C --- c/experimental/stf/test/test_cuda_kernel.cu | 32 +++++++++------------ 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/c/experimental/stf/test/test_cuda_kernel.cu b/c/experimental/stf/test/test_cuda_kernel.cu index 90eb7e668c5..e8494a1d95b 100644 --- a/c/experimental/stf/test/test_cuda_kernel.cu +++ b/c/experimental/stf/test/test_cuda_kernel.cu @@ -13,22 +13,15 @@ #include #include -#if 0 -__global__ void axpy(int cnt, double a, const double *x, double *y) +__global__ void axpy(int cnt, double a, const double* x, double* y) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int nthreads = gridDim.x * blockDim.x; -// for (int i = tid; i < cnt; i += nthreads) -// { -// y[i] += a * x[i]; -// } -} -#endif - -extern "C" __global__ void axpy(int, double, const double*, double*) -{ - printf("hello.\n"); + for (int i = tid; i < cnt; i += nthreads) + { + y[i] += a * x[i]; + } } C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") @@ -40,14 +33,14 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") stf_logical_data_handle lX, lY; - float *X, *Y; - X = (float*) malloc(N * sizeof(float)); - Y = (float*) malloc(N * sizeof(float)); + double *X, *Y; + X = (double*) malloc(N * sizeof(double)); + Y = (double*) malloc(N * sizeof(double)); const double alpha = 3.14; - stf_logical_data(ctx, &lX, X, N * sizeof(float)); - stf_logical_data(ctx, &lY, Y, N * sizeof(float)); + stf_logical_data(ctx, &lX, X, N * sizeof(double)); + stf_logical_data(ctx, &lY, Y, N * sizeof(double)); stf_logical_data_set_symbol(lX, "X"); stf_logical_data_set_symbol(lY, "Y"); @@ -58,8 +51,9 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") stf_cuda_kernel_add_dep(k, lX, STF_READ); stf_cuda_kernel_add_dep(k, lY, STF_RW); stf_cuda_kernel_start(k); - void* dummy = nullptr; - const void* args[4] = 
{&N, &alpha, &dummy, &dummy}; + double* dX = (double*) stf_cuda_kernel_get_arg(k, 0); + double* dY = (double*) stf_cuda_kernel_get_arg(k, 1); + const void* args[4] = {&N, &alpha, &dX, &dY}; stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); stf_cuda_kernel_end(k); stf_cuda_kernel_destroy(k); From 749ca3b42f1db6d411ed641faa12e9451ca8c0f8 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Mon, 21 Jul 2025 18:21:47 +0200 Subject: [PATCH 030/221] check result --- c/experimental/stf/test/test_cuda_kernel.cu | 22 +++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/c/experimental/stf/test/test_cuda_kernel.cu b/c/experimental/stf/test/test_cuda_kernel.cu index e8494a1d95b..b5ba66b0f3a 100644 --- a/c/experimental/stf/test/test_cuda_kernel.cu +++ b/c/experimental/stf/test/test_cuda_kernel.cu @@ -24,6 +24,16 @@ __global__ void axpy(int cnt, double a, const double* x, double* y) } } +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") { size_t N = 1000000; @@ -37,6 +47,12 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") X = (double*) malloc(N * sizeof(double)); Y = (double*) malloc(N * sizeof(double)); + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + const double alpha = 3.14; stf_logical_data(ctx, &lX, X, N * sizeof(double)); @@ -63,6 +79,12 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") stf_ctx_finalize(ctx); + for (size_t i = 0; i < N; i++) + { + assert(fabs(Y[i] - (Y0(i) + alpha * X0(i))) < 0.0001); + assert(fabs(X[i] - X0(i)) < 0.0001); + } + free(X); free(Y); } From 4fb70006073558ae9eedb695efa2a505cb438b7a Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Mon, 21 Jul 2025 18:42:39 +0200 Subject: [PATCH 031/221] Add ctx_token --- .../stf/include/cccl/c/experimental/stf/stf.h | 3 +- c/experimental/stf/src/stf.cu | 8 ++ c/experimental/stf/test/test_token.cpp | 78 +++++++++++++++++++ 3 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 c/experimental/stf/test/test_token.cpp diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 3f118af4685..359425aadfb 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -33,7 +33,8 @@ void stf_logical_data_destroy(stf_logical_data_handle ld); // TODO // void stf_logical_data_wait(stf_logical_data_handle ld); -// void stf_token(stf_ctx_handle ctx); + +void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld); typedef struct stf_task_handle_t* stf_task_handle; diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 70e9a972b2f..df474582d21 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -67,6 +67,14 @@ void stf_logical_data_destroy(stf_logical_data_handle ld) delete ld; } +void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) +{ + assert(ctx); + assert(ld); + + *ld = new stf_logical_data_handle_t{ctx->ctx.token()}; +} + void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) { assert(t); diff --git a/c/experimental/stf/test/test_token.cpp b/c/experimental/stf/test/test_token.cpp new file mode 100644 index 00000000000..ccd7f0a9e2c --- /dev/null +++ b/c/experimental/stf/test/test_token.cpp @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core 
Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +C2H_TEST("stf token", "[token]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lX, lY, lZ; + + float *X, *Y, *Z; + X = (float*) malloc(N * sizeof(float)); + Y = (float*) malloc(N * sizeof(float)); + Z = (float*) malloc(N * sizeof(float)); + + stf_token(ctx, &lX); + stf_token(ctx, &lY); + stf_token(ctx, &lZ); + + stf_logical_data_set_symbol(lX, "X"); + stf_logical_data_set_symbol(lY, "Y"); + stf_logical_data_set_symbol(lZ, "Z"); + + stf_task_handle t1; + stf_task_create(ctx, &t1); + stf_task_set_symbol(t1, "T1"); + stf_task_add_dep(t1, lX, STF_RW); + stf_task_start(t1); + stf_task_end(t1); + + stf_task_handle t2; + stf_task_create(ctx, &t2); + stf_task_set_symbol(t2, "T2"); + stf_task_add_dep(t2, lX, STF_READ); + stf_task_add_dep(t2, lY, STF_RW); + stf_task_start(t2); + stf_task_end(t2); + + stf_task_handle t3; + stf_task_create(ctx, &t3); + stf_task_set_symbol(t3, "T3"); + stf_task_add_dep(t3, lX, STF_READ); + stf_task_add_dep(t3, lZ, STF_RW); + stf_task_start(t3); + stf_task_end(t3); + + stf_task_handle t4; + stf_task_create(ctx, &t4); + stf_task_set_symbol(t4, "T4"); + stf_task_add_dep(t4, lY, STF_READ); + stf_task_add_dep(t4, lZ, STF_RW); + stf_task_start(t4); + stf_task_end(t4); + + stf_logical_data_destroy(lX); + stf_logical_data_destroy(lY); + stf_logical_data_destroy(lZ); + + stf_ctx_finalize(ctx); + + free(X); + free(Y); + free(Z); +} From 94757f84107aedbf2d8c0f43248e7e2f7ace8286 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 22 Jul 2025 16:52:37 +0200 Subject: [PATCH 032/221] make cudax usable in the python dir --- python/cuda_cccl/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 5fe0ba6d692..7736023692d 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -22,6 +22,7 @@ cccl_build_compiler_targets() # Build and install C++ library first set(CCCL_ENABLE_C ON) +set(CCCL_ENABLE_UNSTABLE ON) set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) add_subdirectory(${_cccl_root} _parent_cccl) @@ -29,6 +30,7 @@ add_subdirectory(${_cccl_root} _parent_cccl) find_package(CUB REQUIRED) find_package(Thrust REQUIRED) find_package(libcudacxx REQUIRED) +find_package(cudax REQUIRED) # Install headers set(_dest_incl_dir cuda/cccl/headers/include) From 189e0832acdf8a0939c8041f0e7b52b876dcf500 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Wed, 23 Jul 2025 11:30:24 +0200 Subject: [PATCH 033/221] Save WIP with python --- python/cuda_cccl/CMakeLists.txt | 23 +++++++++++ .../experimental/stf/_stf_bindings_impl.pyx | 38 +++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 7736023692d..b1f693078b1 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -114,3 +114,26 @@ target_link_libraries(_bindings_impl PRIVATE cccl.c.parallel CUDA::cuda_driver) set_target_properties(_bindings_impl PROPERTIES 
INSTALL_RPATH "$ORIGIN/cccl") install(TARGETS _bindings_impl DESTINATION cuda/cccl/parallel/experimental) + +message(STATUS "STF Using Cython ${CYTHON_VERSION}") +set(stf_pyx_source_file "${cuda_cccl_SOURCE_DIR}/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx") +set(_stf_generated_extension_src "${cuda_cccl_BINARY_DIR}/_stf_bindings_impl.c") +set(_stf_depfile "${cuda_cccl_BINARY_DIR}/_stf_bindings_impl.c.dep") +add_custom_command( + OUTPUT "${_stf_generated_extension_src}" + COMMAND "${Python3_EXECUTABLE}" -m cython + ARGS ${CYTHON_FLAGS_LIST} "${stf_pyx_source_file}" --output-file ${_stf_generated_extension_src} + DEPENDS "${stf_pyx_source_file}" + DEPFILE "${_stf_depfile}" +) +set_source_files_properties("${_stf_generated_extension_src}" PROPERTIES GENERATED TRUE) +add_custom_target(cythonize_stf_bindings_impl ALL + DEPENDS "${_stf_generated_extension_src}" +) + +Python3_add_library(_stf_bindings_impl MODULE WITH_SOABI "${_stf_generated_extension_src}") +add_dependencies(_stf_bindings_impl cythonize_stf_bindings_impl) +target_link_libraries(_stf_bindings_impl PRIVATE cccl.c.experimental.stf CUDA::cuda_driver) +set_target_properties(_stf_bindings_impl PROPERTIES INSTALL_RPATH "$ORIGIN/cccl") + +install(TARGETS _stf_bindings_impl DESTINATION cuda/cccl/experimental/stf) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx new file mode 100644 index 00000000000..913016c2b35 --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -0,0 +1,38 @@ +# distutils: language = c++ +# cython: language_level=3 +# cython: linetrace=True + +# Python signatures are declared in the companion Python stub file _bindings.pyi +# Make sure to update PYI with change to Python API to ensure that Python +# static type checker tools like mypy green-lights cuda.cccl.parallel + +from libc.string cimport memset, memcpy +from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t +from cpython.bytes cimport PyBytes_FromStringAndSize + +from cpython.buffer cimport ( + Py_buffer, PyBUF_SIMPLE, PyBUF_ANY_CONTIGUOUS, + PyBuffer_Release, PyObject_CheckBuffer, PyObject_GetBuffer +) +from cpython.pycapsule cimport ( + PyCapsule_CheckExact, PyCapsule_IsValid, PyCapsule_GetPointer +) + +import ctypes + +cdef extern from "": + cdef struct OpaqueCUstream_st + cdef struct OpaqueCUkernel_st + cdef struct OpaqueCUlibrary_st + + ctypedef int CUresult + ctypedef OpaqueCUstream_st *CUstream + ctypedef OpaqueCUkernel_st *CUkernel + ctypedef OpaqueCUlibrary_st *CUlibrary + +cdef extern from "cccl/c/experimental/stf/stf.h": + ctypedef struct stf_ctx_handle_t + ctypedef stf_ctx_handle_t* stf_ctx_handle + + void stf_ctx_create(stf_ctx_handle* ctx) + void stf_ctx_finalize(stf_ctx_handle ctx) From 3ed26acdccca4c9caa4f4660fa41b2b562b440d2 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Wed, 23 Jul 2025 17:27:52 +0200 Subject: [PATCH 034/221] fix a typo --- c/experimental/stf/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 5e8aa8e9f50..13abde52a2d 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -21,7 +21,7 @@ cccl_configure_target(cccl.c.experimental.stf DIALECT 17) # Override the properties set by cccl_configure_target: if (CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY) - set_target_properties(cccl.c.parallel PROPERTIES + 
set_target_properties(cccl.c.experimental.stf PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY}" ARCHIVE_OUTPUT_DIRECTORY "${CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY}" ) From 1a5038ad2cd650acbc1bd52881885dfdb9061493 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Wed, 23 Jul 2025 17:33:59 +0200 Subject: [PATCH 035/221] fixed in python for stf --- python/cuda_cccl/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index b1f693078b1..ff7c2c8f8ca 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -24,6 +24,7 @@ cccl_build_compiler_targets() set(CCCL_ENABLE_C ON) set(CCCL_ENABLE_UNSTABLE ON) set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) +set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) add_subdirectory(${_cccl_root} _parent_cccl) # Now we can find CUB and other components From 0c1a2ae5a306cec47a1f46117ea1bdeabc1531da Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Wed, 23 Jul 2025 18:14:20 +0200 Subject: [PATCH 036/221] Add a minimalistic Ctx class --- .../cuda/cccl/experimental/stf/_stf_bindings_impl.pyx | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 913016c2b35..53775e61df3 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -36,3 +36,14 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_ctx_create(stf_ctx_handle* ctx) void stf_ctx_finalize(stf_ctx_handle ctx) + +cdef class Ctx: + cdef stf_ctx_handle _ctx + + def __cinit__(self): + stf_ctx_create(&self._ctx) + + def __dealloc__(self): + if self._ctx != NULL: + stf_ctx_finalize(self._ctx) + self._ctx = NULL From 2f519e8dce30a0388cb7dd9e3d14c74b5e05964d Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Wed, 23 Jul 2025 19:06:12 +0200 Subject: [PATCH 037/221] Fix installation paths --- c/experimental/stf/CMakeLists.txt | 2 +- python/cuda_cccl/CMakeLists.txt | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 13abde52a2d..f151e8bf766 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -7,7 +7,7 @@ option(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING "Build cccl.experimental.c.stf tes # FIXME Ideally this would be handled by presets and install rules, but for now # consumers may override this to control the target location of cccl.c.experimental.stf. 
set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY "" CACHE PATH "Override output directory for the cccl.c.experimental.stf library") -mark_as_advanced(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY) +mark_as_advanced(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY) file(GLOB_RECURSE srcs RELATIVE "${CMAKE_CURRENT_LIST_DIR}" diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index ff7c2c8f8ca..f3fc06163e2 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -58,6 +58,11 @@ install( DESTINATION cuda/cccl/parallel/experimental/cccl ) +install( + TARGETS cccl.c.experimental.stf + DESTINATION cuda/cccl/experimental/stf/cccl +) + # Build and install Cython extension find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED) From 469ff3352482f9cf2259b5067fb0a3f6b1a7f920 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 00:50:45 +0200 Subject: [PATCH 038/221] Add a dummy STF test --- python/cuda_cccl/tests/stf/test_context.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 python/cuda_cccl/tests/stf/test_context.py diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py new file mode 100644 index 00000000000..b97922488fd --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -0,0 +1,9 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from cuda.cccl.experimental.stf import _stf_bindings_impl + +def test_ctx(): + ctx = _stf_bindings_impl.Ctx() + del ctx From c2a8fde9513b021c67d056901bd1f1843b8fdba2 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 08:12:40 +0200 Subject: [PATCH 039/221] logical_data bindings --- .../experimental/stf/_stf_bindings_impl.pyx | 49 +++++++++++++++++-- python/cuda_cccl/tests/stf/test_context.py | 15 +++++- 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 53775e61df3..2bad6f936b8 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -6,9 +6,11 @@ # Make sure to update PYI with change to Python API to ensure that Python # static type checker tools like mypy green-lights cuda.cccl.parallel -from libc.string cimport memset, memcpy -from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t +from cpython.buffer cimport Py_buffer, PyObject_GetBuffer, PyBuffer_Release from cpython.bytes cimport PyBytes_FromStringAndSize +from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t +from libc.stdint cimport uintptr_t +from libc.string cimport memset, memcpy from cpython.buffer cimport ( Py_buffer, PyBUF_SIMPLE, PyBUF_ANY_CONTIGUOUS, @@ -33,11 +35,39 @@ cdef extern from "": cdef extern from "cccl/c/experimental/stf/stf.h": ctypedef struct stf_ctx_handle_t ctypedef stf_ctx_handle_t* stf_ctx_handle - void stf_ctx_create(stf_ctx_handle* ctx) void stf_ctx_finalize(stf_ctx_handle ctx) -cdef class Ctx: + ctypedef struct stf_logical_data_handle_t + ctypedef stf_logical_data_handle_t* stf_logical_data_handle + void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) + void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) + void 
stf_logical_data_destroy(stf_logical_data_handle ld) + +# Python-visible class +cdef class logical_data: + cdef stf_logical_data_handle _ld + + def __cinit__(self, context ctx, object buf): + cdef Py_buffer view + if PyObject_GetBuffer(buf, &view, PyBUF_SIMPLE) != 0: + raise ValueError("object doesn’t support the buffer protocol") + + try: + stf_logical_data(ctx._ctx, &self._ld, view.buf, view.len) + + finally: + PyBuffer_Release(&view) + + def set_symbol(self, str name): + stf_logical_data_set_symbol(self._ld, name.encode()) + + def __dealloc__(self): + if self._ld != NULL: + stf_logical_data_destroy(self._ld) + self._ld = NULL + +cdef class context: cdef stf_ctx_handle _ctx def __cinit__(self): @@ -47,3 +77,14 @@ cdef class Ctx: if self._ctx != NULL: stf_ctx_finalize(self._ctx) self._ctx = NULL + + def logical_data(self, object buf): + """ + Create and return a `logical_data` object bound to this context. + + Parameters + ---------- + buf : any buffer‑supporting Python object + (NumPy array, bytes, bytearray, memoryview, …) + """ + return logical_data(self, buf) diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py index b97922488fd..df0c75d58fb 100644 --- a/python/cuda_cccl/tests/stf/test_context.py +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -2,8 +2,19 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -from cuda.cccl.experimental.stf import _stf_bindings_impl +from cuda.cccl.experimental.stf._stf_bindings_impl import logical_data, context +import ctypes +import numpy as np def test_ctx(): - ctx = _stf_bindings_impl.Ctx() + ctx = _stf_bindings_impl.context() + del ctx + +def test_ctx2(): + X = np.ones(16, dtype=np.float32) + Y = np.ones(16, dtype=np.float32) + + ctx = _stf_bindings_impl.context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) del ctx From 3a39aa535a074e3f4a88fc7d2183fac41dd03b19 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 09:51:13 +0200 Subject: [PATCH 040/221] deps --- .../experimental/stf/_stf_bindings_impl.pyx | 47 ++++++++++++++++++- python/cuda_cccl/tests/stf/test_context.py | 15 +++++- 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 2bad6f936b8..3c0606f6d6d 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -21,6 +21,7 @@ from cpython.pycapsule cimport ( ) import ctypes +from enum import IntFlag cdef extern from "": cdef struct OpaqueCUstream_st @@ -44,7 +45,29 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) void stf_logical_data_destroy(stf_logical_data_handle ld) -# Python-visible class + ctypedef struct stf_task_handle_t + ctypedef stf_task_handle_t* stf_task_handle + void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) + void stf_task_set_symbol(stf_task_handle t, const char* symbol) + void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m) + void stf_task_start(stf_task_handle t) + void stf_task_end(stf_task_handle t) + # cudaStream_t stf_task_get_stream(stf_task_handle t) + void* stf_task_get(stf_task_handle t, size_t submitted_index) + void stf_task_destroy(stf_task_handle t) + + cdef enum stf_access_mode: + STF_NONE + STF_READ + STF_WRITE + STF_RW + +class AccessMode(IntFlag): 
+ NONE = STF_NONE + READ = STF_READ + WRITE = STF_WRITE + RW = STF_RW + cdef class logical_data: cdef stf_logical_data_handle _ld @@ -67,6 +90,25 @@ cdef class logical_data: stf_logical_data_destroy(self._ld) self._ld = NULL +cdef class task: + cdef stf_task_handle _t + + def __cinit__(self, context ctx): + stf_task_create(ctx._ctx, &self._t) + + def __dealloc__(self): + if self._t != NULL: + stf_task_destroy(self._t) + + def start(self): + stf_task_start(self._t) + + def end(self): + stf_task_end(self._t) + + def add_dep(self, logical_data ld, int mode): + stf_task_add_dep(self._t, ld._ld, mode) + cdef class context: cdef stf_ctx_handle _ctx @@ -88,3 +130,6 @@ cdef class context: (NumPy array, bytes, bytearray, memoryview, …) """ return logical_data(self, buf) + + def task(self): + return task(self) diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py index df0c75d58fb..6c274298922 100644 --- a/python/cuda_cccl/tests/stf/test_context.py +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -from cuda.cccl.experimental.stf._stf_bindings_impl import logical_data, context +from cuda.cccl.experimental.stf._stf_bindings_impl import logical_data, context, AccessMode import ctypes import numpy as np @@ -14,7 +14,18 @@ def test_ctx2(): X = np.ones(16, dtype=np.float32) Y = np.ones(16, dtype=np.float32) - ctx = _stf_bindings_impl.context() + ctx = context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) + + t = ctx.task() + t.add_dep(lX, AccessMode.READ.value) + t.add_dep(lY, AccessMode.RW.value) + t.start() + t.end() + del ctx + +if __name__ == "__main__": + print("Running CUDASTF examples...") + test_ctx2() From 4491ff37232178afc63a92c3c36b489ec16e179b Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 10:49:41 +0200 Subject: [PATCH 041/221] better task api --- .../experimental/stf/_stf_bindings_impl.pyx | 46 +++++++++++++++++-- python/cuda_cccl/tests/stf/test_context.py | 11 +++-- 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 3c0606f6d6d..7f600ff50d0 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -90,6 +90,22 @@ cdef class logical_data: stf_logical_data_destroy(self._ld) self._ld = NULL +class dep: + __slots__ = ("ld", "mode") + def __init__(self, logical_data ld, int mode): + self.ld = ld + self.mode = mode + def __iter__(self): # nice unpacking support + yield self.ld + yield self.mode + def __repr__(self): + return f"dep({self.ld!r}, {self.mode})" + +# optional sugar +def read(ld): return dep(ld, AccessMode.READ.value) +def write(ld): return dep(ld, AccessMode.WRITE.value) +def rw(ld): return dep(ld, AccessMode.RW.value) + cdef class task: cdef stf_task_handle _t @@ -106,8 +122,18 @@ cdef class task: def end(self): stf_task_end(self._t) - def add_dep(self, logical_data ld, int mode): - stf_task_add_dep(self._t, ld._ld, mode) + def add_dep(self, object d): + """ + Accept a `dep` instance created with read(ld), write(ld), or rw(ld). 
+ """ + if not isinstance(d, dep): + raise TypeError("add_dep expects read(ld), write(ld) or rw(ld)") + + cdef logical_data ldata = d.ld + cdef int mode_int = int(d.mode) + cdef stf_access_mode mode_ce = mode_int + + stf_task_add_dep(self._t, ldata._ld, mode_ce) cdef class context: cdef stf_ctx_handle _ctx @@ -131,5 +157,17 @@ cdef class context: """ return logical_data(self, buf) - def task(self): - return task(self) + def task(self, *deps): + """ + Create a `task` + + Example + ------- + >>> t = ctx.task(read(lX), rw(lY)) + >>> t.start() + >>> t.end() + """ + t = task(self) # construct with this context + for d in deps: + t.add_dep(d) # your existing add_dep logic + return t diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py index 6c274298922..fe5d5d753bc 100644 --- a/python/cuda_cccl/tests/stf/test_context.py +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -from cuda.cccl.experimental.stf._stf_bindings_impl import logical_data, context, AccessMode +from cuda.cccl.experimental.stf._stf_bindings_impl import logical_data, context, AccessMode, read, rw, write import ctypes import numpy as np @@ -18,12 +18,15 @@ def test_ctx2(): lX = ctx.logical_data(X) lY = ctx.logical_data(Y) - t = ctx.task() - t.add_dep(lX, AccessMode.READ.value) - t.add_dep(lY, AccessMode.RW.value) + t = ctx.task(read(lX), rw(lY)) t.start() t.end() + t2 = ctx.task() + t2.add_dep(rw(lX)) + t2.start() + t2.end() + del ctx if __name__ == "__main__": From 74b430ce4b1e2011a6edcb2cadbe3d5cc04cd1e5 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 11:21:24 +0200 Subject: [PATCH 042/221] test with context managers --- .../experimental/stf/_stf_bindings_impl.pyx | 12 ++++++ python/cuda_cccl/tests/stf/test_context.py | 41 +++++++++++++++++-- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 7f600ff50d0..a1f94b955cf 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -135,6 +135,18 @@ cdef class task: stf_task_add_dep(self._t, ldata._ld, mode_ce) + # ---- context‑manager helpers ------------------------------- + def __enter__(self): + self.start() + return self + + def __exit__(self, object exc_type, object exc, object tb): + """ + Always called, even if an exception occurred inside the block. 
+ """ + self.end() + return False + cdef class context: cdef stf_ctx_handle _ctx diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py index fe5d5d753bc..8fbda4e4105 100644 --- a/python/cuda_cccl/tests/stf/test_context.py +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -13,22 +13,55 @@ def test_ctx(): def test_ctx2(): X = np.ones(16, dtype=np.float32) Y = np.ones(16, dtype=np.float32) + Z = np.ones(16, dtype=np.float32) ctx = context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Y) - t = ctx.task(read(lX), rw(lY)) + t = ctx.task(rw(lX)) t.start() t.end() - t2 = ctx.task() - t2.add_dep(rw(lX)) + t2 = ctx.task(read(lX), rw(lY)) t2.start() t2.end() + t3 = ctx.task(read(lX), rw(lZ)) + t3.start() + t3.end() + + t4 = ctx.task(read(lY), rw(lZ)) + t4.start() + t4.end() + + del ctx + +def test_ctx3(): + X = np.ones(16, dtype=np.float32) + Y = np.ones(16, dtype=np.float32) + Z = np.ones(16, dtype=np.float32) + + ctx = context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Y) + + with ctx.task(rw(lX)): + pass + + with ctx.task(read(lX), rw(lY)): + pass + + with ctx.task(read(lX), rw(lZ)): + pass + + with ctx.task(read(lY), rw(lZ)): + pass + del ctx if __name__ == "__main__": print("Running CUDASTF examples...") - test_ctx2() + test_ctx3() From 479c24b9180354fa97e60e7cfde203313c384412 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 12:49:19 +0200 Subject: [PATCH 043/221] context task get_stream --- .../experimental/__stf/internal/context.cuh | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index eb003679794..ff6b69859b4 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -251,6 +251,16 @@ public: }; } + cudaStream_t get_stream() const + { + if (auto p = ::std::get_if>(&payload)) + { + return p->get_stream(); + } + + return nullptr; + } + private: ::std::variant, graph_task> payload; }; @@ -1501,6 +1511,32 @@ UNITTEST("token vector") ctx.finalize(); }; +UNITTEST("get_stream") +{ + context ctx; + + auto token = ctx.token(); + auto t = ctx.task(token.write()); + t.start(); + cudaStream_t s = t.get_stream(); + EXPECT(s != nullptr); + t.end(); + ctx.finalize(); +}; + +UNITTEST("get_stream graph") +{ + context ctx = graph_ctx(); + + auto token = ctx.token(); + auto t = ctx.task(token.write()); + t.start(); + cudaStream_t s = t.get_stream(); + EXPECT(s == nullptr); + t.end(); + ctx.finalize(); +}; + #endif // UNITTESTED_FILE } // end namespace cuda::experimental::stf From d4286511e771b2a9d131f4ef18b307560b44d321 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 12:49:34 +0200 Subject: [PATCH 044/221] Fix python examples --- python/cuda_cccl/tests/stf/test_context.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py index 8fbda4e4105..79f443ac2cd 100644 --- a/python/cuda_cccl/tests/stf/test_context.py +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -2,14 +2,16 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -from cuda.cccl.experimental.stf._stf_bindings_impl import logical_data, context, AccessMode, read, rw, write -import ctypes import numpy as np +from 
cuda.cccl.experimental.stf._stf_bindings_impl import context, read, rw + + def test_ctx(): - ctx = _stf_bindings_impl.context() + ctx = context() del ctx + def test_ctx2(): X = np.ones(16, dtype=np.float32) Y = np.ones(16, dtype=np.float32) @@ -18,7 +20,7 @@ def test_ctx2(): ctx = context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) - lZ = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) t = ctx.task(rw(lX)) t.start() @@ -38,6 +40,7 @@ def test_ctx2(): del ctx + def test_ctx3(): X = np.ones(16, dtype=np.float32) Y = np.ones(16, dtype=np.float32) @@ -46,7 +49,7 @@ def test_ctx3(): ctx = context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) - lZ = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) with ctx.task(rw(lX)): pass @@ -62,6 +65,7 @@ def test_ctx3(): del ctx + if __name__ == "__main__": print("Running CUDASTF examples...") test_ctx3() From f7c74628e0d45e915e54b269786c77bbf746ddc0 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 12:59:05 +0200 Subject: [PATCH 045/221] fix unused var --- c/experimental/stf/include/cccl/c/experimental/stf/stf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 359425aadfb..29a882d75e0 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -75,7 +75,7 @@ static inline void stf_cuda_kernel_add_desc( const void** args) { CUfunction cufunc; - cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); + [[maybe_unused]] cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); assert(res == cudaSuccess); stf_cuda_kernel_add_desc_cufunc(k, cufunc, gridDim_, blockDim_, sharedMem_, arg_cnt, args); From 17a31b9b1d5c82c4ca65b81f9c39cbd3488c5517 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 15:26:40 +0200 Subject: [PATCH 046/221] Add const qualifiers --- cudax/include/cuda/experimental/__stf/stream/stream_task.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh index b8143ac57d7..e5552721813 100644 --- a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh @@ -73,7 +73,7 @@ public: // Returns the stream associated to that task : any asynchronous operation // in the task body should be performed asynchronously with respect to that // CUDA stream - cudaStream_t get_stream() + cudaStream_t get_stream() const { const auto& e_place = get_exec_place(); if (e_place.is_grid()) @@ -89,7 +89,7 @@ public: } // TODO use a pos4 and check that we have a grid, of the proper dimension - cudaStream_t get_stream(size_t pos) + cudaStream_t get_stream(size_t pos) const { const auto& e_place = get_exec_place(); From aaf503f0b7929a4aa51e9b1f23c608dd0770e108 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 23:33:51 +0200 Subject: [PATCH 047/221] NUMBA interop --- .../stf/include/cccl/c/experimental/stf/stf.h | 4 +- c/experimental/stf/src/stf.cu | 13 +++ .../experimental/stf/_stf_bindings_impl.pyx | 79 ++++++++++++++++++- python/cuda_cccl/tests/stf/test_numba.py | 66 ++++++++++++++++ 4 files changed, 157 insertions(+), 5 deletions(-) create mode 100644 python/cuda_cccl/tests/stf/test_numba.py diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h 
b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 29a882d75e0..639b02b503f 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -43,8 +43,8 @@ void stf_task_set_symbol(stf_task_handle t, const char* symbol); void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m); void stf_task_start(stf_task_handle t); void stf_task_end(stf_task_handle t); -cudaStream_t stf_task_get_stream(stf_task_handle t); -void* stf_task_get(stf_task_handle t, size_t submitted_index); +CUstream stf_task_get_custream(stf_task_handle t); +void* stf_task_get(stf_task_handle t, int submitted_index); void stf_task_destroy(stf_task_handle t); typedef struct stf_cuda_kernel_handle_t* stf_cuda_kernel_handle; diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index df474582d21..2d31ea6907a 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -99,6 +99,13 @@ void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_ t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m))); } +void* stf_task_get(stf_task_handle t, int index) +{ + assert(t); + auto s = t->t.template get>(index); + return (void*) s.data_handle(); +} + void stf_task_start(stf_task_handle t) { assert(t); @@ -111,6 +118,12 @@ void stf_task_end(stf_task_handle t) t->t.end(); } +CUstream stf_task_get_custream(stf_task_handle t) +{ + assert(t); + return (CUstream)t->t.get_stream(); +} + void stf_task_destroy(stf_task_handle t) { assert(t); diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index a1f94b955cf..5825dcf4365 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -7,11 +7,16 @@ # static type checker tools like mypy green-lights cuda.cccl.parallel from cpython.buffer cimport Py_buffer, PyObject_GetBuffer, PyBuffer_Release +from cpython.buffer cimport Py_buffer, PyBUF_FORMAT, PyBUF_ND, PyObject_GetBuffer, PyBuffer_Release from cpython.bytes cimport PyBytes_FromStringAndSize from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t from libc.stdint cimport uintptr_t from libc.string cimport memset, memcpy +import numpy as np +from numba import cuda + + from cpython.buffer cimport ( Py_buffer, PyBUF_SIMPLE, PyBUF_ANY_CONTIGUOUS, PyBuffer_Release, PyObject_CheckBuffer, PyObject_GetBuffer @@ -33,6 +38,9 @@ cdef extern from "": ctypedef OpaqueCUkernel_st *CUkernel ctypedef OpaqueCUlibrary_st *CUlibrary +#typedef struct CUstream_st* cudaStream_t; + + cdef extern from "cccl/c/experimental/stf/stf.h": ctypedef struct stf_ctx_handle_t ctypedef stf_ctx_handle_t* stf_ctx_handle @@ -52,8 +60,9 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m) void stf_task_start(stf_task_handle t) void stf_task_end(stf_task_handle t) + CUstream stf_task_get_custream(stf_task_handle t) # cudaStream_t stf_task_get_stream(stf_task_handle t) - void* stf_task_get(stf_task_handle t, size_t submitted_index) + void* stf_task_get(stf_task_handle t, int submitted_index) void stf_task_destroy(stf_task_handle t) cdef enum stf_access_mode: @@ -68,15 +77,39 @@ class AccessMode(IntFlag): WRITE = STF_WRITE RW = STF_RW +class stf_arg_cai: + def __init__(self, ptr, tuple shape, dtype, stream=0): + self.ptr = 
ptr # integer device pointer + self.shape = shape + self.dtype = np.dtype(dtype) + self.stream = stream # CUDA stream handle (int or 0) + self.__cuda_array_interface__ = { + 'version': 2, + 'shape': self.shape, + 'typestr': self.dtype.str, # e.g., 'view.shape[i] for i in range(view.ndim)) + self._dtype = np.dtype(view.format) stf_logical_data(ctx._ctx, &self._ld, view.buf, view.len) finally: @@ -90,6 +123,18 @@ cdef class logical_data: stf_logical_data_destroy(self._ld) self._ld = NULL + @property + def dtype(self): + """Return the dtype of the logical data.""" + return self._dtype + + @property + def shape(self): + """Return the shape of the logical data.""" + return self._shape + + + class dep: __slots__ = ("ld", "mode") def __init__(self, logical_data ld, int mode): @@ -109,12 +154,18 @@ def rw(ld): return dep(ld, AccessMode.RW.value) cdef class task: cdef stf_task_handle _t + # list of logical data in deps: we need this because we can't exchange + # dtype/shape easily through the C API of STF + cdef list _lds_args + def __cinit__(self, context ctx): stf_task_create(ctx._ctx, &self._t) + self._lds_args = [] def __dealloc__(self): if self._t != NULL: stf_task_destroy(self._t) +# self._lds_args.clear() def start(self): stf_task_start(self._t) @@ -135,6 +186,28 @@ cdef class task: stf_task_add_dep(self._t, ldata._ld, mode_ce) + self._lds_args.append(ldata) + + def stream_ptr(self) -> int: + """ + Return the raw CUstream pointer as a Python int + (memory address). Suitable for ctypes or PyCUDA. + """ + cdef CUstream s = stf_task_get_custream(self._t) + return s # cast pointer -> Py int + + def get_arg(self, index) -> int: + cdef void *ptr = stf_task_get(self._t, index) + return ptr + + def get_arg_cai(self, index): + ptr = self.get_arg(index) + return stf_arg_cai(ptr, self._lds_args[index].shape, self._lds_args[index].dtype, stream=0).__cuda_array_interface__ + + def get_arg_numba(self, index): + cai = self.get_arg_cai(index) + return cuda.from_cuda_array_interface(cai, owner=None, sync=False) + # ---- context‑manager helpers ------------------------------- def __enter__(self): self.start() diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py new file mode 100644 index 00000000000..07c724bb020 --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -0,0 +1,66 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from cuda.cccl.experimental.stf._stf_bindings_impl import logical_data, context, AccessMode, read, rw, write +import ctypes +import numpy as np +from numba import cuda +from numba.cuda.cudadrv import driver, devicearray + +@cuda.jit +def axpy(a, x, y): + i = cuda.grid(1) + if i < x.size: + y[i] = a * x[i] + y[i] + +@cuda.jit +def scale(a, x): + i = cuda.grid(1) + if i < x.size: + x[i] = a * x[i] + +def test_numba(): + X = np.ones(16, dtype=np.float32) + Y = np.ones(16, dtype=np.float32) + Z = np.ones(16, dtype=np.float32) + + ctx = context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Y) + + with ctx.task(rw(lX)) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + # dX = t.get_arg_numba(0) + dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) + scale[32, 64, nb_stream](2.0, dX) + pass + + with ctx.task(read(lX), rw(lY)) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + print(nb_stream) + dX = t.get_arg_numba(0) + dY = t.get_arg_numba(1) + axpy[32, 64, nb_stream](2.0, dX, dY) + pass + + with ctx.task(read(lX), rw(lZ)) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = t.get_arg_numba(0) + dZ = t.get_arg_numba(1) + axpy[32, 64, nb_stream](2.0, dX, dZ) + pass + + with ctx.task(read(lY), rw(lZ)) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dY = t.get_arg_numba(0) + dZ = t.get_arg_numba(1) + axpy[32, 64, nb_stream](2.0, dY, dZ) + pass + + del ctx + +if __name__ == "__main__": + print("Running CUDASTF examples...") + test_numba() From f2f7dfb93e4b1e70f0a2ad4b820fecc7c1f47e95 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 24 Jul 2025 23:36:35 +0200 Subject: [PATCH 048/221] pre-commit --- c/experimental/stf/src/stf.cu | 2 +- python/cuda_cccl/tests/stf/test_numba.py | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 2d31ea6907a..d040f471195 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -121,7 +121,7 @@ void stf_task_end(stf_task_handle t) CUstream stf_task_get_custream(stf_task_handle t) { assert(t); - return (CUstream)t->t.get_stream(); + return (CUstream) t->t.get_stream(); } void stf_task_destroy(stf_task_handle t) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 07c724bb020..d40b1a45c30 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -2,11 +2,11 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -from cuda.cccl.experimental.stf._stf_bindings_impl import logical_data, context, AccessMode, read, rw, write -import ctypes import numpy as np from numba import cuda -from numba.cuda.cudadrv import driver, devicearray + +from cuda.cccl.experimental.stf._stf_bindings_impl import context, read, rw + @cuda.jit def axpy(a, x, y): @@ -14,12 +14,14 @@ def axpy(a, x, y): if i < x.size: y[i] = a * x[i] + y[i] + @cuda.jit def scale(a, x): i = cuda.grid(1) if i < x.size: x[i] = a * x[i] + def test_numba(): X = np.ones(16, dtype=np.float32) Y = np.ones(16, dtype=np.float32) @@ -28,7 +30,7 @@ def test_numba(): ctx = context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) - lZ = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) with ctx.task(rw(lX)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -61,6 +63,7 @@ def test_numba(): del ctx + if __name__ == "__main__": print("Running 
CUDASTF examples...") test_numba() From 97c5f3a1aaa63b6cd79e663358e921ff452c8e30 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 25 Jul 2025 00:30:27 +0200 Subject: [PATCH 049/221] pre-commit --- .../experimental/stf/_stf_bindings_impl.pyx | 3 + python/cuda_cccl/tests/stf/test_numba.py | 78 ++++++++++++++++++- 2 files changed, 78 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 5825dcf4365..811019231f5 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -227,6 +227,9 @@ cdef class context: stf_ctx_create(&self._ctx) def __dealloc__(self): + self.finalize() + + def finalize(self): if self._ctx != NULL: stf_ctx_finalize(self._ctx) self._ctx = NULL diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index d40b1a45c30..ce4d13e079f 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -5,7 +5,7 @@ import numpy as np from numba import cuda -from cuda.cccl.experimental.stf._stf_bindings_impl import context, read, rw +from cuda.cccl.experimental.stf._stf_bindings_impl import context, read, rw, write @cuda.jit @@ -61,9 +61,81 @@ def test_numba(): axpy[32, 64, nb_stream](2.0, dY, dZ) pass - del ctx +@cuda.jit +def laplacian_5pt_kernel(u_in, u_out, dx, dy): + """ + Compute a 5‑point Laplacian on u_in and write the result to u_out. + + Grid‑stride 2‑D kernel. Assumes C‑contiguous (row‑major) inputs. + Boundary cells are copied unchanged. + """ + coef_x = 1.0 / (dx * dx) + coef_y = 1.0 / (dy * dy) + + i, j = cuda.grid(2) # i ↔ row (x‑index), j ↔ col (y‑index) + nx, ny = u_in.shape + + if i >= nx or j >= ny: + return # out‑of‑bounds threads do nothing + + if 0 < i < nx - 1 and 0 < j < ny - 1: + u_out[i, j] = ( + (u_in[i - 1, j] - 2.0 * u_in[i, j] + u_in[i + 1, j]) * coef_x + + (u_in[i, j - 1] - 2.0 * u_in[i, j] + u_in[i, j + 1]) * coef_y + ) + else: + # simple Dirichlet/Neumann placeholder: copy input to output + u_out[i, j] = u_in[i, j] + +def test_numba2d(): + nx, ny = 1024, 1024 + dx = 2.0 * np.pi / (nx - 1) + dy = 2.0 * np.pi / (ny - 1) + + # a smooth test field: f(x,y) = sin(x) * cos(y) + x = np.linspace(0, 2*np.pi, nx, dtype=np.float64) + y = np.linspace(0, 2*np.pi, ny, dtype=np.float64) + + u = np.sin(x)[:, None] * np.cos(y)[None, :] # shape = (nx, ny) + u_out = np.zeros_like(u) + + ctx = context() + lu = ctx.logical_data(u) + lu_out = ctx.logical_data(u_out) + with ctx.task(read(lu), write(lu_out)) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + du = t.get_arg_numba(0) + du_out = t.get_arg_numba(1) + threads_per_block = (16, 16) # 256 threads per block is a solid starting point + blocks_per_grid = ( + (nx + threads_per_block[0] - 1) // threads_per_block[0], + (ny + threads_per_block[1] - 1) // threads_per_block[1], + ) + laplacian_5pt_kernel[blocks_per_grid, threads_per_block,nb_stream](du, du_out, dx, dy) + pass + + ctx.finalize() + + u_out_ref = np.zeros_like(u) + + for i in range(1, nx - 1): # skip boundaries + for j in range(1, ny - 1): + u_out_ref[i, j] = ( + (u[i - 1, j] - 2.0 * u[i, j] + u[i + 1, j]) / dx**2 + + (u[i, j - 1] - 2.0 * u[i, j] + u[i, j + 1]) / dy**2 + ) + + # copy boundaries + u_out_ref[0, :] = u[0, :] + u_out_ref[-1, :] = u[-1, :] + u_out_ref[:, 0] = u[:, 0] + u_out_ref[:, -1] = u[:, -1] + + # compare with the GPU result + 
max_abs_diff = np.abs(u_out - u_out_ref).max() + print(f"max(|gpu - ref|) = {max_abs_diff:.3e}") if __name__ == "__main__": print("Running CUDASTF examples...") - test_numba() + test_numba2d() From a5d669d39a3b148b598d5e2d62b322400c8bb844 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 25 Jul 2025 00:30:52 +0200 Subject: [PATCH 050/221] pre-commit and stencil test --- python/cuda_cccl/tests/stf/test_numba.py | 43 +++++++++++++----------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index ce4d13e079f..f6096c61b68 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -61,6 +61,7 @@ def test_numba(): axpy[32, 64, nb_stream](2.0, dY, dZ) pass + @cuda.jit def laplacian_5pt_kernel(u_in, u_out, dx, dy): """ @@ -72,31 +73,31 @@ def laplacian_5pt_kernel(u_in, u_out, dx, dy): coef_x = 1.0 / (dx * dx) coef_y = 1.0 / (dy * dy) - i, j = cuda.grid(2) # i ↔ row (x‑index), j ↔ col (y‑index) + i, j = cuda.grid(2) # i ↔ row (x‑index), j ↔ col (y‑index) nx, ny = u_in.shape if i >= nx or j >= ny: - return # out‑of‑bounds threads do nothing + return # out‑of‑bounds threads do nothing if 0 < i < nx - 1 and 0 < j < ny - 1: - u_out[i, j] = ( - (u_in[i - 1, j] - 2.0 * u_in[i, j] + u_in[i + 1, j]) * coef_x + - (u_in[i, j - 1] - 2.0 * u_in[i, j] + u_in[i, j + 1]) * coef_y - ) + u_out[i, j] = (u_in[i - 1, j] - 2.0 * u_in[i, j] + u_in[i + 1, j]) * coef_x + ( + u_in[i, j - 1] - 2.0 * u_in[i, j] + u_in[i, j + 1] + ) * coef_y else: # simple Dirichlet/Neumann placeholder: copy input to output u_out[i, j] = u_in[i, j] + def test_numba2d(): nx, ny = 1024, 1024 dx = 2.0 * np.pi / (nx - 1) dy = 2.0 * np.pi / (ny - 1) # a smooth test field: f(x,y) = sin(x) * cos(y) - x = np.linspace(0, 2*np.pi, nx, dtype=np.float64) - y = np.linspace(0, 2*np.pi, ny, dtype=np.float64) + x = np.linspace(0, 2 * np.pi, nx, dtype=np.float64) + y = np.linspace(0, 2 * np.pi, ny, dtype=np.float64) - u = np.sin(x)[:, None] * np.cos(y)[None, :] # shape = (nx, ny) + u = np.sin(x)[:, None] * np.cos(y)[None, :] # shape = (nx, ny) u_out = np.zeros_like(u) ctx = context() @@ -107,35 +108,37 @@ def test_numba2d(): nb_stream = cuda.external_stream(t.stream_ptr()) du = t.get_arg_numba(0) du_out = t.get_arg_numba(1) - threads_per_block = (16, 16) # 256 threads per block is a solid starting point + threads_per_block = (16, 16) # 256 threads per block is a solid starting point blocks_per_grid = ( (nx + threads_per_block[0] - 1) // threads_per_block[0], (ny + threads_per_block[1] - 1) // threads_per_block[1], ) - laplacian_5pt_kernel[blocks_per_grid, threads_per_block,nb_stream](du, du_out, dx, dy) - pass + laplacian_5pt_kernel[blocks_per_grid, threads_per_block, nb_stream]( + du, du_out, dx, dy + ) + pass ctx.finalize() - u_out_ref = np.zeros_like(u) + u_out_ref = np.zeros_like(u) - for i in range(1, nx - 1): # skip boundaries + for i in range(1, nx - 1): # skip boundaries for j in range(1, ny - 1): - u_out_ref[i, j] = ( - (u[i - 1, j] - 2.0 * u[i, j] + u[i + 1, j]) / dx**2 + - (u[i, j - 1] - 2.0 * u[i, j] + u[i, j + 1]) / dy**2 - ) + u_out_ref[i, j] = (u[i - 1, j] - 2.0 * u[i, j] + u[i + 1, j]) / dx**2 + ( + u[i, j - 1] - 2.0 * u[i, j] + u[i, j + 1] + ) / dy**2 # copy boundaries - u_out_ref[0, :] = u[0, :] + u_out_ref[0, :] = u[0, :] u_out_ref[-1, :] = u[-1, :] - u_out_ref[:, 0] = u[:, 0] + u_out_ref[:, 0] = u[:, 0] u_out_ref[:, -1] = u[:, -1] # compare with the GPU result max_abs_diff = np.abs(u_out - 
u_out_ref).max() print(f"max(|gpu - ref|) = {max_abs_diff:.3e}") + if __name__ == "__main__": print("Running CUDASTF examples...") test_numba2d() From 686b9880eeadec8f83f443e0ebe5b86e656d20a0 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 25 Jul 2025 08:29:20 +0200 Subject: [PATCH 051/221] make it possible to create a graph_ctx --- c/experimental/stf/include/cccl/c/experimental/stf/stf.h | 2 ++ c/experimental/stf/src/stf.cu | 8 ++++++++ .../cuda/cccl/experimental/stf/_stf_bindings_impl.pyx | 8 ++++++-- python/cuda_cccl/tests/stf/test_context.py | 5 +++++ 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 639b02b503f..0e515b80eb7 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -18,6 +18,8 @@ typedef enum stf_access_mode typedef struct stf_ctx_handle_t* stf_ctx_handle; void stf_ctx_create(stf_ctx_handle* ctx); +// TODO stf_ctx_create_with_flags and an enum instead ? +void stf_ctx_create_graph(stf_ctx_handle* ctx); void stf_ctx_finalize(stf_ctx_handle ctx); // TODO stf_ctx_set_mode() + define enum with GRAPH, STREAM, ... diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index d040f471195..3c305d04dd2 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -30,6 +30,14 @@ void stf_ctx_create(stf_ctx_handle* ctx) } } +void stf_ctx_create_graph(stf_ctx_handle* ctx) +{ + if (ctx) + { + *ctx = new stf_ctx_handle_t{context{graph_ctx()}}; + } +} + void stf_ctx_finalize(stf_ctx_handle ctx) { ctx->ctx.finalize(); diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 811019231f5..9597d199d33 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -45,6 +45,7 @@ cdef extern from "cccl/c/experimental/stf/stf.h": ctypedef struct stf_ctx_handle_t ctypedef stf_ctx_handle_t* stf_ctx_handle void stf_ctx_create(stf_ctx_handle* ctx) + void stf_ctx_create_graph(stf_ctx_handle* ctx) void stf_ctx_finalize(stf_ctx_handle ctx) ctypedef struct stf_logical_data_handle_t @@ -223,8 +224,11 @@ cdef class task: cdef class context: cdef stf_ctx_handle _ctx - def __cinit__(self): - stf_ctx_create(&self._ctx) + def __cinit__(self, bint use_graph=False): + if use_graph: + stf_ctx_create_graph(&self._ctx) + else: + stf_ctx_create(&self._ctx) def __dealloc__(self): self.finalize() diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py index 79f443ac2cd..5a29199dc67 100644 --- a/python/cuda_cccl/tests/stf/test_context.py +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -12,6 +12,11 @@ def test_ctx(): del ctx +def test_graph_ctx(): + ctx = context(use_graph=True) + ctx.finalize() + + def test_ctx2(): X = np.ones(16, dtype=np.float32) Y = np.ones(16, dtype=np.float32) From b4688fdd76a7f69adecc6458f56dbd55ec0178e9 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sat, 2 Aug 2025 12:41:40 +0200 Subject: [PATCH 052/221] implement set_exec_place for cuda_kernel and unified tasks --- .../experimental/__stf/internal/context.cuh | 32 +++++++++++++++++++ .../__stf/internal/cuda_kernel_scope.cuh | 6 ++++ 2 files changed, 38 insertions(+) diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh 
b/cudax/include/cuda/experimental/__stf/internal/context.cuh index ff6b69859b4..6487356f61d 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -94,6 +94,22 @@ class context }; } + auto&& set_exec_place(exec_place e_place) & + { + payload->*[&](auto& self) { + self.set_exec_place(mv(e_place)); + }; + return *this; + } + + auto&& set_exec_place(exec_place e_place) && + { + payload->*[&](auto& self) { + self.set_exec_place(mv(e_place)); + }; + return mv(*this); + } + auto& set_symbol(::std::string s) & { payload->*[&](auto& self) { @@ -195,6 +211,22 @@ public: return mv(*this); } + auto&& set_exec_place(exec_place e_place) & + { + payload->*[&](auto& self) { + self.set_exec_place(mv(e_place)); + }; + return *this; + } + + auto&& set_exec_place(exec_place e_place) && + { + payload->*[&](auto& self) { + self.set_exec_place(mv(e_place)); + }; + return mv(*this); + } + auto& start() { payload->*[&](auto& self) { diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index f22ec6a5da0..66b54d4becc 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -299,6 +299,12 @@ public: // move-constructible cuda_kernel_scope(cuda_kernel_scope&&) = default; + auto& set_exec_place(exec_place e_place_) + { + e_place = mv(e_place_); + return *this; + } + /// Add a set of dependencies template void add_deps(task_dep_untyped first, Pack&&... pack) From bd474d6ca8025121637f401f639c81f7584ea7f9 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sat, 2 Aug 2025 12:42:10 +0200 Subject: [PATCH 053/221] Define some execution places in the C API --- .../stf/include/cccl/c/experimental/stf/stf.h | 44 +++++++++++++++++++ c/experimental/stf/src/stf.cu | 24 ++++++++++ 2 files changed, 68 insertions(+) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 0e515b80eb7..592c3bb8ac1 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -15,6 +15,48 @@ typedef enum stf_access_mode STF_RW = STF_READ | STF_WRITE } stf_access_mode; +struct stf_exec_place_device +{ + int dev_id; +}; + +struct stf_exec_place_host +{ + char dummy; /* dummy to keep it standard C which does not allow empty structs */ +}; + +typedef enum stf_exec_place_kind +{ + STF_EXEC_PLACE_DEVICE, + STF_EXEC_PLACE_HOST +} stf_exec_place_kind; + +struct stf_exec_place +{ + enum stf_exec_place_kind kind; + union + { + struct stf_exec_place_device device; + struct stf_exec_place_host host; + } u; +}; + +static inline struct stf_exec_place make_device_place(int dev_id) +{ + struct stf_exec_place p; + p.kind = STF_EXEC_PLACE_DEVICE; + p.u.device.dev_id = dev_id; + return p; +} + +static inline struct stf_exec_place make_host_place() +{ + struct stf_exec_place p; + p.kind = STF_EXEC_PLACE_HOST; + p.u.host.dummy = 0; /* to avoid uninitialized memory warnings */ + return p; +} + typedef struct stf_ctx_handle_t* stf_ctx_handle; void stf_ctx_create(stf_ctx_handle* ctx); @@ -41,6 +83,7 @@ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld); typedef struct stf_task_handle_t* stf_task_handle; void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t); +void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* 
exec_p); void stf_task_set_symbol(stf_task_handle t, const char* symbol); void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m); void stf_task_start(stf_task_handle t); @@ -52,6 +95,7 @@ void stf_task_destroy(stf_task_handle t); typedef struct stf_cuda_kernel_handle_t* stf_cuda_kernel_handle; void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k); +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p); void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol); void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); void stf_cuda_kernel_start(stf_cuda_kernel_handle k); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 3c305d04dd2..a9e3e019734 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -83,6 +83,18 @@ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) *ld = new stf_logical_data_handle_t{ctx->ctx.token()}; } +/* Convert the C-API stf_exec_place to a C++ exec_place object */ +exec_place to_exec_place(stf_exec_place* exec_p) +{ + if (exec_p->kind == STF_EXEC_PLACE_HOST) + { + return exec_place::host(); + } + + assert(exec_p->kind == STF_EXEC_PLACE_DEVICE); + return exec_place::device(exec_p->u.device.dev_id); +} + void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) { assert(t); @@ -91,6 +103,12 @@ void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) *t = new stf_task_handle_t{ctx->ctx.task()}; } +void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p) +{ + assert(t); + t->t.set_exec_place(to_exec_place(exec_p)); +} + void stf_task_set_symbol(stf_task_handle t, const char* symbol) { assert(t); @@ -170,6 +188,12 @@ void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k) *k = new stf_cuda_kernel_handle_t{ctx->ctx.cuda_kernel()}; } +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p) +{ + assert(k); + k->k.set_exec_place(to_exec_place(exec_p)); +} + void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol) { assert(k); From 0b6e93a5dd04088eca1787466d75016a7ae884dc Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sat, 2 Aug 2025 19:43:59 +0200 Subject: [PATCH 054/221] WIP: start to support execution places --- .../stf/include/cccl/c/experimental/stf/stf.h | 9 +- c/experimental/stf/src/stf.cu | 6 +- .../experimental/stf/_stf_bindings_impl.pyx | 82 ++++++++++++++++++- python/cuda_cccl/tests/stf/test_numba.py | 50 ++++++++++- 4 files changed, 137 insertions(+), 10 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 592c3bb8ac1..97cf89e5261 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -57,6 +57,11 @@ static inline struct stf_exec_place make_host_place() return p; } +typedef struct stf_exec_place_device stf_exec_place_device; +typedef struct stf_exec_place_host stf_exec_place_host; +typedef union stf_exec_place_u stf_exec_place_u; +typedef struct stf_exec_place stf_exec_place; + typedef struct stf_ctx_handle_t* stf_ctx_handle; void stf_ctx_create(stf_ctx_handle* ctx); @@ -83,7 +88,7 @@ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld); typedef struct stf_task_handle_t* stf_task_handle; void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t); -void 
stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p); +void stf_task_set_exec_place(stf_task_handle t, struct stf_exec_place* exec_p); void stf_task_set_symbol(stf_task_handle t, const char* symbol); void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m); void stf_task_start(stf_task_handle t); @@ -95,7 +100,7 @@ void stf_task_destroy(stf_task_handle t); typedef struct stf_cuda_kernel_handle_t* stf_cuda_kernel_handle; void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k); -void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p); +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, struct stf_exec_place* exec_p); void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol); void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); void stf_cuda_kernel_start(stf_cuda_kernel_handle k); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index a9e3e019734..a879b8f8859 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -84,7 +84,7 @@ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) } /* Convert the C-API stf_exec_place to a C++ exec_place object */ -exec_place to_exec_place(stf_exec_place* exec_p) +exec_place to_exec_place(struct stf_exec_place* exec_p) { if (exec_p->kind == STF_EXEC_PLACE_HOST) { @@ -103,7 +103,7 @@ void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) *t = new stf_task_handle_t{ctx->ctx.task()}; } -void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p) +void stf_task_set_exec_place(stf_task_handle t, struct stf_exec_place* exec_p) { assert(t); t->t.set_exec_place(to_exec_place(exec_p)); @@ -188,7 +188,7 @@ void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k) *k = new stf_cuda_kernel_handle_t{ctx->ctx.cuda_kernel()}; } -void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p) +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, struct stf_exec_place* exec_p) { assert(k); k->k.set_exec_place(to_exec_place(exec_p)); diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 9597d199d33..eb5e6b48252 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -42,12 +42,39 @@ cdef extern from "": cdef extern from "cccl/c/experimental/stf/stf.h": + # + # Contexts + # ctypedef struct stf_ctx_handle_t ctypedef stf_ctx_handle_t* stf_ctx_handle void stf_ctx_create(stf_ctx_handle* ctx) void stf_ctx_create_graph(stf_ctx_handle* ctx) void stf_ctx_finalize(stf_ctx_handle ctx) + # + # Exec places + # + ctypedef enum stf_exec_place_kind: + STF_EXEC_PLACE_DEVICE + STF_EXEC_PLACE_HOST + + ctypedef struct stf_exec_place_device: + int dev_id + + ctypedef struct stf_exec_place_host: + int dummy + + ctypedef union stf_exec_place_u: + stf_exec_place_device device + stf_exec_place_host host + + ctypedef struct stf_exec_place: + stf_exec_place_kind kind + stf_exec_place_u u + + stf_exec_place make_device_place(int dev_id) + stf_exec_place make_host_place() + ctypedef struct stf_logical_data_handle_t ctypedef stf_logical_data_handle_t* stf_logical_data_handle void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) @@ -57,6 +84,7 @@ cdef extern from 
"cccl/c/experimental/stf/stf.h": ctypedef struct stf_task_handle_t ctypedef stf_task_handle_t* stf_task_handle void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) + void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p) void stf_task_set_symbol(stf_task_handle t, const char* symbol) void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m) void stf_task_start(stf_task_handle t) @@ -152,6 +180,36 @@ def read(ld): return dep(ld, AccessMode.READ.value) def write(ld): return dep(ld, AccessMode.WRITE.value) def rw(ld): return dep(ld, AccessMode.RW.value) +cdef class ExecPlace: + cdef stf_exec_place _c_place + + def __cinit__(self): + # empty default constructor; never directly used + pass + + @staticmethod + def device(int dev_id): + cdef ExecPlace p = ExecPlace.__new__(ExecPlace) + p._c_place = make_device_place(dev_id) + return p + + @staticmethod + def host(): + cdef ExecPlace p = ExecPlace.__new__(ExecPlace) + p._c_place = make_host_place() + return p + + @property + def kind(self) -> str: + return ("device" if self._c_place.kind == STF_EXEC_PLACE_DEVICE + else "host") + + @property + def device_id(self) -> int: + if self._c_place.kind != STF_EXEC_PLACE_DEVICE: + raise AttributeError("not a device execution place") + return self._c_place.u.device.dev_id + cdef class task: cdef stf_task_handle _t @@ -189,6 +247,13 @@ cdef class task: self._lds_args.append(ldata) + def set_exec_place(self, object exec_p): + if not isinstance(exec_p, ExecPlace): + raise TypeError("set_exec_place expects and ExecPlace argument") + + cdef ExecPlace ep = exec_p + stf_task_set_exec_place(self._t, &ep._c_place) + def stream_ptr(self) -> int: """ Return the raw CUstream pointer as a Python int @@ -249,7 +314,7 @@ cdef class context: """ return logical_data(self, buf) - def task(self, *deps): + def task(self, *args): """ Create a `task` @@ -259,7 +324,18 @@ cdef class context: >>> t.start() >>> t.end() """ + exec_place_set = False t = task(self) # construct with this context - for d in deps: - t.add_dep(d) # your existing add_dep logic + for d in args: + if isinstance(d, dep): + t.add_dep(d) + elif isinstance(d, ExecPlace): + if exec_place_set: + raise ValueError("Only one ExecPlace can be given") + t.set_exec_place(d) + exec_place_set = True + else: + raise TypeError( + "Arguments must be dependency objects or an ExecPlace" + ) return t diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index f6096c61b68..3420036642b 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -5,7 +5,13 @@ import numpy as np from numba import cuda -from cuda.cccl.experimental.stf._stf_bindings_impl import context, read, rw, write +from cuda.cccl.experimental.stf._stf_bindings_impl import ( + ExecPlace, + context, + read, + rw, + write, +) @cuda.jit @@ -139,6 +145,46 @@ def test_numba2d(): print(f"max(|gpu - ref|) = {max_abs_diff:.3e}") +def test_numba_exec_place(): + X = np.ones(16, dtype=np.float32) + Y = np.ones(16, dtype=np.float32) + Z = np.ones(16, dtype=np.float32) + + ctx = context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) + + with ctx.task(ExecPlace.device(0), rw(lX)) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + # dX = t.get_arg_numba(0) + dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) + scale[32, 64, nb_stream](2.0, dX) + pass + + with ctx.task(ExecPlace.device(0), read(lX), rw(lY)) as t: + 
nb_stream = cuda.external_stream(t.stream_ptr()) + print(nb_stream) + dX = t.get_arg_numba(0) + dY = t.get_arg_numba(1) + axpy[32, 64, nb_stream](2.0, dX, dY) + pass + + with ctx.task(ExecPlace.device(0), read(lX), rw(lZ)) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = t.get_arg_numba(0) + dZ = t.get_arg_numba(1) + axpy[32, 64, nb_stream](2.0, dX, dZ) + pass + + with ctx.task(ExecPlace.device(0), read(lY), rw(lZ)) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dY = t.get_arg_numba(0) + dZ = t.get_arg_numba(1) + axpy[32, 64, nb_stream](2.0, dY, dZ) + pass + + if __name__ == "__main__": print("Running CUDASTF examples...") - test_numba2d() + test_numba_exec_place() From ff9d70af26e8a904e65df8ae58e4905eed67ee82 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 3 Aug 2025 00:26:50 +0200 Subject: [PATCH 055/221] set_exec_place should also set the data place --- cudax/include/cuda/experimental/__stf/internal/task.cuh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/task.cuh b/cudax/include/cuda/experimental/__stf/internal/task.cuh index d5ac78a1b8b..93d2e330c3d 100644 --- a/cudax/include/cuda/experimental/__stf/internal/task.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/task.cuh @@ -271,13 +271,16 @@ public: { return pimpl->e_place; } + exec_place& get_exec_place() { return pimpl->e_place; } + void set_exec_place(const exec_place& place) { - pimpl->e_place = place; + // This will both update the execution place and the affine data place + on(place); } /// Get and Set the affine data place of the task From c610c42c8a70b1832aaf50617921a077c57578c5 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 3 Aug 2025 01:09:52 +0200 Subject: [PATCH 056/221] rename ExecPlace to exec_place --- .../experimental/stf/_stf_bindings_impl.pyx | 18 +++++++++--------- python/cuda_cccl/tests/stf/test_numba.py | 10 +++++----- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index eb5e6b48252..71767f84253 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -180,7 +180,7 @@ def read(ld): return dep(ld, AccessMode.READ.value) def write(ld): return dep(ld, AccessMode.WRITE.value) def rw(ld): return dep(ld, AccessMode.RW.value) -cdef class ExecPlace: +cdef class exec_place: cdef stf_exec_place _c_place def __cinit__(self): @@ -189,13 +189,13 @@ cdef class ExecPlace: @staticmethod def device(int dev_id): - cdef ExecPlace p = ExecPlace.__new__(ExecPlace) + cdef exec_place p = exec_place.__new__(exec_place) p._c_place = make_device_place(dev_id) return p @staticmethod def host(): - cdef ExecPlace p = ExecPlace.__new__(ExecPlace) + cdef exec_place p = exec_place.__new__(exec_place) p._c_place = make_host_place() return p @@ -248,10 +248,10 @@ cdef class task: self._lds_args.append(ldata) def set_exec_place(self, object exec_p): - if not isinstance(exec_p, ExecPlace): - raise TypeError("set_exec_place expects and ExecPlace argument") + if not isinstance(exec_p, exec_place): + raise TypeError("set_exec_place expects and exec_place argument") - cdef ExecPlace ep = exec_p + cdef exec_place ep = exec_p stf_task_set_exec_place(self._t, &ep._c_place) def stream_ptr(self) -> int: @@ -329,13 +329,13 @@ cdef class context: for d in args: if isinstance(d, dep): 
t.add_dep(d) - elif isinstance(d, ExecPlace): + elif isinstance(d, exec_place): if exec_place_set: - raise ValueError("Only one ExecPlace can be given") + raise ValueError("Only one exec_place can be given") t.set_exec_place(d) exec_place_set = True else: raise TypeError( - "Arguments must be dependency objects or an ExecPlace" + "Arguments must be dependency objects or an exec_place" ) return t diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 3420036642b..cf18b447c39 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -6,8 +6,8 @@ from numba import cuda from cuda.cccl.experimental.stf._stf_bindings_impl import ( - ExecPlace, context, + exec_place, read, rw, write, @@ -155,14 +155,14 @@ def test_numba_exec_place(): lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) - with ctx.task(ExecPlace.device(0), rw(lX)) as t: + with ctx.task(exec_place.device(0), rw(lX)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) # dX = t.get_arg_numba(0) dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) scale[32, 64, nb_stream](2.0, dX) pass - with ctx.task(ExecPlace.device(0), read(lX), rw(lY)) as t: + with ctx.task(exec_place.device(0), read(lX), rw(lY)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) print(nb_stream) dX = t.get_arg_numba(0) @@ -170,14 +170,14 @@ def test_numba_exec_place(): axpy[32, 64, nb_stream](2.0, dX, dY) pass - with ctx.task(ExecPlace.device(0), read(lX), rw(lZ)) as t: + with ctx.task(exec_place.device(0), read(lX), rw(lZ)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dZ) pass - with ctx.task(ExecPlace.device(0), read(lY), rw(lZ)) as t: + with ctx.task(exec_place.device(0), read(lY), rw(lZ)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dY = t.get_arg_numba(0) dZ = t.get_arg_numba(1) From f65702b7270b03ece2f76170e56a5ea3469e168d Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 3 Aug 2025 02:02:28 +0200 Subject: [PATCH 057/221] Save WIP: start to implement data places (not compiling yet) --- .../stf/include/cccl/c/experimental/stf/stf.h | 81 +++++++++++++++++++ c/experimental/stf/src/stf.cu | 34 ++++++++ .../experimental/stf/_stf_bindings_impl.pyx | 57 ++++++++++++- python/cuda_cccl/tests/stf/test_numba.py | 8 +- 4 files changed, 172 insertions(+), 8 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 97cf89e5261..caa8769a8b6 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -62,6 +62,85 @@ typedef struct stf_exec_place_host stf_exec_place_host; typedef union stf_exec_place_u stf_exec_place_u; typedef struct stf_exec_place stf_exec_place; +struct stf_data_place_device +{ + int dev_id; +}; + +struct stf_data_place_host +{ + char dummy; /* dummy to keep it standard C which does not allow empty structs */ +}; + +struct stf_data_place_managed +{ + char dummy; /* dummy to keep it standard C which does not allow empty structs */ +}; + +struct stf_data_place_affine +{ + char dummy; /* dummy to keep it standard C which does not allow empty structs */ +}; + +typedef enum stf_data_place_kind +{ + STF_DATA_PLACE_DEVICE, + STF_DATA_PLACE_HOST, + STF_DATA_PLACE_MANAGED, + STF_DATA_PLACE_AFFINE +} stf_data_place_kind; + +struct stf_data_place +{ + enum 
stf_data_place_kind kind; + union + { + struct stf_data_place_device device; + struct stf_data_place_host host; + struct stf_data_place_managed managed; + struct stf_data_place_affine affine; + } u; +}; + +static inline struct stf_data_place make_device_data_place(int dev_id) +{ + struct stf_data_place p; + p.kind = STF_DATA_PLACE_DEVICE; + p.u.device.dev_id = dev_id; + return p; +} + +static inline struct stf_data_place make_host_data_place() +{ + struct stf_data_place p; + p.kind = STF_DATA_PLACE_HOST; + p.u.host.dummy = 0; /* to avoid uninitialized memory warnings */ + return p; +} + +static inline struct stf_data_place make_managed_data_place() +{ + struct stf_data_place p; + p.kind = STF_DATA_PLACE_MANAGED; + p.u.managed.dummy = 0; /* to avoid uninitialized memory warnings */ + return p; +} + +static inline struct stf_data_place make_affine_data_place() +{ + struct stf_data_place p; + p.kind = STF_DATA_PLACE_AFFINE; + p.u.affine.dummy = 0; /* to avoid uninitialized memory warnings */ + return p; +} + +typedef struct stf_data_place_device stf_data_place_device; +typedef struct stf_data_place_host stf_data_place_host; +typedef struct stf_data_place_managed stf_data_place_managed; +typedef struct stf_data_place_affine stf_data_place_affine; +typedef union stf_data_place_u stf_data_place_u; +typedef struct stf_data_place stf_data_place; + typedef struct stf_ctx_handle_t* stf_ctx_handle; void stf_ctx_create(stf_ctx_handle* ctx); @@ -91,6 +170,8 @@ void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t); void stf_task_set_exec_place(stf_task_handle t, struct stf_exec_place* exec_p); void stf_task_set_symbol(stf_task_handle t, const char* symbol); void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m); +void stf_task_add_dep_with_dplace( + stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, struct stf_data_place* data_p); void stf_task_start(stf_task_handle t); void stf_task_end(stf_task_handle t); CUstream stf_task_get_custream(stf_task_handle t); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index a879b8f8859..33b4cf92489 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -95,6 +95,30 @@ exec_place to_exec_place(struct stf_exec_place* exec_p) return exec_place::device(exec_p->u.device.dev_id); } +/* Convert the C-API stf_data_place to a C++ data_place object */ +data_place to_data_place(struct stf_data_place* data_p) +{ + assert(data_p); + + if (data_p->kind == STF_DATA_PLACE_HOST) + { + return data_place::host(); + } + + if (data_p->kind == STF_DATA_PLACE_MANAGED) + { + return data_place::managed(); + } + + if (data_p->kind == STF_DATA_PLACE_AFFINE) + { + return data_place::affine(); + } + + assert(data_p->kind == STF_DATA_PLACE_DEVICE); + return data_place::device(data_p->u.device.dev_id); +} + void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) { assert(t); @@ -125,6 +149,16 @@ void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_ t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m))); } +void stf_task_add_dep_with_dplace( + stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, struct stf_data_place* data_p) +{ + assert(t); + assert(ld); + assert(data_p); + + t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m), to_data_place(data_p))); +} + void* stf_task_get(stf_task_handle t, int index) { assert(t); diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx 
b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 71767f84253..85ffb87aae6 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -75,6 +75,42 @@ cdef extern from "cccl/c/experimental/stf/stf.h": stf_exec_place make_device_place(int dev_id) stf_exec_place make_host_place() + # + # Data places + # + ctypedef enum stf_data_place_kind: + STF_DATA_PLACE_DEVICE + STF_DATA_PLACE_HOST + STF_DATA_PLACE_MANAGED + STF_DATA_PLACE_AFFINE + + ctypedef struct stf_data_place_device: + int dev_id + + ctypedef struct stf_data_place_host: + int dummy + + ctypedef struct stf_data_place_managed: + int dummy + + ctypedef struct stf_data_place_affine: + int dummy + + ctypedef union stf_data_place_u: + stf_data_place_device device + stf_data_place_host host + stf_data_place_managed managed + stf_data_place_affine affine + + ctypedef struct stf_data_place: + stf_data_place_kind kind + stf_data_place_u u + + stf_data_place make_device_data_place(int dev_id) + stf_data_place make_host_data_place() + stf_data_place make_managed_data_place() + stf_data_place make_affine_data_place() + ctypedef struct stf_logical_data_handle_t ctypedef stf_logical_data_handle_t* stf_logical_data_handle void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) @@ -87,6 +123,7 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p) void stf_task_set_symbol(stf_task_handle t, const char* symbol) void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m) + void stf_task_add_dep_with_dplace(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, stf_data_place* data_p) void stf_task_start(stf_task_handle t) void stf_task_end(stf_task_handle t) CUstream stf_task_get_custream(stf_task_handle t) @@ -162,20 +199,28 @@ cdef class logical_data: """Return the shape of the logical data.""" return self._shape + def read(self): + return dep(self, AccessMode.READ.value) + def write(self): + return dep(self, AccessMode.WRITE.value) + + def rw(self): + return dep(self, AccessMode.RW.value) class dep: __slots__ = ("ld", "mode") - def __init__(self, logical_data ld, int mode): + def __init__(self, logical_data ld, int mode, dplace=None): self.ld = ld self.mode = mode + self.dplace = dplace # can be None or a data place def __iter__(self): # nice unpacking support yield self.ld yield self.mode + yield self.dplace def __repr__(self): - return f"dep({self.ld!r}, {self.mode})" + return f"dep({self.ld!r}, {self.mode}, {self.place!r})" -# optional sugar def read(ld): return dep(ld, AccessMode.READ.value) def write(ld): return dep(ld, AccessMode.WRITE.value) def rw(ld): return dep(ld, AccessMode.RW.value) @@ -243,7 +288,11 @@ cdef class task: cdef int mode_int = int(d.mode) cdef stf_access_mode mode_ce = mode_int - stf_task_add_dep(self._t, ldata._ld, mode_ce) + if d.dplace is None: + stf_task_add_dep(self._t, ldata._ld, mode_ce) + else: + cdef stf_data_place dplace = d.dplace + stf_task_add_dep_with_dplace(self._t, ldata._ld, mode_ce, &dplace) self._lds_args.append(ldata) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index cf18b447c39..16fac79c3b0 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -155,14 +155,14 @@ def test_numba_exec_place(): lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) 
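+    # (illustrative) lX.rw() below is the method-style spelling of rw(lX);
+    # the read()/write()/rw() helpers also accept an optional data place to
+    # choose where the data is backed for that access.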
- with ctx.task(exec_place.device(0), rw(lX)) as t: + with ctx.task(exec_place.device(0), lX.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) # dX = t.get_arg_numba(0) dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) scale[32, 64, nb_stream](2.0, dX) pass - with ctx.task(exec_place.device(0), read(lX), rw(lY)) as t: + with ctx.task(exec_place.device(0), lX.read(), lY.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) print(nb_stream) dX = t.get_arg_numba(0) @@ -170,14 +170,14 @@ def test_numba_exec_place(): axpy[32, 64, nb_stream](2.0, dX, dY) pass - with ctx.task(exec_place.device(0), read(lX), rw(lZ)) as t: + with ctx.task(exec_place.device(0), lX.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dZ) pass - with ctx.task(exec_place.device(0), read(lY), rw(lZ)) as t: + with ctx.task(exec_place.device(0), lY.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dY = t.get_arg_numba(0) dZ = t.get_arg_numba(1) From 21c94a6100915b37a46a1bda7764d3bd02365377 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 3 Aug 2025 08:53:11 +0200 Subject: [PATCH 058/221] fix data places --- .../experimental/stf/_stf_bindings_impl.pyx | 58 ++++++++++++++++++- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 85ffb87aae6..46a8fcb37d3 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -255,6 +255,59 @@ cdef class exec_place: raise AttributeError("not a device execution place") return self._c_place.u.device.dev_id +cdef class data_place: + cdef stf_data_place _c_place + + def __cinit__(self): + # empty default constructor; never directly used + pass + + @staticmethod + def device(int dev_id): + cdef data_place p = data_place.__new__(exec_place) + p._c_place = make_device_data_place(dev_id) + return p + + @staticmethod + def host(): + cdef data_place p = data_place.__new__(exec_place) + p._c_place = make_host_data_place() + return p + + @staticmethod + def managed(): + cdef data_place p = data_place.__new__(exec_place) + p._c_place = make_managed_data_place() + return p + + @staticmethod + def affine(): + cdef data_place p = data_place.__new__(exec_place) + p._c_place = make_affine_data_place() + return p + + @property + def kind(self) -> str: + cdef stf_data_place_kind k = self._c_place.kind + if k == STF_DATA_PLACE_DEVICE: + return "device" + elif k == STF_DATA_PLACE_HOST: + return "host" + elif k == STF_DATA_PLACE_MANAGED: + return "managed" + elif k == STF_DATA_PLACE_AFFINE: + return "affine" + else: + raise ValueError(f"Unknown data place kind: {k}") + + @property + def device_id(self) -> int: + if self._c_place.kind != STF_DATA_PLACE_DEVICE: + raise AttributeError("not a device data place") + return self._c_place.u.device.dev_id + + + cdef class task: cdef stf_task_handle _t @@ -287,12 +340,13 @@ cdef class task: cdef logical_data ldata = d.ld cdef int mode_int = int(d.mode) cdef stf_access_mode mode_ce = mode_int + cdef data_place dp if d.dplace is None: stf_task_add_dep(self._t, ldata._ld, mode_ce) else: - cdef stf_data_place dplace = d.dplace - stf_task_add_dep_with_dplace(self._t, ldata._ld, mode_ce, &dplace) + dp = d.dplace + stf_task_add_dep_with_dplace(self._t, 
ldata._ld, mode_ce, &dp._c_place) self._lds_args.append(ldata) From f863ecd97e3713287718365ff4301d9d6939fea4 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 3 Aug 2025 09:02:30 +0200 Subject: [PATCH 059/221] Add data places in deps --- .../experimental/stf/_stf_bindings_impl.pyx | 28 +++++++++---------- python/cuda_cccl/tests/stf/test_numba.py | 5 +++- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 46a8fcb37d3..85585b507ee 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -199,17 +199,17 @@ cdef class logical_data: """Return the shape of the logical data.""" return self._shape - def read(self): - return dep(self, AccessMode.READ.value) + def read(self, dplace=None): + return dep(self, AccessMode.READ.value, dplace) - def write(self): - return dep(self, AccessMode.WRITE.value) + def write(self, dplace=None): + return dep(self, AccessMode.WRITE.value, dplace) - def rw(self): - return dep(self, AccessMode.RW.value) + def rw(self, dplace=None): + return dep(self, AccessMode.RW.value, dplace) class dep: - __slots__ = ("ld", "mode") + __slots__ = ("ld", "mode", "dplace") def __init__(self, logical_data ld, int mode, dplace=None): self.ld = ld self.mode = mode @@ -221,9 +221,9 @@ class dep: def __repr__(self): return f"dep({self.ld!r}, {self.mode}, {self.place!r})" -def read(ld): return dep(ld, AccessMode.READ.value) -def write(ld): return dep(ld, AccessMode.WRITE.value) -def rw(ld): return dep(ld, AccessMode.RW.value) +def read(ld, dplace=None): return dep(ld, AccessMode.READ.value, dplace) +def write(ld, dplace=None): return dep(ld, AccessMode.WRITE.value, dplace) +def rw(ld, dplace=None): return dep(ld, AccessMode.RW.value, dplace) cdef class exec_place: cdef stf_exec_place _c_place @@ -264,25 +264,25 @@ cdef class data_place: @staticmethod def device(int dev_id): - cdef data_place p = data_place.__new__(exec_place) + cdef data_place p = data_place.__new__(data_place) p._c_place = make_device_data_place(dev_id) return p @staticmethod def host(): - cdef data_place p = data_place.__new__(exec_place) + cdef data_place p = data_place.__new__(data_place) p._c_place = make_host_data_place() return p @staticmethod def managed(): - cdef data_place p = data_place.__new__(exec_place) + cdef data_place p = data_place.__new__(data_place) p._c_place = make_managed_data_place() return p @staticmethod def affine(): - cdef data_place p = data_place.__new__(exec_place) + cdef data_place p = data_place.__new__(data_place) p._c_place = make_affine_data_place() return p diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 16fac79c3b0..d3292e2fe16 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -7,6 +7,7 @@ from cuda.cccl.experimental.stf._stf_bindings_impl import ( context, + data_place, exec_place, read, rw, @@ -170,7 +171,9 @@ def test_numba_exec_place(): axpy[32, 64, nb_stream](2.0, dX, dY) pass - with ctx.task(exec_place.device(0), lX.read(), lZ.rw()) as t: + with ctx.task( + exec_place.device(0), lX.read(data_place.managed()), lZ.rw(data_place.managed()) + ) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) dZ = t.get_arg_numba(1) From 11b66735b57f6f03d7f25ed56c6267f40777f8a0 Mon Sep 17 00:00:00 2001 From: 
Cedric Augonnet Date: Sun, 3 Aug 2025 09:29:01 +0200 Subject: [PATCH 060/221] test with places --- python/cuda_cccl/tests/stf/test_numba.py | 39 ++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index d3292e2fe16..3a565b0a8d7 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -188,6 +188,45 @@ def test_numba_exec_place(): pass +def test_numba_places(): + X = np.ones(16, dtype=np.float32) + Y = np.ones(16, dtype=np.float32) + Z = np.ones(16, dtype=np.float32) + + ctx = context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) + + with ctx.task(lX.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = t.get_arg_numba(0) + scale[32, 64, nb_stream](2.0, dX) + pass + + with ctx.task(lX.read(), lY.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + print(nb_stream) + dX = t.get_arg_numba(0) + dY = t.get_arg_numba(1) + axpy[32, 64, nb_stream](2.0, dX, dY) + pass + + with ctx.task(exec_place.device(1), lX.read(), lZ.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = t.get_arg_numba(0) + dZ = t.get_arg_numba(1) + axpy[32, 64, nb_stream](2.0, dX, dZ) + pass + + with ctx.task(lY.read(), lZ.rw(data_place.device(1))) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dY = t.get_arg_numba(0) + dZ = t.get_arg_numba(1) + axpy[32, 64, nb_stream](2.0, dY, dZ) + pass + + if __name__ == "__main__": print("Running CUDASTF examples...") test_numba_exec_place() From e422712ccdb012e04fc8585eb14957664a42566a Mon Sep 17 00:00:00 2001 From: root Date: Mon, 4 Aug 2025 21:14:58 +0000 Subject: [PATCH 061/221] fix previous merge --- .../experimental/__stf/internal/cuda_kernel_scope.cuh | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index b6c5864aec6..d06bfec0fdd 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -58,13 +58,7 @@ struct cuda_kernel_desc template cuda_kernel_desc(Fun func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, Args... args) { - configure(mv(func), gridDim_, blockDim_, sharedMem_, ::std::forward(args)...); - } - - template - void configure(Fun func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, Args... args) - { - configure(mv(func), gridDim_, blockDim_, sharedMem_, mv(args)...); + configure(mv(func), gridDim_, blockDim_, sharedMem_, mv(args)); } template From 1bb8b4378770b984cee5f8ba51af24eee13d73a8 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 4 Aug 2025 21:19:16 +0000 Subject: [PATCH 062/221] typo fix --- .../cuda/experimental/__stf/internal/cuda_kernel_scope.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index d06bfec0fdd..809b217237d 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -58,7 +58,7 @@ struct cuda_kernel_desc template cuda_kernel_desc(Fun func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, Args... 
args) { - configure(mv(func), gridDim_, blockDim_, sharedMem_, mv(args)); + configure(mv(func), gridDim_, blockDim_, sharedMem_, mv(args)...); } template From fc8d5eb4ea870b3aa13b5b05667b551e84e01167 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 4 Aug 2025 22:19:27 +0000 Subject: [PATCH 063/221] Save WIP: try to implement a new decorator for STF --- .../cuda/cccl/experimental/stf/__init__.py | 17 +++++ .../cuda/cccl/experimental/stf/decorator.py | 63 +++++++++++++++++++ python/cuda_cccl/tests/stf/test_decorator.py | 29 +++++++++ 3 files changed, 109 insertions(+) create mode 100644 python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py create mode 100644 python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py create mode 100644 python/cuda_cccl/tests/stf/test_decorator.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py new file mode 100644 index 00000000000..bef32849b2a --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py @@ -0,0 +1,17 @@ +from ._stf_bindings_impl import ( + context, + dep, + exec_place, + data_place, +) + +from .decorator import jit # Python-side kernel launcher + +__all__ = [ + "context", + "dep", + "exec_place", + "data_place", + "jit", +] + diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py new file mode 100644 index 00000000000..cd31c3d746d --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -0,0 +1,63 @@ +from numba import cuda +from cuda.cccl.experimental.stf import context, dep, exec_place + + +class _CudaSTFKernel: + def __init__(self, numba_kernel): + self._nkern = numba_kernel + self._launch_cfg = None # (gridDim, blockDim, context, exec_place?) 
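    # Launch-configuration flow of this wrapper, as implemented below:
    # indexing the kernel with `kernel[gridDim, blockDim, ctx]` (plus an
    # optional exec_place as a 4th item) goes through __getitem__, which
    # validates the items, stores them in self._launch_cfg and returns self.
    # The call that follows opens an STF task whose dependencies are the
    # `dep` arguments (in order), replaces each dep with its device view
    # via t.get_arg_numba(i), and launches the numba kernel on the task's
    # stream; non-dep arguments such as scalars pass through unchanged.
    # Intended usage, e.g.:
    #
    #     scale[32, 64, ctx](2.0, lX.rw())
    #     axpy[32, 64, ctx, exec_place.device(0)](2.0, lX.read(), lZ.rw())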
+ + def __getitem__(self, cfg): + if not (len(cfg) == 3 or len(cfg) == 4): + raise TypeError("use kernel[gridDim, blockDim, ctx (, exec_place)]") + + gridDim, blockDim, ctx, *rest = cfg + if not isinstance(ctx, context): + raise TypeError("3rd item must be an STF context") + + exec_pl = rest[0] if rest else None + if exec_pl and not isinstance(exec_pl, exec_place): + raise TypeError("4th item must be an exec_place") + + self._launch_cfg = (int(gridDim), int(blockDim), ctx, exec_pl) + return self + + def __call__(self, *args, **kwargs): + if self._launch_cfg is None: + raise RuntimeError("launch configuration missing – use kernel[grid, block, ctx](…)") + + gridDim, blockDim, ctx, exec_pl = self._launch_cfg + + dep_items = [(i, a) for i, a in enumerate(args) if isinstance(a, dep)] + if not dep_items: + raise TypeError("at least one argument must be an STF dep") + + task_args = [exec_pl] if exec_pl else [] + task_args.extend(a for _, a in dep_items) + + with ctx.task(*task_args) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dev_args = list(args) + for dep_index, (pos, _) in enumerate(dep_items): + dev_args[pos] = t.get_arg_numba(dep_index) + + self._nkern[gridDim, blockDim, nb_stream](*dev_args, **kwargs) + + return None + + +def jit(*jit_args, **jit_kwargs): + if jit_args and callable(jit_args[0]): + pyfunc = jit_args[0] + return _build_kernel(pyfunc, (), **jit_kwargs) + + def _decorator(fn): + return _build_kernel(fn, jit_args, **jit_kwargs) + + return _decorator + + +def _build_kernel(pyfunc, jit_args, **jit_kwargs): + numba_kernel = cuda.jit(*jit_args, **jit_kwargs)(pyfunc) + return _CudaSTFKernel(numba_kernel) + diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py new file mode 100644 index 00000000000..269a7ebb70c --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -0,0 +1,29 @@ +import numpy as np +from numba import cuda +import cuda.cccl.experimental.stf as cudastf + + +@cudastf.jit +def axpy(a, x, y): + i = cuda.grid(1) + if i < x.size: + y[i] = a * x[i] + y[i] + +@cudastf.jit +def scale(a, x): + i = cuda.grid(1) + if i < x.size: + x[i] = a * x[i] + +X, Y, Z = (np.ones(16, np.float32) for _ in range(3)) + +ctx = cudastf.context() +lX = ctx.logical_data(X) +lY = ctx.logical_data(Y) +lZ = ctx.logical_data(Z) + +scale[32, 64, ctx](2.0, lX.rw()) +axpy[32, 64, ctx](2.0, lX.read(), lY.rw()) # default device +axpy[32, 64, ctx, exec_place.device(1)](2.0, lX.read(), lZ.rw()) # explicit exec place +axpy[32, 64, ctx](2.0, lY.read(), lZ.rw(data_place.device(1))) # per-dep placement override + From 167f6c57344e971b25e5ff486c63f61cd30807c3 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 4 Aug 2025 22:39:57 +0000 Subject: [PATCH 064/221] fix typo --- .../cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 85585b507ee..01e3e2f0132 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -219,7 +219,7 @@ class dep: yield self.mode yield self.dplace def __repr__(self): - return f"dep({self.ld!r}, {self.mode}, {self.place!r})" + return f"dep({self.ld!r}, {self.mode}, {self.dplace!r})" def read(ld, dplace=None): return dep(ld, AccessMode.READ.value, dplace) def write(ld, dplace=None): return dep(ld, 
AccessMode.WRITE.value, dplace) From 95104efa1f1fade13f651fdf9e9c4dc9d669b426 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 5 Aug 2025 08:08:29 +0000 Subject: [PATCH 065/221] Defer compilation until we know types --- .../cuda/cccl/experimental/stf/decorator.py | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index cd31c3d746d..10b13b8ca4b 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -3,8 +3,11 @@ class _CudaSTFKernel: - def __init__(self, numba_kernel): - self._nkern = numba_kernel + def __init__(self, pyfunc, jit_args, jit_kwargs): + self._pyfunc = pyfunc + self._jit_args = jit_args + self._jit_kwargs = jit_kwargs + self._compiled_kernel = None self._launch_cfg = None # (gridDim, blockDim, context, exec_place?) def __getitem__(self, cfg): @@ -28,20 +31,28 @@ def __call__(self, *args, **kwargs): gridDim, blockDim, ctx, exec_pl = self._launch_cfg - dep_items = [(i, a) for i, a in enumerate(args) if isinstance(a, dep)] - if not dep_items: - raise TypeError("at least one argument must be an STF dep") + dep_items = [] + for i, a in enumerate(args): + print(f'got one arg {a} is dep ? {isinstance(a, dep)}') + if isinstance(a, dep): + dep_items.append((i, a)) task_args = [exec_pl] if exec_pl else [] task_args.extend(a for _, a in dep_items) with ctx.task(*task_args) as t: - nb_stream = cuda.external_stream(t.stream_ptr()) dev_args = list(args) + print(dev_args) for dep_index, (pos, _) in enumerate(dep_items): + print(f'set arg {dep_index} at position {pos}') dev_args[pos] = t.get_arg_numba(dep_index) - self._nkern[gridDim, blockDim, nb_stream](*dev_args, **kwargs) + if self._compiled_kernel is None: + print("compile kernel") + self._compiled_kernel = cuda.jit(*self._jit_args, **self._jit_kwargs)(self._pyfunc) + + nb_stream = cuda.external_stream(t.stream_ptr()) + self._compiled_kernel[grid, block, stream](*dev_args, **kwargs) return None @@ -58,6 +69,5 @@ def _decorator(fn): def _build_kernel(pyfunc, jit_args, **jit_kwargs): - numba_kernel = cuda.jit(*jit_args, **jit_kwargs)(pyfunc) - return _CudaSTFKernel(numba_kernel) + return _CudaSTFKernel(pyfunc, jit_args, jit_kwargs) From 920f335e07838a78e0f3e2fb52fa150e531616c7 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 6 Aug 2025 10:17:50 +0000 Subject: [PATCH 066/221] Add numba-cuda as a dependency --- python/cuda_cccl/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index a181dee4bfb..cce490aa9e3 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ "nvidia-cuda-nvrtc-cu12", "nvidia-nvjitlink-cu12", "pynvjitlink-cu12>=0.2.4", + "numba-cuda", ] dynamic = ["version"] readme = { file = "README.md", content-type = "text/markdown" } From 587f33bd56eee1cd177f7bd98349deecdca67791 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 6 Aug 2025 10:27:25 +0000 Subject: [PATCH 067/221] Replace use of pynvjitlink patch --- .../cuda_cccl/tests/cooperative/examples/block/reduce.py | 3 +-- python/cuda_cccl/tests/cooperative/examples/block/scan.py | 3 +-- python/cuda_cccl/tests/cooperative/examples/warp/reduce.py | 3 +-- python/cuda_cccl/tests/cooperative/test_block_load.py | 3 +-- .../tests/cooperative/test_block_load_store_api.py | 3 +-- 
.../cuda_cccl/tests/cooperative/test_block_merge_sort.py | 3 +-- .../tests/cooperative/test_block_merge_sort_api.py | 6 +++--- .../cuda_cccl/tests/cooperative/test_block_radix_sort.py | 3 +-- .../tests/cooperative/test_block_radix_sort_api.py | 4 ++-- python/cuda_cccl/tests/cooperative/test_block_reduce.py | 5 +---- .../cuda_cccl/tests/cooperative/test_block_reduce_api.py | 7 +++---- python/cuda_cccl/tests/cooperative/test_block_scan.py | 5 +---- python/cuda_cccl/tests/cooperative/test_block_scan_api.py | 6 ++---- python/cuda_cccl/tests/cooperative/test_block_store.py | 3 +-- python/cuda_cccl/tests/cooperative/test_warp_merge_sort.py | 3 +-- .../tests/cooperative/test_warp_merge_sort_api.py | 5 +---- python/cuda_cccl/tests/cooperative/test_warp_reduce.py | 5 +---- python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py | 3 +-- python/cuda_cccl/tests/cooperative/test_warp_scan.py | 5 +---- python/cuda_cccl/tests/cooperative/test_warp_scan_api.py | 3 +-- 20 files changed, 26 insertions(+), 55 deletions(-) diff --git a/python/cuda_cccl/tests/cooperative/examples/block/reduce.py b/python/cuda_cccl/tests/cooperative/examples/block/reduce.py index 52bea0b7dc4..6daf679bd15 100644 --- a/python/cuda_cccl/tests/cooperative/examples/block/reduce.py +++ b/python/cuda_cccl/tests/cooperative/examples/block/reduce.py @@ -9,12 +9,11 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 def custom_reduce_example(): diff --git a/python/cuda_cccl/tests/cooperative/examples/block/scan.py b/python/cuda_cccl/tests/cooperative/examples/block/scan.py index ac01342d653..5b055fa8d6f 100644 --- a/python/cuda_cccl/tests/cooperative/examples/block/scan.py +++ b/python/cuda_cccl/tests/cooperative/examples/block/scan.py @@ -9,12 +9,11 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 def exclusive_sum_multiple_items_example(): diff --git a/python/cuda_cccl/tests/cooperative/examples/warp/reduce.py b/python/cuda_cccl/tests/cooperative/examples/warp/reduce.py index 3267d7c9d87..8afce70c665 100644 --- a/python/cuda_cccl/tests/cooperative/examples/warp/reduce.py +++ b/python/cuda_cccl/tests/cooperative/examples/warp/reduce.py @@ -9,12 +9,11 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 def custom_warp_reduce_example(): diff --git a/python/cuda_cccl/tests/cooperative/test_block_load.py b/python/cuda_cccl/tests/cooperative/test_block_load.py index 4765d19c8cf..79c2f4c522f 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_load.py +++ b/python/cuda_cccl/tests/cooperative/test_block_load.py @@ -9,11 +9,10 @@ import pytest from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid from numba import cuda, types -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git a/python/cuda_cccl/tests/cooperative/test_block_load_store_api.py 
b/python/cuda_cccl/tests/cooperative/test_block_load_store_api.py index 2ec97fd78fd..96458fe3ed8 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_load_store_api.py +++ b/python/cuda_cccl/tests/cooperative/test_block_load_store_api.py @@ -6,11 +6,10 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 # example-end imports numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git a/python/cuda_cccl/tests/cooperative/test_block_merge_sort.py b/python/cuda_cccl/tests/cooperative/test_block_merge_sort.py index 3c2bed0f70a..0bf30c939f9 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_merge_sort.py +++ b/python/cuda_cccl/tests/cooperative/test_block_merge_sort.py @@ -10,11 +10,10 @@ import pytest from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid from numba import cuda, types -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git a/python/cuda_cccl/tests/cooperative/test_block_merge_sort_api.py b/python/cuda_cccl/tests/cooperative/test_block_merge_sort_api.py index 6c3113e672f..99f75255c69 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_merge_sort_api.py +++ b/python/cuda_cccl/tests/cooperative/test_block_merge_sort_api.py @@ -5,14 +5,14 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 - +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 # example-begin imports -patch.patch_numba_linker(lto=True) + + # example-end imports diff --git a/python/cuda_cccl/tests/cooperative/test_block_radix_sort.py b/python/cuda_cccl/tests/cooperative/test_block_radix_sort.py index e65b73dd6e5..a7e65974a31 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_radix_sort.py +++ b/python/cuda_cccl/tests/cooperative/test_block_radix_sort.py @@ -9,12 +9,11 @@ import pytest from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid from numba import cuda, types -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @pytest.mark.parametrize("T", [types.int8, types.int16, types.uint32, types.uint64]) diff --git a/python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py b/python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py index a90d17daf4e..9efa7ff24bf 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py +++ b/python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py @@ -5,14 +5,14 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 # example-begin imports -patch.patch_numba_linker(lto=True) + # example-end imports diff --git a/python/cuda_cccl/tests/cooperative/test_block_reduce.py b/python/cuda_cccl/tests/cooperative/test_block_reduce.py index 2ced3782261..d7bffc14f9f 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_reduce.py +++ b/python/cuda_cccl/tests/cooperative/test_block_reduce.py @@ -16,14 +16,11 @@ row_major_tid, ) from numba import cuda, types 
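# The same two-line substitution is applied throughout this commit: the
# explicit pynvjitlink linker patch is dropped in favour of numba's own
# configuration switch.  Before/after pattern (module level, next to the
# other numba.config settings):
#
#     # before
#     from pynvjitlink import patch
#     patch.patch_numba_linker(lto=True)
#
#     # after
#     numba.config.CUDA_ENABLE_PYNVJITLINK = 1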
-from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 - - -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @pytest.mark.parametrize( diff --git a/python/cuda_cccl/tests/cooperative/test_block_reduce_api.py b/python/cuda_cccl/tests/cooperative/test_block_reduce_api.py index 54824f01f53..edd25e26a9c 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_reduce_api.py +++ b/python/cuda_cccl/tests/cooperative/test_block_reduce_api.py @@ -6,15 +6,14 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) -# example-end imports - +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +# example-end imports + def test_block_reduction(): # example-begin reduce diff --git a/python/cuda_cccl/tests/cooperative/test_block_scan.py b/python/cuda_cccl/tests/cooperative/test_block_scan.py index c3d238b6452..36ad00db053 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_scan.py +++ b/python/cuda_cccl/tests/cooperative/test_block_scan.py @@ -34,7 +34,6 @@ type_callable, typeof_impl, ) -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop from cuda.cccl.cooperative.experimental.block._block_scan import ( @@ -42,9 +41,7 @@ ) numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 - -# Patching the Numba linker to enable LTO as needed. -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 class BlockPrefixCallbackOp: diff --git a/python/cuda_cccl/tests/cooperative/test_block_scan_api.py b/python/cuda_cccl/tests/cooperative/test_block_scan_api.py index cf222872903..d5cbc4f09f2 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_scan_api.py +++ b/python/cuda_cccl/tests/cooperative/test_block_scan_api.py @@ -5,14 +5,12 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 - # example-begin imports -patch.patch_numba_linker(lto=True) +numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 # example-end imports diff --git a/python/cuda_cccl/tests/cooperative/test_block_store.py b/python/cuda_cccl/tests/cooperative/test_block_store.py index fe1b19ed37d..de101df3d07 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_store.py +++ b/python/cuda_cccl/tests/cooperative/test_block_store.py @@ -9,11 +9,10 @@ import pytest from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid from numba import cuda, types -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git a/python/cuda_cccl/tests/cooperative/test_warp_merge_sort.py b/python/cuda_cccl/tests/cooperative/test_warp_merge_sort.py index 085e50f38d7..bda89598441 100644 --- a/python/cuda_cccl/tests/cooperative/test_warp_merge_sort.py +++ b/python/cuda_cccl/tests/cooperative/test_warp_merge_sort.py @@ -6,11 +6,10 @@ import pytest from helpers import NUMBA_TYPES_TO_NP, random_int from numba import cuda, types -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git 
a/python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py b/python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py index 1f062ba53a7..c8cd3b7d544 100644 --- a/python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py +++ b/python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py @@ -5,14 +5,11 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop -numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 - # example-begin imports -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 # example-end imports diff --git a/python/cuda_cccl/tests/cooperative/test_warp_reduce.py b/python/cuda_cccl/tests/cooperative/test_warp_reduce.py index 4d2c7c6502b..8900c281815 100644 --- a/python/cuda_cccl/tests/cooperative/test_warp_reduce.py +++ b/python/cuda_cccl/tests/cooperative/test_warp_reduce.py @@ -7,14 +7,11 @@ import pytest from helpers import NUMBA_TYPES_TO_NP, random_int from numba import cuda, types -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 - - -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @pytest.mark.parametrize("T", [types.uint32, types.uint64]) diff --git a/python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py b/python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py index 1c5845512c4..4c09fd26c14 100644 --- a/python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py +++ b/python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py @@ -5,14 +5,13 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 # example-begin imports -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 # example-end imports diff --git a/python/cuda_cccl/tests/cooperative/test_warp_scan.py b/python/cuda_cccl/tests/cooperative/test_warp_scan.py index 5f3e9a310d7..ea35ec77dd5 100644 --- a/python/cuda_cccl/tests/cooperative/test_warp_scan.py +++ b/python/cuda_cccl/tests/cooperative/test_warp_scan.py @@ -7,14 +7,11 @@ import pytest from helpers import NUMBA_TYPES_TO_NP, random_int from numba import cuda, types -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 - - -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @pytest.mark.parametrize("T", [types.uint32, types.uint64]) diff --git a/python/cuda_cccl/tests/cooperative/test_warp_scan_api.py b/python/cuda_cccl/tests/cooperative/test_warp_scan_api.py index 5661635b3a7..108da881ef9 100644 --- a/python/cuda_cccl/tests/cooperative/test_warp_scan_api.py +++ b/python/cuda_cccl/tests/cooperative/test_warp_scan_api.py @@ -5,14 +5,13 @@ import numba import numpy as np from numba import cuda -from pynvjitlink import patch import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 # example-begin imports -patch.patch_numba_linker(lto=True) +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 # example-end imports From 9db83a2958d87dbeafe4cabbaa7d50c4de251d58 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Wed, 6 Aug 2025 08:30:25 -0400 Subject: [PATCH 068/221] Update pyproject.toml There's a bug in cuda-bindings 12.9.0 that prevents us from using CUDA 13 driver --- python/cuda_cccl/pyproject.toml | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index cce490aa9e3..6b687933827 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -19,7 +19,7 @@ requires-python = ">=3.9" dependencies = [ "numba>=0.60.0", "numpy", - "cuda-python==12.9.0", + "cuda-bindings>=12.9.1,<13.0.0", "cuda-core", "nvidia-cuda-nvrtc-cu12", "nvidia-nvjitlink-cu12", From 865d337894e4558b1f2ca6b692a16bfee588c47b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 7 Aug 2025 10:52:55 +0200 Subject: [PATCH 069/221] better class name --- python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index cd31c3d746d..8855b11efae 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -2,7 +2,7 @@ from cuda.cccl.experimental.stf import context, dep, exec_place -class _CudaSTFKernel: +class stf_kernel_decorator: def __init__(self, numba_kernel): self._nkern = numba_kernel self._launch_cfg = None # (gridDim, blockDim, context, exec_place?) @@ -24,7 +24,7 @@ def __getitem__(self, cfg): def __call__(self, *args, **kwargs): if self._launch_cfg is None: - raise RuntimeError("launch configuration missing – use kernel[grid, block, ctx](…)") + raise RuntimeError("launch configuration missing – use kernel[grid, block, ctx](...)") gridDim, blockDim, ctx, exec_pl = self._launch_cfg @@ -59,5 +59,5 @@ def _decorator(fn): def _build_kernel(pyfunc, jit_args, **jit_kwargs): numba_kernel = cuda.jit(*jit_args, **jit_kwargs)(pyfunc) - return _CudaSTFKernel(numba_kernel) + return stf_kernel_decorator(numba_kernel) From d22396062ed19bd4f14db4b9a4ddd4aaccc900fb Mon Sep 17 00:00:00 2001 From: root Date: Thu, 7 Aug 2025 10:44:07 +0000 Subject: [PATCH 070/221] fixes to make cudastf.jit decorator work --- python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py | 5 +++-- python/cuda_cccl/tests/stf/test_decorator.py | 4 ++-- python/cuda_cccl/tests/stf/test_numba.py | 2 ++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index b7ed155cd2f..87d286b7124 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -1,6 +1,7 @@ from numba import cuda +import numba from cuda.cccl.experimental.stf import context, dep, exec_place - +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 class stf_kernel_decorator: def __init__(self, pyfunc, jit_args, jit_kwargs): @@ -52,7 +53,7 @@ def __call__(self, *args, **kwargs): self._compiled_kernel = cuda.jit(*self._jit_args, **self._jit_kwargs)(self._pyfunc) nb_stream = cuda.external_stream(t.stream_ptr()) - self._compiled_kernel[grid, block, stream](*dev_args, **kwargs) + self._compiled_kernel[gridDim, blockDim, nb_stream](*dev_args, **kwargs) return None diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 269a7ebb70c..1a6db0f721d 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -24,6 +24,6 @@ def scale(a, x): scale[32, 64, ctx](2.0, lX.rw()) axpy[32, 64, ctx](2.0, lX.read(), lY.rw()) # default device -axpy[32, 64, ctx, 
exec_place.device(1)](2.0, lX.read(), lZ.rw()) # explicit exec place -axpy[32, 64, ctx](2.0, lY.read(), lZ.rw(data_place.device(1))) # per-dep placement override +axpy[32, 64, ctx, cudastf.exec_place.device(0)](2.0, lX.read(), lZ.rw()) # explicit exec place +axpy[32, 64, ctx](2.0, lY.read(), lZ.rw(cudastf.data_place.device(0))) # per-dep placement override diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 3a565b0a8d7..59737cd3060 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -4,6 +4,8 @@ import numpy as np from numba import cuda +import numba +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 from cuda.cccl.experimental.stf._stf_bindings_impl import ( context, From 15c2db0d13b4618a66d617dff8d4816eea6e9545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 7 Aug 2025 23:07:59 +0200 Subject: [PATCH 071/221] revert some changes --- python/cuda_cccl/tests/cooperative/examples/block/reduce.py | 1 - python/cuda_cccl/tests/cooperative/examples/block/scan.py | 1 - python/cuda_cccl/tests/cooperative/examples/warp/reduce.py | 1 - python/cuda_cccl/tests/cooperative/test_block_radix_sort.py | 1 - python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py | 1 + python/cuda_cccl/tests/cooperative/test_warp_scan.py | 1 + 6 files changed, 2 insertions(+), 4 deletions(-) diff --git a/python/cuda_cccl/tests/cooperative/examples/block/reduce.py b/python/cuda_cccl/tests/cooperative/examples/block/reduce.py index 6daf679bd15..d6ac819a4d8 100644 --- a/python/cuda_cccl/tests/cooperative/examples/block/reduce.py +++ b/python/cuda_cccl/tests/cooperative/examples/block/reduce.py @@ -13,7 +13,6 @@ import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 def custom_reduce_example(): diff --git a/python/cuda_cccl/tests/cooperative/examples/block/scan.py b/python/cuda_cccl/tests/cooperative/examples/block/scan.py index 5b055fa8d6f..cae62454e04 100644 --- a/python/cuda_cccl/tests/cooperative/examples/block/scan.py +++ b/python/cuda_cccl/tests/cooperative/examples/block/scan.py @@ -13,7 +13,6 @@ import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 def exclusive_sum_multiple_items_example(): diff --git a/python/cuda_cccl/tests/cooperative/examples/warp/reduce.py b/python/cuda_cccl/tests/cooperative/examples/warp/reduce.py index 8afce70c665..357efc13b42 100644 --- a/python/cuda_cccl/tests/cooperative/examples/warp/reduce.py +++ b/python/cuda_cccl/tests/cooperative/examples/warp/reduce.py @@ -13,7 +13,6 @@ import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 def custom_warp_reduce_example(): diff --git a/python/cuda_cccl/tests/cooperative/test_block_radix_sort.py b/python/cuda_cccl/tests/cooperative/test_block_radix_sort.py index a7e65974a31..bb51a020a7c 100644 --- a/python/cuda_cccl/tests/cooperative/test_block_radix_sort.py +++ b/python/cuda_cccl/tests/cooperative/test_block_radix_sort.py @@ -13,7 +13,6 @@ import cuda.cccl.cooperative.experimental as coop numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @pytest.mark.parametrize("T", [types.int8, types.int16, types.uint32, types.uint64]) diff --git a/python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py 
b/python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py index 98f090b80f4..398abbc6a7b 100644 --- a/python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py +++ b/python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py @@ -8,6 +8,7 @@ import cuda.cccl.cooperative.experimental as coop + def test_warp_merge_sort(): # example-begin merge-sort # Define comparison operator diff --git a/python/cuda_cccl/tests/cooperative/test_warp_scan.py b/python/cuda_cccl/tests/cooperative/test_warp_scan.py index 1b283bebab6..afb81d3fcaa 100644 --- a/python/cuda_cccl/tests/cooperative/test_warp_scan.py +++ b/python/cuda_cccl/tests/cooperative/test_warp_scan.py @@ -12,6 +12,7 @@ numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + @pytest.mark.parametrize("T", [types.uint32, types.uint64]) def test_warp_exclusive_sum(T): warp_exclusive_sum = coop.warp.exclusive_sum(dtype=T) From 011e2919dc5f873f3da065e1f18b9e9a0bdcc6e9 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 7 Aug 2025 22:47:50 +0000 Subject: [PATCH 072/221] support tuple configs --- python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index 87d286b7124..01929e5d4d5 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -23,7 +23,13 @@ def __getitem__(self, cfg): if exec_pl and not isinstance(exec_pl, exec_place): raise TypeError("4th item must be an exec_place") - self._launch_cfg = (int(gridDim), int(blockDim), ctx, exec_pl) + self._launch_cfg = ( + tuple(gridDim) if isinstance(gridDim, tuple) else (int(gridDim),), + tuple(blockDim) if isinstance(blockDim, tuple) else (int(blockDim),), + ctx, + exec_pl, + ) + return self def __call__(self, *args, **kwargs): From 91e9d4698fdfbab0514110083e7ac6176faa0ccc Mon Sep 17 00:00:00 2001 From: root Date: Thu, 7 Aug 2025 22:48:09 +0000 Subject: [PATCH 073/221] new test --- .../tests/stf/test_stencil_decorator.py | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 python/cuda_cccl/tests/stf/test_stencil_decorator.py diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py new file mode 100644 index 00000000000..9c0bc17182b --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -0,0 +1,75 @@ +import numpy as np +from numba import cuda +import cuda.cccl.experimental.stf as cudastf + +@cudastf.jit +def laplacian_5pt_kernel(u_in, u_out, dx, dy): + """ + Compute a 5?~@~Qpoint Laplacian on u_in and write the result to u_out. + + Grid?~@~Qstride 2?~@~QD kernel. Assumes C?~@~Qcontiguous (row?~@~Qmajor) inputs. + Boundary cells are copied unchanged. 
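    For interior points the update below evaluates the standard second-order
    stencil,
        lap_u[i, j] = (u[i-1, j] - 2*u[i, j] + u[i+1, j]) / dx**2
                    + (u[i, j-1] - 2*u[i, j] + u[i, j+1]) / dy**2,
    which is also what the host-side reference loop in test_numba2d recomputes.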
+ """ + coef_x = 1.0 / (dx * dx) + coef_y = 1.0 / (dy * dy) + + i, j = cuda.grid(2) # i ?~F~T row (x?~@~Qindex), j ?~F~T col (y?~@~Qindex) + nx, ny = u_in.shape + + if i >= nx or j >= ny: + return # out?~@~Qof?~@~Qbounds threads do nothing + + if 0 < i < nx - 1 and 0 < j < ny - 1: + u_out[i, j] = (u_in[i - 1, j] - 2.0 * u_in[i, j] + u_in[i + 1, j]) * coef_x + ( + u_in[i, j - 1] - 2.0 * u_in[i, j] + u_in[i, j + 1] + ) * coef_y + else: + # simple Dirichlet/Neumann placeholder: copy input to output + u_out[i, j] = u_in[i, j] + + +def test_numba2d(): + nx, ny = 1024, 1024 + dx = 2.0 * np.pi / (nx - 1) + dy = 2.0 * np.pi / (ny - 1) + + # a smooth test field: f(x,y) = sin(x) * cos(y) + x = np.linspace(0, 2 * np.pi, nx, dtype=np.float64) + y = np.linspace(0, 2 * np.pi, ny, dtype=np.float64) + + u = np.sin(x)[:, None] * np.cos(y)[None, :] # shape = (nx, ny) + u_out = np.zeros_like(u) + + ctx = cudastf.context() + lu = ctx.logical_data(u) + lu_out = ctx.logical_data(u_out) + + threads_per_block = (16, 16) # 256 threads per block is a solid starting point + blocks_per_grid = ( + (nx + threads_per_block[0] - 1) // threads_per_block[0], + (ny + threads_per_block[1] - 1) // threads_per_block[1], + ) + + laplacian_5pt_kernel[blocks_per_grid, threads_per_block, ctx]( + lu.read(), lu_out.write(), dx, dy + ) + + ctx.finalize() + + u_out_ref = np.zeros_like(u) + + for i in range(1, nx - 1): # skip boundaries + for j in range(1, ny - 1): + u_out_ref[i, j] = (u[i - 1, j] - 2.0 * u[i, j] + u[i + 1, j]) / dx**2 + ( + u[i, j - 1] - 2.0 * u[i, j] + u[i, j + 1] + ) / dy**2 + + # copy boundaries + u_out_ref[0, :] = u[0, :] + u_out_ref[-1, :] = u[-1, :] + u_out_ref[:, 0] = u[:, 0] + u_out_ref[:, -1] = u[:, -1] + + # compare with the GPU result + max_abs_diff = np.abs(u_out - u_out_ref).max() + print(f"max(|gpu - ref|) = {max_abs_diff:.3e}") From 8be7401a58c6dc72dd2e0b8caed55a1347082809 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sat, 9 Aug 2025 09:50:21 +0200 Subject: [PATCH 074/221] Add a new test for places (C interface) --- c/experimental/stf/test/test_places.cpp | 81 +++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 c/experimental/stf/test/test_places.cpp diff --git a/c/experimental/stf/test/test_places.cpp b/c/experimental/stf/test/test_places.cpp new file mode 100644 index 00000000000..eeba229c758 --- /dev/null +++ b/c/experimental/stf/test/test_places.cpp @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
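// A quick map of the C-API calls exercised by this test: T1/T2 rely on the
// default placement, T3 pins where the task executes via make_device_place(0)
// and stf_task_set_exec_place, and T4 leaves the execution place alone but
// attaches a per-dependency data place to lZ via make_device_data_place(0)
// and stf_task_add_dep_with_dplace.  Reading the helpers as "exec place =
// where the task runs, data place = where that dependency's instance lives"
// is an assumption based on their names, not something this patch states.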
+// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +C2H_TEST("empty stf tasks", "[task]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lX, lY, lZ; + + float *X, *Y, *Z; + X = (float*) malloc(N * sizeof(float)); + Y = (float*) malloc(N * sizeof(float)); + Z = (float*) malloc(N * sizeof(float)); + + stf_logical_data(ctx, &lX, X, N * sizeof(float)); + stf_logical_data(ctx, &lY, Y, N * sizeof(float)); + stf_logical_data(ctx, &lZ, Z, N * sizeof(float)); + + stf_logical_data_set_symbol(lX, "X"); + stf_logical_data_set_symbol(lY, "Y"); + stf_logical_data_set_symbol(lZ, "Z"); + + stf_task_handle t1; + stf_task_create(ctx, &t1); + stf_task_set_symbol(t1, "T1"); + stf_task_add_dep(t1, lX, STF_RW); + stf_task_start(t1); + stf_task_end(t1); + + stf_task_handle t2; + stf_task_create(ctx, &t2); + stf_task_set_symbol(t2, "T2"); + stf_task_add_dep(t2, lX, STF_READ); + stf_task_add_dep(t2, lY, STF_RW); + stf_task_start(t2); + stf_task_end(t2); + + stf_task_handle t3; + stf_task_create(ctx, &t3); + stf_task_set_symbol(t3, "T3"); + auto e_place_dev0 = make_device_place(0); + stf_task_set_exec_place(t3, &e_place_dev0); + stf_task_add_dep(t3, lX, STF_READ); + stf_task_add_dep(t3, lZ, STF_RW); + stf_task_start(t3); + stf_task_end(t3); + + stf_task_handle t4; + stf_task_create(ctx, &t4); + stf_task_set_symbol(t4, "T4"); + stf_task_add_dep(t4, lY, STF_READ); + auto d_place_dev0 = make_device_data_place(0); + stf_task_add_dep_with_dplace(t4, lZ, STF_RW, &d_place_dev0); + stf_task_start(t4); + stf_task_end(t4); + + stf_logical_data_destroy(lX); + stf_logical_data_destroy(lY); + stf_logical_data_destroy(lZ); + + stf_ctx_finalize(ctx); + + free(X); + free(Y); + free(Z); +} From a7da2554098c94ae6784d57c2975f81bd387d600 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sat, 9 Aug 2025 09:53:24 +0200 Subject: [PATCH 075/221] clang-format --- .../cuda/cccl/experimental/stf/__init__.py | 4 +--- .../cuda/cccl/experimental/stf/decorator.py | 17 +++++++++----- python/cuda_cccl/tests/stf/test_decorator.py | 22 ++++++++++++------- python/cuda_cccl/tests/stf/test_numba.py | 3 ++- .../tests/stf/test_stencil_decorator.py | 2 ++ 5 files changed, 31 insertions(+), 17 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py index bef32849b2a..ce203e09097 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py @@ -1,10 +1,9 @@ from ._stf_bindings_impl import ( context, + data_place, dep, exec_place, - data_place, ) - from .decorator import jit # Python-side kernel launcher __all__ = [ @@ -14,4 +13,3 @@ "data_place", "jit", ] - diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index 01929e5d4d5..42dfc5b774a 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -1,8 +1,11 @@ -from numba import cuda import numba +from numba import cuda + from cuda.cccl.experimental.stf import context, dep, exec_place + numba.config.CUDA_ENABLE_PYNVJITLINK = 1 + class stf_kernel_decorator: def __init__(self, pyfunc, jit_args, jit_kwargs): self._pyfunc = pyfunc @@ -34,13 +37,15 @@ def __getitem__(self, cfg): def __call__(self, *args, **kwargs): if self._launch_cfg is None: - raise RuntimeError("launch configuration 
missing – use kernel[grid, block, ctx](...)") + raise RuntimeError( + "launch configuration missing – use kernel[grid, block, ctx](...)" + ) gridDim, blockDim, ctx, exec_pl = self._launch_cfg dep_items = [] for i, a in enumerate(args): - print(f'got one arg {a} is dep ? {isinstance(a, dep)}') + print(f"got one arg {a} is dep ? {isinstance(a, dep)}") if isinstance(a, dep): dep_items.append((i, a)) @@ -51,12 +56,14 @@ def __call__(self, *args, **kwargs): dev_args = list(args) print(dev_args) for dep_index, (pos, _) in enumerate(dep_items): - print(f'set arg {dep_index} at position {pos}') + print(f"set arg {dep_index} at position {pos}") dev_args[pos] = t.get_arg_numba(dep_index) if self._compiled_kernel is None: print("compile kernel") - self._compiled_kernel = cuda.jit(*self._jit_args, **self._jit_kwargs)(self._pyfunc) + self._compiled_kernel = cuda.jit(*self._jit_args, **self._jit_kwargs)( + self._pyfunc + ) nb_stream = cuda.external_stream(t.stream_ptr()) self._compiled_kernel[gridDim, blockDim, nb_stream](*dev_args, **kwargs) diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 1a6db0f721d..29fc61280eb 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -1,5 +1,6 @@ import numpy as np from numba import cuda + import cuda.cccl.experimental.stf as cudastf @@ -9,21 +10,26 @@ def axpy(a, x, y): if i < x.size: y[i] = a * x[i] + y[i] + @cudastf.jit def scale(a, x): i = cuda.grid(1) if i < x.size: x[i] = a * x[i] + X, Y, Z = (np.ones(16, np.float32) for _ in range(3)) -ctx = cudastf.context() -lX = ctx.logical_data(X) -lY = ctx.logical_data(Y) -lZ = ctx.logical_data(Z) +ctx = cudastf.context() +lX = ctx.logical_data(X) +lY = ctx.logical_data(Y) +lZ = ctx.logical_data(Z) scale[32, 64, ctx](2.0, lX.rw()) -axpy[32, 64, ctx](2.0, lX.read(), lY.rw()) # default device -axpy[32, 64, ctx, cudastf.exec_place.device(0)](2.0, lX.read(), lZ.rw()) # explicit exec place -axpy[32, 64, ctx](2.0, lY.read(), lZ.rw(cudastf.data_place.device(0))) # per-dep placement override - +axpy[32, 64, ctx](2.0, lX.read(), lY.rw()) # default device +axpy[32, 64, ctx, cudastf.exec_place.device(0)]( + 2.0, lX.read(), lZ.rw() +) # explicit exec place +axpy[32, 64, ctx]( + 2.0, lY.read(), lZ.rw(cudastf.data_place.device(0)) +) # per-dep placement override diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 59737cd3060..a77e771abe6 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -2,9 +2,10 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +import numba import numpy as np from numba import cuda -import numba + numba.config.CUDA_ENABLE_PYNVJITLINK = 1 from cuda.cccl.experimental.stf._stf_bindings_impl import ( diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index 9c0bc17182b..ca3b68f2d17 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -1,7 +1,9 @@ import numpy as np from numba import cuda + import cuda.cccl.experimental.stf as cudastf + @cudastf.jit def laplacian_5pt_kernel(u_in, u_out, dx, dy): """ From 537b3b931ebefe51bb8784ef17dde99f76850431 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 25 Aug 2025 02:59:58 -0700 Subject: [PATCH 076/221] Skit test if we have less than 2 devices --- python/cuda_cccl/tests/stf/test_numba.py | 6 ++++++ 1 file 
changed, 6 insertions(+) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index a77e771abe6..b65391c63f4 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -4,6 +4,8 @@ import numba import numpy as np +import unittest +import pytest from numba import cuda numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @@ -192,6 +194,10 @@ def test_numba_exec_place(): def test_numba_places(): + if len(list(cuda.gpus)) < 2: + pytest.skip("Need at least 2 GPUs") + return + X = np.ones(16, dtype=np.float32) Y = np.ones(16, dtype=np.float32) Z = np.ones(16, dtype=np.float32) From d804d1b211f344f137d0cfc0937763bccc98705a Mon Sep 17 00:00:00 2001 From: root Date: Mon, 25 Aug 2025 06:54:50 -0700 Subject: [PATCH 077/221] Save WIP for like_empty (broken) --- .../stf/include/cccl/c/experimental/stf/stf.h | 1 + c/experimental/stf/src/stf.cu | 14 +++++++++++ .../experimental/stf/_stf_bindings_impl.pyx | 25 +++++++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index caa8769a8b6..64260f14bc6 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -158,6 +158,7 @@ typedef struct stf_logical_data_handle_t* stf_logical_data_handle; void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); void stf_logical_data_destroy(stf_logical_data_handle ld); +void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle* from, stf_logical_data_handle* to); // TODO // void stf_logical_data_wait(stf_logical_data_handle ld); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 33b4cf92489..d6294953e17 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -75,6 +75,20 @@ void stf_logical_data_destroy(stf_logical_data_handle ld) delete ld; } +void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle* from, stf_logical_data_handle* to) +{ + assert(ctx); + assert(from); + assert(to); + + auto ld_typed = ctx->ctx.logical_data(from->ld.shape()); + + // Stored in its untyped version + *to = new stf_logical_data_handle_t{ld_typed}; +} + + + void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) { assert(ctx); diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 01e3e2f0132..3c75758cfb7 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -116,6 +116,7 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) void stf_logical_data_destroy(stf_logical_data_handle ld) + void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle* src, stf_logical_data_handle* dst) ctypedef struct stf_task_handle_t ctypedef stf_task_handle_t* stf_task_handle @@ -208,6 +209,30 @@ cdef class logical_data: def rw(self, dplace=None): return dep(self, AccessMode.RW.value, dplace) + def like_empty(self): + """ + Create a new logical_data with the same shape 
(and dtype metadata) + as this object. + """ + if self._ld == NULL: + raise RuntimeError("source logical_data handle is NULL") + + cdef logical_data out = logical_data.__new__(logical_data) + + out._ctx = self._ctx + out._dtype = self._dtype + out._shape = self._shape + out._ndim = self._ndim + + cdef stf_logical_data_handle new_ld = NULL + stf_logical_data_like_empty(self._ctx._ctx, &self._ld, &new_ld) + + if new_ld == NULL: + raise RuntimeError("stf_logical_data_like_empty returned NULL") + + out._ld = new_ld + return out + class dep: __slots__ = ("ld", "mode", "dplace") def __init__(self, logical_data ld, int mode, dplace=None): From ad83a6301f2f0dc98a8b4d43f042bce0fc48d3c6 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 25 Aug 2025 06:55:13 -0700 Subject: [PATCH 078/221] test with and witjout graphs --- python/cuda_cccl/tests/stf/test_numba.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index b65391c63f4..637b0f5a1ff 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -34,12 +34,13 @@ def scale(a, x): x[i] = a * x[i] +@pytest.mark.parametrize("use_graph_val", [False, True]) def test_numba(): X = np.ones(16, dtype=np.float32) Y = np.ones(16, dtype=np.float32) Z = np.ones(16, dtype=np.float32) - ctx = context() + ctx = context(use_graph=use_graph_val) lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) From f74c1d47b81819d98f496a1b3b7b7204c669e12d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 26 Aug 2025 12:55:41 +0000 Subject: [PATCH 079/221] [pre-commit.ci] auto code formatting --- c/experimental/stf/src/stf.cu | 2 -- python/cuda_cccl/tests/stf/test_numba.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index d6294953e17..2a493a77528 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -87,8 +87,6 @@ void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_hand *to = new stf_logical_data_handle_t{ld_typed}; } - - void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) { assert(ctx); diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 637b0f5a1ff..e4f4f66a0eb 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + import numba import numpy as np -import unittest import pytest from numba import cuda From 95c88a3e93eb245dc3974c6d734b6246534f525e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 26 Aug 2025 14:47:28 +0200 Subject: [PATCH 080/221] remove unit test --- .../experimental/__stf/internal/context.cuh | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index c1075ac5b45..53ab63b2ad2 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -1451,23 +1451,6 @@ UNITTEST("make_tuple_indexwise") EXPECT(t2 == ::std::tuple(0, 2)); }; -UNITTEST("auto_dump set/get") -{ - context ctx; - - int A[1024]; - int B[1024]; - auto lA = ctx.logical_data(A); - auto lB = ctx.logical_data(B); - - // 
Disable auto dump - lA.set_auto_dump(false); - EXPECT(lA.get_auto_dump() == false); - - // Enabled by default - EXPECT(lB.get_auto_dump() == true); -}; - UNITTEST("cuda stream place") { cudaStream_t user_stream; From bc94c747768a2a3fd3b3cf09372f61a749c88e91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 26 Aug 2025 15:22:58 +0200 Subject: [PATCH 081/221] remove stf_logical_data_like_empty which is not designed properly yet --- .../stf/include/cccl/c/experimental/stf/stf.h | 3 +- c/experimental/stf/src/stf.cu | 22 ++++----- .../experimental/stf/_stf_bindings_impl.pyx | 48 +++++++++---------- 3 files changed, 37 insertions(+), 36 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 64260f14bc6..a6bb06353f0 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -158,7 +158,8 @@ typedef struct stf_logical_data_handle_t* stf_logical_data_handle; void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); void stf_logical_data_destroy(stf_logical_data_handle ld); -void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle* from, stf_logical_data_handle* to); +// void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle from, stf_logical_data_handle* +// to); // TODO // void stf_logical_data_wait(stf_logical_data_handle ld); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 2a493a77528..46215e4bff3 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -75,17 +75,17 @@ void stf_logical_data_destroy(stf_logical_data_handle ld) delete ld; } -void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle* from, stf_logical_data_handle* to) -{ - assert(ctx); - assert(from); - assert(to); - - auto ld_typed = ctx->ctx.logical_data(from->ld.shape()); - - // Stored in its untyped version - *to = new stf_logical_data_handle_t{ld_typed}; -} +// void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle from, stf_logical_data_handle* to) +// { +// assert(ctx); +// assert(from); +// assert(to); +// +// auto ld_typed = ctx->ctx.logical_data(from->ld.shape()); +// +// // Stored in its untyped version +// *to = new stf_logical_data_handle_t{ld_typed}; +// } void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) { diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 3c75758cfb7..542025b2f6d 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -116,7 +116,7 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) void stf_logical_data_destroy(stf_logical_data_handle ld) - void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle* src, stf_logical_data_handle* dst) +# void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle* src, stf_logical_data_handle* dst) ctypedef struct stf_task_handle_t ctypedef stf_task_handle_t* 
stf_task_handle @@ -209,29 +209,29 @@ cdef class logical_data: def rw(self, dplace=None): return dep(self, AccessMode.RW.value, dplace) - def like_empty(self): - """ - Create a new logical_data with the same shape (and dtype metadata) - as this object. - """ - if self._ld == NULL: - raise RuntimeError("source logical_data handle is NULL") - - cdef logical_data out = logical_data.__new__(logical_data) - - out._ctx = self._ctx - out._dtype = self._dtype - out._shape = self._shape - out._ndim = self._ndim - - cdef stf_logical_data_handle new_ld = NULL - stf_logical_data_like_empty(self._ctx._ctx, &self._ld, &new_ld) - - if new_ld == NULL: - raise RuntimeError("stf_logical_data_like_empty returned NULL") - - out._ld = new_ld - return out +# def like_empty(self): +# """ +# Create a new logical_data with the same shape (and dtype metadata) +# as this object. +# """ +# if self._ld == NULL: +# raise RuntimeError("source logical_data handle is NULL") +# +# cdef logical_data out = logical_data.__new__(logical_data) +# +# out._ctx = self._ctx +# out._dtype = self._dtype +# out._shape = self._shape +# out._ndim = self._ndim +# +# cdef stf_logical_data_handle new_ld = NULL +# stf_logical_data_like_empty(self._ctx, &self._ld, &new_ld) +# +# if new_ld == NULL: +# raise RuntimeError("stf_logical_data_like_empty returned NULL") +# +# out._ld = new_ld +# return out class dep: __slots__ = ("ld", "mode", "dplace") From 3e476481a437cba6bd5d8e05a79fd6b2506e6f91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 27 Aug 2025 14:37:39 +0200 Subject: [PATCH 082/221] Add a missing header --- c/parallel/src/nvrtc/command_list.h | 1 + 1 file changed, 1 insertion(+) diff --git a/c/parallel/src/nvrtc/command_list.h b/c/parallel/src/nvrtc/command_list.h index 3c4f89548f1..303b3f06d3d 100644 --- a/c/parallel/src/nvrtc/command_list.h +++ b/c/parallel/src/nvrtc/command_list.h @@ -16,6 +16,7 @@ #include #include #include +#include #include From f208979c38e7f742d66805242d53b63bb655cbfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 27 Aug 2025 18:11:22 +0200 Subject: [PATCH 083/221] Install in a place that depends on cuda version --- python/cuda_cccl/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 4872ce996b5..0b3f99edc54 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -66,9 +66,12 @@ install( DESTINATION cuda/cccl/parallel/experimental/${CUDA_VERSION_DIR}/cccl ) + +file(MAKE_DIRECTORY "cuda/cccl/experimental/stf/${CUDA_VERSION_DIR}/cccl") + install( TARGETS cccl.c.experimental.stf - DESTINATION cuda/cccl/experimental/stf/cccl + DESTINATION cuda/cccl/experimental/stf/${CUDA_VERSION_DIR}/cccl ) # Build and install Cython extension @@ -144,6 +147,7 @@ add_custom_command( ARGS ${CYTHON_FLAGS_LIST} "${stf_pyx_source_file}" --output-file ${_stf_generated_extension_src} DEPENDS "${stf_pyx_source_file}" DEPFILE "${_stf_depfile}" + COMMENT "Cythonizing ${pyx_source_file} for CUDA ${CUDA_VERSION_MAJOR}" ) set_source_files_properties("${_stf_generated_extension_src}" PROPERTIES GENERATED TRUE) add_custom_target(cythonize_stf_bindings_impl ALL From 2ca0e3d75c7b0ff73256cc6974a580c07101c211 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 27 Aug 2025 18:11:56 +0200 Subject: [PATCH 084/221] fix pytest example --- python/cuda_cccl/tests/stf/test_numba.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff 
--git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index e4f4f66a0eb..73159f82119 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -34,13 +34,12 @@ def scale(a, x): x[i] = a * x[i] -@pytest.mark.parametrize("use_graph_val", [False, True]) def test_numba(): X = np.ones(16, dtype=np.float32) Y = np.ones(16, dtype=np.float32) Z = np.ones(16, dtype=np.float32) - ctx = context(use_graph=use_graph_val) + ctx = context(use_graph=True) lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) From 7cff926577fcf1bf7c815afdbc67a9ab58f9991a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 27 Aug 2025 19:16:25 +0200 Subject: [PATCH 085/221] Try to use an intermediate "shim" module to import cu12 or cu13 versions --- .../cuda/cccl/experimental/stf/__init__.py | 2 +- .../cccl/experimental/stf/_stf_bindings.py | 56 +++++++++++++++++++ python/cuda_cccl/tests/stf/test_context.py | 2 +- python/cuda_cccl/tests/stf/test_numba.py | 2 +- 4 files changed, 59 insertions(+), 3 deletions(-) create mode 100644 python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py index ce203e09097..873b31b7dcb 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py @@ -1,4 +1,4 @@ -from ._stf_bindings_impl import ( +from ._stf_bindings import ( context, data_place, dep, diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings.py b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings.py new file mode 100644 index 00000000000..c61e908fc8d --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings.py @@ -0,0 +1,56 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# _bindings.py is a shim module that imports symbols from a +# _bindings_impl extension module. The shim serves two purposes: +# +# 1. Import a CUDA-specific extension. The cuda.cccl wheel ships with multiple +# extensions, one for each CUDA version. At runtime, this shim chooses the +# appropriate extension based on the detected CUDA version, and imports all +# symbols from it. +# +# 2. Preload `nvrtc` and `nvJitLink` before importing the extension. +# These shared libraries are indirect dependencies, pulled in via the direct +# dependency `cccl.c.parallel`. To ensure reliable symbol resolution at +# runtime, we explicitly load them first using `cuda.pathfinder`. +# Without this step, importing the Cython extension directly may fail or behave +# inconsistently depending on environment setup and dynamic linker behavior. +# This indirection ensures the right loading order, regardless of how +# `_bindings` is first imported across the codebase. 
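+#
+# Downstream code imports the public names from this shim rather than from a
+# version-specific extension module, e.g. (as the tests updated in this patch
+# do):
+#
+#   from cuda.cccl.experimental.stf._stf_bindings import context, read, rw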
+ +import importlib + +from cuda.cccl._cuda_version_utils import detect_cuda_version, get_recommended_extra +from cuda.pathfinder import ( # type: ignore[import-not-found] + load_nvidia_dynamic_lib, +) + + +def _load_cuda_libraries(): + # Load appropriate libraries for the detected CUDA version + for libname in ("nvrtc", "nvJitLink"): + load_nvidia_dynamic_lib(libname) + + +_load_cuda_libraries() + + +# Import the appropriate bindings implementation depending on what +# CUDA version is available: +cuda_version = detect_cuda_version() +if cuda_version not in [12, 13]: + raise RuntimeError( + f"Unsupported CUDA version: {cuda_version}. Only CUDA 12 and 13 are supported." + ) + +try: + extra_name = get_recommended_extra(cuda_version) + bindings_module = importlib.import_module( + f".{extra_name}._stf_bindings_impl", __package__ + ) + # Import all symbols from the module + globals().update(bindings_module.__dict__) +except ImportError as e: + raise ImportError( + f"Failed to import CUDA STF bindings for CUDA {cuda_version}. " + ) from e diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py index 5a29199dc67..b306cf3571f 100644 --- a/python/cuda_cccl/tests/stf/test_context.py +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -4,7 +4,7 @@ import numpy as np -from cuda.cccl.experimental.stf._stf_bindings_impl import context, read, rw +from cuda.cccl.experimental.stf._stf_bindings import context, read, rw def test_ctx(): diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 73159f82119..11eed0b3749 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -10,7 +10,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 -from cuda.cccl.experimental.stf._stf_bindings_impl import ( +from cuda.cccl.experimental.stf._stf_bindings import ( context, data_place, exec_place, From b8d89ed2638cf0294bc268b41d5148f4cf82911b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 27 Aug 2025 21:26:36 +0200 Subject: [PATCH 086/221] Fix tests (do not use graphs ...) 
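The numba-based tests now configure numba before importing the STF bindings
and, for now, stick to the stream backend rather than CUDA graphs. The common
preamble used by these tests is roughly:

    import numba
    numba.config.CUDA_ENABLE_PYNVJITLINK = 1
    numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0

    import cuda.cccl.experimental.stf as cudastf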
--- python/cuda_cccl/tests/stf/test_decorator.py | 6 +++++- python/cuda_cccl/tests/stf/test_numba.py | 3 ++- python/cuda_cccl/tests/stf/test_stencil_decorator.py | 5 ++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 29fc61280eb..701b14a3574 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -1,8 +1,12 @@ import numpy as np + +import numba from numba import cuda -import cuda.cccl.experimental.stf as cudastf +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +import cuda.cccl.experimental.stf as cudastf @cudastf.jit def axpy(a, x, y): diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 11eed0b3749..aa7afac7552 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -9,6 +9,7 @@ from numba import cuda numba.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 from cuda.cccl.experimental.stf._stf_bindings import ( context, @@ -39,7 +40,7 @@ def test_numba(): Y = np.ones(16, dtype=np.float32) Z = np.ones(16, dtype=np.float32) - ctx = context(use_graph=True) + ctx = context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index ca3b68f2d17..c998ca4d00f 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -1,8 +1,11 @@ import numpy as np +import numba from numba import cuda -import cuda.cccl.experimental.stf as cudastf +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +import cuda.cccl.experimental.stf as cudastf @cudastf.jit def laplacian_5pt_kernel(u_in, u_out, dx, dy): From 740dc86c051ceca63e7e5976ccb9025d0aeb5046 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 09:55:53 +0200 Subject: [PATCH 087/221] Introduce an API to enable graph capture with a low level graph_ctx task --- .../experimental/__stf/graph/graph_task.cuh | 25 ++++++++++++++++++- .../experimental/__stf/internal/context.cuh | 7 ++++++ .../cuda/experimental/__stf/internal/task.cuh | 15 +++++++++++ 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh index 2cb5398c259..ac5f3e13bee 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh @@ -100,6 +100,14 @@ public: ready_dependencies.push_back(ge->node); } } + fprintf(stderr, "graph_task::start() end\n"); + + if (is_capture_enabled()) + { + // Select a stream from the pool + capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; + cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal)); + } return *this; } @@ -109,6 +117,13 @@ public: { ::std::lock_guard<::std::mutex> lock(graph_mutex); + if (is_capture_enabled()) + { + cudaGraph_t childGraph = nullptr; + cuda_safe_call(cudaStreamEndCapture(capture_stream, &childGraph)); + set_child_graph(childGraph); + } + cudaGraphNode_t n; auto done_prereqs = event_list(); @@ -273,6 +288,12 @@ public: return dot.is_timing() || (calibrate && 
statistics.is_calibrating()); } + // Only valid if we have defined a capture stream + cudaStream_t get_stream() const + { + return capture_stream; + } + /** * @brief Invokes a lambda that takes either a `cudaStream_t` or a `cudaGraph_t`. Dependencies must be * set with `add_deps` manually before this call. @@ -337,7 +358,7 @@ public: // // Get a stream from the pool associated to the execution place - cudaStream_t capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; + capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; cudaGraph_t childGraph = nullptr; cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal)); @@ -455,6 +476,8 @@ private: cudaGraph_t child_graph = nullptr; bool must_destroy_child_graph = false; + cudaStream_t capture_stream; + /* If the task corresponds to independent graph nodes, we do not use a * child graph, but add nodes directly */ ::std::vector task_nodes; diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index 53ab63b2ad2..56263ca4ead 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -243,6 +243,13 @@ public: return *this; } + void enable_capture() + { + payload->*[&](auto& self) { + self.enable_capture(); + }; + } + /** * @brief Add dependencies to this task. * diff --git a/cudax/include/cuda/experimental/__stf/internal/task.cuh b/cudax/include/cuda/experimental/__stf/internal/task.cuh index 4136d934a16..2b9945c4d81 100644 --- a/cudax/include/cuda/experimental/__stf/internal/task.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/task.cuh @@ -147,6 +147,10 @@ private: // affine data place of the execution place, but this can be a // composite data place when using a grid of places for example. data_place affine_data_place; + + // Automatically capture work when this is a graph task (ignored with a + // CUDA stream backend). 
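+  //
+  // Intended usage, following the unit test added later in this series (the
+  // kernel launch is only illustrative):
+  //
+  //   auto t = ctx.task(lX.rw());
+  //   t.enable_capture();  // no effect with the CUDA stream backend
+  //   t.start();           // graph backend: begins stream capture
+  //   kernel<<<grid, block, 0, t.get_stream()>>>(...);
+  //   t.end();             // graph backend: ends capture, inserts child graph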
+ bool enable_capture = false; }; protected: @@ -363,6 +367,17 @@ public: return ::std::hash()(pimpl.get()); } + void enable_capture() + { + fprintf(stderr, "task enable capture (generic task)\n"); + pimpl->enable_capture = true; + } + + bool is_capture_enabled() const + { + return pimpl->enable_capture; + } + /** * @brief Start a task * From 9687cbb25e8cb3044c3f3a96bb304875c966caba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 09:56:54 +0200 Subject: [PATCH 088/221] Enable graph capture when launching a numba kernel in the graph_ctx backend --- c/experimental/stf/include/cccl/c/experimental/stf/stf.h | 1 + c/experimental/stf/src/stf.cu | 6 ++++++ .../cuda/cccl/experimental/stf/_stf_bindings_impl.pyx | 4 ++++ 3 files changed, 11 insertions(+) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index a6bb06353f0..7542d156d08 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -179,6 +179,7 @@ void stf_task_end(stf_task_handle t); CUstream stf_task_get_custream(stf_task_handle t); void* stf_task_get(stf_task_handle t, int submitted_index); void stf_task_destroy(stf_task_handle t); +void stf_task_enable_capture(stf_task_handle t); typedef struct stf_cuda_kernel_handle_t* stf_cuda_kernel_handle; diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 46215e4bff3..1a27f3be858 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -190,6 +190,12 @@ void stf_task_end(stf_task_handle t) t->t.end(); } +void stf_task_enable_capture(stf_task_handle t) +{ + assert(t); + t->t.enable_capture(); +} + CUstream stf_task_get_custream(stf_task_handle t) { assert(t); diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 542025b2f6d..5f1deb5754d 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -127,6 +127,7 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_task_add_dep_with_dplace(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, stf_data_place* data_p) void stf_task_start(stf_task_handle t) void stf_task_end(stf_task_handle t) + void stf_task_enable_capture(stf_task_handle t) CUstream stf_task_get_custream(stf_task_handle t) # cudaStream_t stf_task_get_stream(stf_task_handle t) void* stf_task_get(stf_task_handle t, int submitted_index) @@ -350,6 +351,9 @@ cdef class task: # self._lds_args.clear() def start(self): + # This is ignored if this is not a graph task + stf_task_enable_capture(self._t) + stf_task_start(self._t) def end(self): From 5246b658321414505ee253bf3ee69ceefe2f184c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 09:58:59 +0200 Subject: [PATCH 089/221] Use a forked version of numba-cuda with work-arounds for CUDA graphs --- python/cuda_cccl/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index 739ae920ea7..08e5a673540 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "numpy", "cuda-pathfinder>=1.1.0", "cuda-core", - "numba-cuda>=0.18.0", + "numba-cuda @ 
git+https://github.com/caugonnet/numba-cuda.git@cuda_graph_future_memory", ] dynamic = ["version"] From 936bc60f69b4af9914a0a9229d197d8485ef71d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 09:59:54 +0200 Subject: [PATCH 090/221] fix formatting issues --- python/cuda_cccl/tests/stf/test_decorator.py | 4 ++-- python/cuda_cccl/tests/stf/test_stencil_decorator.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 701b14a3574..25089684c75 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -1,6 +1,5 @@ -import numpy as np - import numba +import numpy as np from numba import cuda numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @@ -8,6 +7,7 @@ import cuda.cccl.experimental.stf as cudastf + @cudastf.jit def axpy(a, x, y): i = cuda.grid(1) diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index c998ca4d00f..fd845d67c64 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -1,5 +1,5 @@ -import numpy as np import numba +import numpy as np from numba import cuda numba.config.CUDA_ENABLE_PYNVJITLINK = 1 From 7689834d4d0f5e8a2d12607ee664f06275311226 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 11:20:00 +0200 Subject: [PATCH 091/221] Do return a stream even in the graph_ctx when we are capturing --- .../include/cuda/experimental/__stf/internal/context.cuh | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index 56263ca4ead..93d59f27642 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -292,12 +292,9 @@ public: cudaStream_t get_stream() const { - if (auto p = ::std::get_if>(&payload)) - { - return p->get_stream(); - } - - return nullptr; + return payload->*[&](auto& self) { + return self.get_stream(); + }; } private: From dde406dbda24b503e73dcff712b5dd800ee07fd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 11:21:29 +0200 Subject: [PATCH 092/221] test with graphs --- python/cuda_cccl/tests/stf/test_numba.py | 31 ++++++++++++++++++------ 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index aa7afac7552..9e0b7414438 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -35,20 +35,34 @@ def scale(a, x): x[i] = a * x[i] -def test_numba(): +# One test with a single kernel in a CUDA graph +def test_numba_graph(): X = np.ones(16, dtype=np.float32) - Y = np.ones(16, dtype=np.float32) - Z = np.ones(16, dtype=np.float32) + ctx = context(use_graph=True) + lX = ctx.logical_data(X) + with ctx.task(rw(lX)) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = t.get_arg_numba(0) + scale[32, 64, nb_stream](2.0, dX) + pass + ctx.finalize() - ctx = context() + +def test_numba(): + n=1024*1024 + X = np.ones(n, dtype=np.float32) + Y = np.ones(n, dtype=np.float32) + Z = np.ones(n, dtype=np.float32) + + ctx = context(use_graph=True) lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = 
ctx.logical_data(Z) with ctx.task(rw(lX)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - # dX = t.get_arg_numba(0) - dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) + dX = t.get_arg_numba(0) + # dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) scale[32, 64, nb_stream](2.0, dX) pass @@ -74,6 +88,8 @@ def test_numba(): axpy[32, 64, nb_stream](2.0, dY, dZ) pass + ctx.finalize() + @cuda.jit def laplacian_5pt_kernel(u_in, u_out, dx, dy): @@ -239,4 +255,5 @@ def test_numba_places(): if __name__ == "__main__": print("Running CUDASTF examples...") - test_numba_exec_place() + # test_numba_graph() + test_numba() From 75630141ffc8562f47a042b179b29bba798f28b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 11:53:06 +0200 Subject: [PATCH 093/221] parametrized tests --- python/cuda_cccl/tests/stf/test_decorator.py | 34 +++++++++++--------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 25089684c75..f36017d9883 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -1,5 +1,7 @@ import numba import numpy as np +import pytest + from numba import cuda numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @@ -22,18 +24,20 @@ def scale(a, x): x[i] = a * x[i] -X, Y, Z = (np.ones(16, np.float32) for _ in range(3)) - -ctx = cudastf.context() -lX = ctx.logical_data(X) -lY = ctx.logical_data(Y) -lZ = ctx.logical_data(Z) - -scale[32, 64, ctx](2.0, lX.rw()) -axpy[32, 64, ctx](2.0, lX.read(), lY.rw()) # default device -axpy[32, 64, ctx, cudastf.exec_place.device(0)]( - 2.0, lX.read(), lZ.rw() -) # explicit exec place -axpy[32, 64, ctx]( - 2.0, lY.read(), lZ.rw(cudastf.data_place.device(0)) -) # per-dep placement override +@pytest.mark.parametrize("use_graph", [True, False]) +def test_decorator(use_graph): + X, Y, Z = (np.ones(16, np.float32) for _ in range(3)) + + ctx = cudastf.context(use_graph=use_graph) + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) + + scale[32, 64, ctx](2.0, lX.rw()) + axpy[32, 64, ctx](2.0, lX.read(), lY.rw()) # default device + axpy[32, 64, ctx, cudastf.exec_place.device(0)]( + 2.0, lX.read(), lZ.rw() + ) # explicit exec place + axpy[32, 64, ctx]( + 2.0, lY.read(), lZ.rw(cudastf.data_place.device(0)) + ) # per-dep placement override From b094c27275ade18068dc526ef88ae80b9c577c8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 12:04:06 +0200 Subject: [PATCH 094/221] test that we get a stream in graph_task when capturing --- .../cuda/experimental/__stf/internal/context.cuh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index 93d59f27642..8b50f924f65 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -1574,8 +1574,18 @@ UNITTEST("get_stream graph") auto t = ctx.task(token.write()); t.start(); cudaStream_t s = t.get_stream(); + // We are not capturing so there is no stream associated EXPECT(s == nullptr); t.end(); + + auto t2 = ctx.task(token.write()); + t2.enable_capture(); + t2.start(); + cudaStream_t s = t2.get_stream(); + // We are capturing so the stream used for capture is associated to the task + EXPECT(s != nullptr); + t2.end(); + 
ctx.finalize(); }; From 222c21608d6f15b1f13736ddbbe51b0004cdd333 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 14:14:29 +0200 Subject: [PATCH 095/221] Save WIP: add a mockup of FHE example, which needs a like_empty method --- python/cuda_cccl/tests/stf/test_fhe.py | 152 +++++++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 python/cuda_cccl/tests/stf/test_fhe.py diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py new file mode 100644 index 00000000000..0fead4fb000 --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -0,0 +1,152 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# A toy example to illustrate how we can compose logical operations + +import numba +import numpy as np +import pytest +from numba import cuda + +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + +from cuda.cccl.experimental.stf._stf_bindings import ( + context, + read, + rw, + write, +) + +class Plaintext: + def __init__(self, ctx, values=None, ld=None): + self.ctx = ctx + if not ld is None: + self.l = ld + if not values is None: + self.values = bytearray(values) + self.l = ctx.logical_data(self.values) + self.symbol = None + + def set_symbol(self, symbol: str): + self.l.set_symbol(symbol) + self.symbol = symbol + + def convert_to_vector(self) -> bytearray: + result = bytearray(self.l.buffer) + return result + + def encrypt(self) -> "Ciphertext": + # stub: should return a Ciphertext object wrapping a LogicalData + encrypted = bytearray([c ^ 0x42 for c in self.values]) # toy XOR + return Ciphertext(self.ctx, encrypted) + +@cuda.jit +def and_kernel(a, b, out): + i = cuda.grid(1) + if i < out.size: + out[i] = a[i] & b[i] + +@cuda.jit +def or_kernel(a, b, out): + i = cuda.grid(1) + if i < out.size: + out[i] = a[i] | b[i] + +@cuda.jit +def not_kernel(a, out): + i = cuda.grid(1) + if i < out.size: + out[i] = ~a[i] + +class Ciphertext: + def __init__(self, ctx, values=None, ld=None): + self.ctx = ctx + if not ld is None: + self.l = ld + if values is not None: + self.values = bytearray(values) + self.l = ctx.logical_data(self.values) + self.symbol = None + + # ~ operator + def __invert__(self): + result=Ciphertext(ctx, ld=self.l) + # result=Ciphertext(ctx, ld=self.l.like_empty()) + + with ctx.task(self.l.read(), result.l.write()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + da = t.get_arg_numba(0) + dresult = t.get_arg_numba(1) + not_kernel[32, 16, nb_stream](da, dresult) + + return result + + # | operator + def __or__(self, other): + if not isinstance(other, Ciphertext): + return NotImplemented + + result=Ciphertext(ctx, ld=self.l) + # result=Ciphertext(ctx, ld=self.l.like_empty()) + + with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + da = t.get_arg_numba(0) + db = t.get_arg_numba(1) + dresult = t.get_arg_numba(2) + or_kernel[32, 16, nb_stream](da, db, dresult) + + return result + + + # & operator + def __and__(self, other): + if not isinstance(other, Ciphertext): + return NotImplemented + + result=Ciphertext(ctx, ld=self.l) + # result=Ciphertext(ctx, ld=self.l.like_empty()) + + with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + da = t.get_arg_numba(0) + db = t.get_arg_numba(1) + dresult = t.get_arg_numba(2) + 
and_kernel[32, 16, nb_stream](da, db, dresult) + + return result + + def set_symbol(self, symbol: str): + self.l.set_symbol(symbol) + self.symbol = symbol + + def decrypt(self): + # reverse the toy XOR "encryption" + decrypted = bytearray([c ^ 0x42 for c in self.values]) + return Plaintext(self.ctx, decrypted) + +def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: + return (~((eA | ~eB) & (~eA | eB))) + +ctx = context(use_graph=False) + +vA = [3, 3, 2, 2, 17] +pA = Plaintext(ctx, vA) +pA.set_symbol("A") + +vB = [1, 7, 7, 7, 49] +pB = Plaintext(ctx, vB) +pB.set_symbol("B") + +eA = pA.encrypt() +eB = pB.encrypt() +out = circuit(eA, eB) + +ctx.finalize() + +# v_out = out.decrypt().values +# print("Output vector:", list(v_out)) + + From b04cebf6cd4c8b7484e0e71e55e1bf3222adc141 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 15:49:54 +0200 Subject: [PATCH 096/221] Implement like_empty --- .../stf/include/cccl/c/experimental/stf/stf.h | 3 +- c/experimental/stf/src/stf.cu | 9 +++ .../experimental/stf/_stf_bindings_impl.pyx | 59 +++++++++++-------- python/cuda_cccl/tests/stf/test_fhe.py | 11 ++-- 4 files changed, 48 insertions(+), 34 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 7542d156d08..6b05a18b158 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -158,8 +158,7 @@ typedef struct stf_logical_data_handle_t* stf_logical_data_handle; void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); void stf_logical_data_destroy(stf_logical_data_handle ld); -// void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle from, stf_logical_data_handle* -// to); +void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle *to); // TODO // void stf_logical_data_wait(stf_logical_data_handle ld); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 1a27f3be858..e5a7e7368ec 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -75,6 +75,15 @@ void stf_logical_data_destroy(stf_logical_data_handle ld) delete ld; } +void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle *to) +{ + assert(ctx); + assert(to); + + auto ld_typed = ctx->ctx.logical_data(shape_of>(length)); + *to = new stf_logical_data_handle_t{ld_typed}; +} + // void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle from, stf_logical_data_handle* to) // { // assert(ctx); diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 5f1deb5754d..e8088183ae0 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -116,7 +116,7 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) void stf_logical_data_destroy(stf_logical_data_handle ld) -# void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle* src, stf_logical_data_handle* dst) + void 
stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle *to) ctypedef struct stf_task_handle_t ctypedef stf_task_handle_t* stf_task_handle @@ -162,20 +162,35 @@ class stf_arg_cai: cdef class logical_data: cdef stf_logical_data_handle _ld + cdef stf_ctx_handle _ctx cdef object _dtype cdef tuple _shape cdef int _ndim + cdef size_t _len + + def __cinit__(self, context ctx=None, object buf=None): + if ctx is None or buf is None: + # allow creation via __new__ (eg. in like_empty) + self._ld = NULL + self._ctx = NULL + self._len = 0 + self._dtype = None + self._shape = () + self._ndim = 0 + return - def __cinit__(self, context ctx, object buf): cdef Py_buffer view cdef int flags = PyBUF_FORMAT | PyBUF_ND # request dtype + shape + self._ctx = ctx._ctx + if PyObject_GetBuffer(buf, &view, flags) != 0: raise ValueError("object doesn’t support the full buffer protocol") try: self._ndim = view.ndim + self._len = view.len self._shape = tuple(view.shape[i] for i in range(view.ndim)) self._dtype = np.dtype(view.format) stf_logical_data(ctx._ctx, &self._ld, view.buf, view.len) @@ -210,29 +225,23 @@ cdef class logical_data: def rw(self, dplace=None): return dep(self, AccessMode.RW.value, dplace) -# def like_empty(self): -# """ -# Create a new logical_data with the same shape (and dtype metadata) -# as this object. -# """ -# if self._ld == NULL: -# raise RuntimeError("source logical_data handle is NULL") -# -# cdef logical_data out = logical_data.__new__(logical_data) -# -# out._ctx = self._ctx -# out._dtype = self._dtype -# out._shape = self._shape -# out._ndim = self._ndim -# -# cdef stf_logical_data_handle new_ld = NULL -# stf_logical_data_like_empty(self._ctx, &self._ld, &new_ld) -# -# if new_ld == NULL: -# raise RuntimeError("stf_logical_data_like_empty returned NULL") -# -# out._ld = new_ld -# return out + def like_empty(self): + """ + Create a new logical_data with the same shape (and dtype metadata) + as this object. 
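+
+        The new handle only reuses the metadata (length, shape, dtype); its
+        contents are uninitialized, so it is normally first accessed in write
+        mode. For example:
+
+            lB = lA.like_empty()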
+ """ + if self._ld == NULL: + raise RuntimeError("source logical_data handle is NULL") + + cdef logical_data out = logical_data.__new__(logical_data) + stf_logical_data_empty(self._ctx, self._len, &out._ld) + out._ctx = self._ctx + out._dtype = self._dtype + out._shape = self._shape + out._ndim = self._ndim + out._len = self._len + + return out class dep: __slots__ = ("ld", "mode", "dplace") diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index 0fead4fb000..0b17d1d791f 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -40,7 +40,7 @@ def convert_to_vector(self) -> bytearray: def encrypt(self) -> "Ciphertext": # stub: should return a Ciphertext object wrapping a LogicalData encrypted = bytearray([c ^ 0x42 for c in self.values]) # toy XOR - return Ciphertext(self.ctx, encrypted) + return Ciphertext(self.ctx, values=encrypted) @cuda.jit def and_kernel(a, b, out): @@ -72,8 +72,7 @@ def __init__(self, ctx, values=None, ld=None): # ~ operator def __invert__(self): - result=Ciphertext(ctx, ld=self.l) - # result=Ciphertext(ctx, ld=self.l.like_empty()) + result=Ciphertext(ctx, values=None, ld=self.l.like_empty()) with ctx.task(self.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -88,8 +87,7 @@ def __or__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result=Ciphertext(ctx, ld=self.l) - # result=Ciphertext(ctx, ld=self.l.like_empty()) + result=Ciphertext(ctx, ld=self.l.like_empty()) with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -106,8 +104,7 @@ def __and__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result=Ciphertext(ctx, ld=self.l) - # result=Ciphertext(ctx, ld=self.l.like_empty()) + result=Ciphertext(ctx, ld=self.l.like_empty()) with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) From 9ed5ace8e61c1e2d34fe05c1f97b09a7b442a5df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 16:16:14 +0200 Subject: [PATCH 097/221] More comprehensive FHE test --- .../stf/include/cccl/c/experimental/stf/stf.h | 2 +- c/experimental/stf/src/stf.cu | 10 +-- .../experimental/__stf/internal/context.cuh | 4 +- python/cuda_cccl/tests/stf/test_decorator.py | 5 +- python/cuda_cccl/tests/stf/test_fhe.py | 85 +++++++++++-------- python/cuda_cccl/tests/stf/test_numba.py | 4 +- .../tests/stf/test_stencil_decorator.py | 1 + 7 files changed, 63 insertions(+), 48 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 6b05a18b158..6f2f903e6c8 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -158,7 +158,7 @@ typedef struct stf_logical_data_handle_t* stf_logical_data_handle; void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); void stf_logical_data_destroy(stf_logical_data_handle ld); -void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle *to); +void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle* to); // TODO // void stf_logical_data_wait(stf_logical_data_handle ld); diff --git 
a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index e5a7e7368ec..b82afed3526 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -75,13 +75,13 @@ void stf_logical_data_destroy(stf_logical_data_handle ld) delete ld; } -void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle *to) +void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle* to) { - assert(ctx); - assert(to); + assert(ctx); + assert(to); - auto ld_typed = ctx->ctx.logical_data(shape_of>(length)); - *to = new stf_logical_data_handle_t{ld_typed}; + auto ld_typed = ctx->ctx.logical_data(shape_of>(length)); + *to = new stf_logical_data_handle_t{ld_typed}; } // void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle from, stf_logical_data_handle* to) diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index 8b50f924f65..05950afcd5e 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -293,7 +293,7 @@ public: cudaStream_t get_stream() const { return payload->*[&](auto& self) { - return self.get_stream(); + return self.get_stream(); }; } @@ -1578,7 +1578,7 @@ UNITTEST("get_stream graph") EXPECT(s == nullptr); t.end(); - auto t2 = ctx.task(token.write()); + auto t2 = ctx.task(token.write()); t2.enable_capture(); t2.start(); cudaStream_t s = t2.get_stream(); diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index f36017d9883..79a198d1c4c 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -1,7 +1,6 @@ import numba import numpy as np import pytest - from numba import cuda numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @@ -27,12 +26,12 @@ def scale(a, x): @pytest.mark.parametrize("use_graph", [True, False]) def test_decorator(use_graph): X, Y, Z = (np.ones(16, np.float32) for _ in range(3)) - + ctx = cudastf.context(use_graph=use_graph) lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) - + scale[32, 64, ctx](2.0, lX.rw()) axpy[32, 64, ctx](2.0, lX.read(), lY.rw()) # default device axpy[32, 64, ctx, cudastf.exec_place.device(0)]( diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index 0b17d1d791f..e34bfd42834 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -5,8 +5,6 @@ # A toy example to illustrate how we can compose logical operations import numba -import numpy as np -import pytest from numba import cuda numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @@ -14,65 +12,78 @@ from cuda.cccl.experimental.stf._stf_bindings import ( context, - read, - rw, - write, + data_place, + exec_place, ) + class Plaintext: + # Initialize from actual values, or from a logical data def __init__(self, ctx, values=None, ld=None): self.ctx = ctx - if not ld is None: - self.l = ld - if not values is None: - self.values = bytearray(values) - self.l = ctx.logical_data(self.values) + if ld is not None: + self.l = ld + if values is not None: + self.values = bytearray(values) + self.l = ctx.logical_data(self.values) self.symbol = None def set_symbol(self, symbol: str): self.l.set_symbol(symbol) self.symbol = symbol - def convert_to_vector(self) -> bytearray: - result = bytearray(self.l.buffer) - return result - def encrypt(self) -> "Ciphertext": - # 
stub: should return a Ciphertext object wrapping a LogicalData encrypted = bytearray([c ^ 0x42 for c in self.values]) # toy XOR return Ciphertext(self.ctx, values=encrypted) + def print_values(self): + with ctx.task(exec_place.host(), self.l.read(data_place.managed())) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + hvalues = t.get_arg_numba(0) + print([v for v in hvalues]) + + @cuda.jit def and_kernel(a, b, out): i = cuda.grid(1) if i < out.size: out[i] = a[i] & b[i] + @cuda.jit def or_kernel(a, b, out): i = cuda.grid(1) if i < out.size: out[i] = a[i] | b[i] + @cuda.jit def not_kernel(a, out): i = cuda.grid(1) if i < out.size: out[i] = ~a[i] + +@cuda.jit +def xor_kernel(a, out, v): + i = cuda.grid(1) + if i < out.size: + out[i] = a[i] ^ v + + class Ciphertext: def __init__(self, ctx, values=None, ld=None): self.ctx = ctx - if not ld is None: - self.l = ld + if ld is not None: + self.l = ld if values is not None: - self.values = bytearray(values) - self.l = ctx.logical_data(self.values) + self.values = bytearray(values) + self.l = ctx.logical_data(self.values) self.symbol = None - # ~ operator + # ~ operator def __invert__(self): - result=Ciphertext(ctx, values=None, ld=self.l.like_empty()) + result = Ciphertext(ctx, values=None, ld=self.l.like_empty()) with ctx.task(self.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -82,12 +93,12 @@ def __invert__(self): return result - # | operator + # | operator def __or__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result=Ciphertext(ctx, ld=self.l.like_empty()) + result = Ciphertext(ctx, ld=self.l.like_empty()) with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -98,16 +109,16 @@ def __or__(self, other): return result - - # & operator + # & operator def __and__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result=Ciphertext(ctx, ld=self.l.like_empty()) + result = Ciphertext(ctx, ld=self.l.like_empty()) with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) + nb_stream.synchronize() da = t.get_arg_numba(0) db = t.get_arg_numba(1) dresult = t.get_arg_numba(2) @@ -120,12 +131,20 @@ def set_symbol(self, symbol: str): self.symbol = symbol def decrypt(self): - # reverse the toy XOR "encryption" - decrypted = bytearray([c ^ 0x42 for c in self.values]) - return Plaintext(self.ctx, decrypted) + result = Ciphertext(ctx, ld=self.l.like_empty()) + with ctx.task(self.l.read(), result.l.write()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + da = t.get_arg_numba(0) + dresult = t.get_arg_numba(1) + # reverse the toy XOR "encryption" + xor_kernel[32, 16, nb_stream](da, dresult, 0x42) + + return Plaintext(self.ctx, ld=result.l) + def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: - return (~((eA | ~eB) & (~eA | eB))) + return ~((eA | ~eB) & (~eA | eB)) + ctx = context(use_graph=False) @@ -141,9 +160,5 @@ def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: eB = pB.encrypt() out = circuit(eA, eB) +out.decrypt().print_values() ctx.finalize() - -# v_out = out.decrypt().values -# print("Output vector:", list(v_out)) - - diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 9e0b7414438..a52276295ac 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -49,7 +49,7 @@ def test_numba_graph(): def test_numba(): - 
n=1024*1024 + n = 1024 * 1024 X = np.ones(n, dtype=np.float32) Y = np.ones(n, dtype=np.float32) Z = np.ones(n, dtype=np.float32) @@ -255,5 +255,5 @@ def test_numba_places(): if __name__ == "__main__": print("Running CUDASTF examples...") - # test_numba_graph() + # test_numba_graph() test_numba() diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index fd845d67c64..c20414190db 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -7,6 +7,7 @@ import cuda.cccl.experimental.stf as cudastf + @cudastf.jit def laplacian_5pt_kernel(u_in, u_out, dx, dy): """ From e27ef5b75fca980d4ff8afc0dc3e90157a5346e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 16:31:36 +0200 Subject: [PATCH 098/221] test fhe with stf decorator --- .../cuda_cccl/tests/stf/test_fhe_decorator.py | 144 ++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 python/cuda_cccl/tests/stf/test_fhe_decorator.py diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py new file mode 100644 index 00000000000..ae8c6734f5e --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -0,0 +1,144 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# A toy example to illustrate how we can compose logical operations + +import numba +from numba import cuda + +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + +import cuda.cccl.experimental.stf as cudastf + +from cuda.cccl.experimental.stf._stf_bindings import ( + context, + data_place, + exec_place, +) + + +class Plaintext: + # Initialize from actual values, or from a logical data + def __init__(self, ctx, values=None, ld=None): + self.ctx = ctx + if ld is not None: + self.l = ld + if values is not None: + self.values = bytearray(values) + self.l = ctx.logical_data(self.values) + self.symbol = None + + def set_symbol(self, symbol: str): + self.l.set_symbol(symbol) + self.symbol = symbol + + def encrypt(self) -> "Ciphertext": + encrypted = bytearray([c ^ 0x42 for c in self.values]) # toy XOR + return Ciphertext(self.ctx, values=encrypted) + + def print_values(self): + with ctx.task(exec_place.host(), self.l.read(data_place.managed())) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + hvalues = t.get_arg_numba(0) + print([v for v in hvalues]) + + +@cudastf.jit +def and_kernel(a, b, out): + i = cuda.grid(1) + if i < out.size: + out[i] = a[i] & b[i] + + +@cudastf.jit +def or_kernel(a, b, out): + i = cuda.grid(1) + if i < out.size: + out[i] = a[i] | b[i] + + +@cudastf.jit +def not_kernel(a, out): + i = cuda.grid(1) + if i < out.size: + out[i] = ~a[i] + + +@cudastf.jit +def xor_kernel(a, out, v): + i = cuda.grid(1) + if i < out.size: + out[i] = a[i] ^ v + + +class Ciphertext: + def __init__(self, ctx, values=None, ld=None): + self.ctx = ctx + if ld is not None: + self.l = ld + if values is not None: + self.values = bytearray(values) + self.l = ctx.logical_data(self.values) + self.symbol = None + + # ~ operator + def __invert__(self): + result = Ciphertext(ctx, values=None, ld=self.l.like_empty()) + + not_kernel[32, 16, ctx](self.l.read(), result.l.write()) + + return result + + # | operator + def __or__(self, other): + if not isinstance(other, Ciphertext): + return NotImplemented + + result = 
Ciphertext(ctx, ld=self.l.like_empty()) + or_kernel[32, 16, ctx](self.l.read(), other.l.read(), result.l.write()) + + return result + + # & operator + def __and__(self, other): + if not isinstance(other, Ciphertext): + return NotImplemented + + result = Ciphertext(ctx, ld=self.l.like_empty()) + and_kernel[32, 16, ctx](self.l.read(), other.l.read(), result.l.write()) + + return result + + def set_symbol(self, symbol: str): + self.l.set_symbol(symbol) + self.symbol = symbol + + def decrypt(self): + result = Ciphertext(ctx, ld=self.l.like_empty()) + xor_kernel[32, 16, ctx](self.l.read(), result.l.write(), 0x42) + + return Plaintext(self.ctx, ld=result.l) + + +def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: + return ~((eA | ~eB) & (~eA | eB)) + + +ctx = context(use_graph=False) + +vA = [3, 3, 2, 2, 17] +pA = Plaintext(ctx, vA) +pA.set_symbol("A") + +vB = [1, 7, 7, 7, 49] +pB = Plaintext(ctx, vB) +pB.set_symbol("B") + +eA = pA.encrypt() +eB = pB.encrypt() +out = circuit(eA, eB) + +out.decrypt().print_values() +ctx.finalize() From 6963ec0d97ba605896002a2ea50a21dc1def82bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 28 Aug 2025 21:49:04 +0200 Subject: [PATCH 099/221] fix merge error --- .../include/cuda/experimental/__stf/graph/graph_task.cuh | 8 -------- .../cuda/cccl/experimental/stf/_stf_bindings_impl.pyx | 8 ++++++++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh index 10ce8628d88..4896faa28c2 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh @@ -100,14 +100,6 @@ public: ready_dependencies.push_back(ge->node); } } - fprintf(stderr, "graph_task::start() end\n"); - - if (is_capture_enabled()) - { - // Select a stream from the pool - capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; - cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal)); - } if (is_capture_enabled()) { diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index e8088183ae0..c13ad04c233 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -403,6 +403,14 @@ cdef class task: cdef CUstream s = stf_task_get_custream(self._t) return s # cast pointer -> Py int + def stream_cdata(self): + """ + Return the raw CUstream as a ctypes void pointer. + This can be passed directly to torch.cuda.Stream(cdata=...). 
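+
+        For example, given a task `t` (and assuming a CUDA-enabled torch):
+
+            torch_stream = torch.cuda.Stream(cdata=t.stream_cdata())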
+ """ + cdef CUstream s = stf_task_get_custream(self._t) + return ctypes.c_void_p( s) + def get_arg(self, index) -> int: cdef void *ptr = stf_task_get(self._t, index) return ptr From 06fab11a074df6f98563fa8baee0e816d0fb8234 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 09:58:53 +0200 Subject: [PATCH 100/221] Appropriate checks --- c/experimental/stf/src/stf.cu | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index b82afed3526..4a9cb1815c8 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -24,18 +24,14 @@ struct stf_task_handle_t void stf_ctx_create(stf_ctx_handle* ctx) { - if (ctx) - { - *ctx = new stf_ctx_handle_t{context{}}; - } + assert(ctx) + *ctx = new stf_ctx_handle_t{context{}}; } void stf_ctx_create_graph(stf_ctx_handle* ctx) { - if (ctx) - { - *ctx = new stf_ctx_handle_t{context{graph_ctx()}}; - } + assert(ctx) + *ctx = new stf_ctx_handle_t{context{graph_ctx()}}; } void stf_ctx_finalize(stf_ctx_handle ctx) From 2fc802e35b415eed01b77759e447ebfd4a00c080 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 10:42:30 +0200 Subject: [PATCH 101/221] Add missing ; --- c/experimental/stf/src/stf.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 4a9cb1815c8..d2abedc66d6 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -24,13 +24,13 @@ struct stf_task_handle_t void stf_ctx_create(stf_ctx_handle* ctx) { - assert(ctx) + assert(ctx); *ctx = new stf_ctx_handle_t{context{}}; } void stf_ctx_create_graph(stf_ctx_handle* ctx) { - assert(ctx) + assert(ctx); *ctx = new stf_ctx_handle_t{context{graph_ctx()}}; } From a43db62a8901558095f61dfe240c416a3d14573c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 14:14:32 +0200 Subject: [PATCH 102/221] - Make it possible to create a borrowed context from a handle - Infer the context in the decorator from dependencies if possible --- .../experimental/stf/_stf_bindings_impl.pyx | 45 +++++++++++++++---- .../cuda/cccl/experimental/stf/decorator.py | 45 +++++++++++++------ python/cuda_cccl/tests/stf/test_decorator.py | 11 +++-- .../cuda_cccl/tests/stf/test_fhe_decorator.py | 1 - 4 files changed, 76 insertions(+), 26 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index c13ad04c233..e2c9fa29e26 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -243,6 +243,11 @@ cdef class logical_data: return out + def borrow_ctx_handle(self): + ctx = context(borrowed=True) + ctx.borrow_from_handle(self._ctx) + return ctx + class dep: __slots__ = ("ld", "mode", "dplace") def __init__(self, logical_data ld, int mode, dplace=None): @@ -255,6 +260,8 @@ class dep: yield self.dplace def __repr__(self): return f"dep({self.ld!r}, {self.mode}, {self.dplace!r})" + def get_ld(self): + return self.ld def read(ld, dplace=None): return dep(ld, AccessMode.READ.value, dplace) def write(ld, dplace=None): return dep(ld, AccessMode.WRITE.value, dplace) @@ -437,20 +444,42 @@ cdef class task: cdef class context: cdef stf_ctx_handle _ctx + # Is this a context that we have borrowed ? 
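+    # A borrowed context wraps an stf_ctx_handle owned elsewhere (for example
+    # one recovered from a logical_data via borrow_ctx_handle()); it is never
+    # created nor finalized here.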
+ cdef bint _borrowed + + def __cinit__(self, bint use_graph=False, bint borrowed=False): + self._ctx = NULL + self._borrowed = borrowed + if not borrowed: + if use_graph: + stf_ctx_create_graph(&self._ctx) + else: + stf_ctx_create(&self._ctx) - def __cinit__(self, bint use_graph=False): - if use_graph: - stf_ctx_create_graph(&self._ctx) - else: - stf_ctx_create(&self._ctx) + cdef borrow_from_handle(self, stf_ctx_handle ctx_handle): + if not self._ctx == NULL: + raise RuntimeError("context already initialized") + + if not self._borrowed: + raise RuntimeError("cannot call borrow_from_handle on this context") + + self._ctx = ctx_handle + print(f"borrowing ... new ctx handle = {ctx_handle} self={self}") + + def __repr__(self): + return f"context(handle={self._ctx}, borrowed={self._borrowed})" def __dealloc__(self): - self.finalize() + if not self._borrowed: + self.finalize() def finalize(self): + if self._borrowed: + raise RuntimeError("cannot finalize borrowed context") + if self._ctx != NULL: - stf_ctx_finalize(self._ctx) - self._ctx = NULL + stf_ctx_finalize(self._ctx) + self._ctx = NULL def logical_data(self, object buf): """ diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index 42dfc5b774a..7fb5e1a0337 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -12,26 +12,38 @@ def __init__(self, pyfunc, jit_args, jit_kwargs): self._jit_args = jit_args self._jit_kwargs = jit_kwargs self._compiled_kernel = None - self._launch_cfg = None # (gridDim, blockDim, context, exec_place?) + # (grid_dim, block_dim, ctx_or_none, exec_place_or_none) + self._launch_cfg = None def __getitem__(self, cfg): - if not (len(cfg) == 3 or len(cfg) == 4): - raise TypeError("use kernel[gridDim, blockDim, ctx (, exec_place)]") + # Normalize cfg into (grid_dim, block_dim, ctx, exec_pl) + if not (isinstance(cfg, tuple) or isinstance(cfg, list)): + raise TypeError("use kernel[grid, block (, ctx [, exec_place])]") + n = len(cfg) + if n not in (2, 3, 4): + raise TypeError( + "use kernel[grid, block], kernel[grid, block, ctx], or kernel[grid, block, ctx, exec_place]" + ) + + grid_dim = cfg[0] + block_dim = cfg[1] + ctx = None + exec_pl = None + + if n >= 3: + ctx = cfg[2] + + if n == 4: + exec_pl = cfg[3] - gridDim, blockDim, ctx, *rest = cfg - if not isinstance(ctx, context): - raise TypeError("3rd item must be an STF context") + # Type checks (ctx can be None; exec_pl can be None) + if ctx is not None and not isinstance(ctx, context): + raise TypeError("3rd item must be an STF context (or None to infer)") - exec_pl = rest[0] if rest else None - if exec_pl and not isinstance(exec_pl, exec_place): + if exec_pl is not None and not isinstance(exec_pl, exec_place): raise TypeError("4th item must be an exec_place") - self._launch_cfg = ( - tuple(gridDim) if isinstance(gridDim, tuple) else (int(gridDim),), - tuple(blockDim) if isinstance(blockDim, tuple) else (int(blockDim),), - ctx, - exec_pl, - ) + self._launch_cfg = (grid_dim, block_dim, ctx, exec_pl) return self @@ -47,6 +59,11 @@ def __call__(self, *args, **kwargs): for i, a in enumerate(args): print(f"got one arg {a} is dep ? 
{isinstance(a, dep)}") if isinstance(a, dep): + if ctx == None: + ld = a.get_ld() + # This context will be used in the __call__ method itself + # so we can create a temporary object from the handle + ctx = ld.borrow_ctx_handle() dep_items.append((i, a)) task_args = [exec_pl] if exec_pl else [] diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 79a198d1c4c..7a6ab7f3378 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -32,11 +32,16 @@ def test_decorator(use_graph): lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) - scale[32, 64, ctx](2.0, lX.rw()) - axpy[32, 64, ctx](2.0, lX.read(), lY.rw()) # default device + scale[32, 64](2.0, lX.rw()) + axpy[32, 64](2.0, lX.read(), lY.rw()) axpy[32, 64, ctx, cudastf.exec_place.device(0)]( 2.0, lX.read(), lZ.rw() ) # explicit exec place - axpy[32, 64, ctx]( + axpy[32, 64]( 2.0, lY.read(), lZ.rw(cudastf.data_place.device(0)) ) # per-dep placement override + + +if __name__ == "__main__": + print("Running CUDASTF examples...") + test_decorator(False) diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index ae8c6734f5e..d80733249fb 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -11,7 +11,6 @@ numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 import cuda.cccl.experimental.stf as cudastf - from cuda.cccl.experimental.stf._stf_bindings import ( context, data_place, From 9c0767916be1879c477a4bc82f2736d75b694926 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 14:19:17 +0200 Subject: [PATCH 103/221] invert ctx and exec place in the decorator --- .../cuda/cccl/experimental/stf/decorator.py | 19 ++++++++++--------- python/cuda_cccl/tests/stf/test_decorator.py | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index 7fb5e1a0337..eaf07a12610 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -12,17 +12,17 @@ def __init__(self, pyfunc, jit_args, jit_kwargs): self._jit_args = jit_args self._jit_kwargs = jit_kwargs self._compiled_kernel = None - # (grid_dim, block_dim, ctx_or_none, exec_place_or_none) + # (grid_dim, block_dim, exec_place_or_none, ctx_or_none) self._launch_cfg = None def __getitem__(self, cfg): - # Normalize cfg into (grid_dim, block_dim, ctx, exec_pl) + # Normalize cfg into (grid_dim, block_dim, exec_pl, ctx) if not (isinstance(cfg, tuple) or isinstance(cfg, list)): - raise TypeError("use kernel[grid, block (, ctx [, exec_place])]") + raise TypeError("use kernel[grid, block ([, exec_place, ctx])]") n = len(cfg) if n not in (2, 3, 4): raise TypeError( - "use kernel[grid, block], kernel[grid, block, ctx], or kernel[grid, block, ctx, exec_place]" + "use kernel[grid, block], kernel[grid, block, exec_place], or kernel[grid, block, exec_place, ctx]" ) grid_dim = cfg[0] @@ -31,17 +31,18 @@ def __getitem__(self, cfg): exec_pl = None if n >= 3: - ctx = cfg[2] + exec_pl = cfg[2] if n == 4: - exec_pl = cfg[3] + ctx = cfg[3] + + if exec_pl is not None and not isinstance(exec_pl, exec_place): + raise TypeError("3rd item must be an exec_place") # Type checks (ctx can be None; exec_pl can be None) if ctx is not None and not isinstance(ctx, context): - raise 
TypeError("3rd item must be an STF context (or None to infer)") + raise TypeError("4th item must be an STF context (or None to infer)") - if exec_pl is not None and not isinstance(exec_pl, exec_place): - raise TypeError("4th item must be an exec_place") self._launch_cfg = (grid_dim, block_dim, ctx, exec_pl) diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 7a6ab7f3378..49605ced878 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -34,7 +34,7 @@ def test_decorator(use_graph): scale[32, 64](2.0, lX.rw()) axpy[32, 64](2.0, lX.read(), lY.rw()) - axpy[32, 64, ctx, cudastf.exec_place.device(0)]( + axpy[32, 64, cudastf.exec_place.device(0)]( 2.0, lX.read(), lZ.rw() ) # explicit exec place axpy[32, 64]( From 947bbcc513de7397fc0c9b36cf4b3b73ba53469e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 14:20:45 +0200 Subject: [PATCH 104/221] fix decorator api --- python/cuda_cccl/tests/stf/test_fhe_decorator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index d80733249fb..024586f743c 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -86,7 +86,7 @@ def __init__(self, ctx, values=None, ld=None): def __invert__(self): result = Ciphertext(ctx, values=None, ld=self.l.like_empty()) - not_kernel[32, 16, ctx](self.l.read(), result.l.write()) + not_kernel[32, 16](self.l.read(), result.l.write()) return result @@ -96,7 +96,7 @@ def __or__(self, other): return NotImplemented result = Ciphertext(ctx, ld=self.l.like_empty()) - or_kernel[32, 16, ctx](self.l.read(), other.l.read(), result.l.write()) + or_kernel[32, 16](self.l.read(), other.l.read(), result.l.write()) return result @@ -106,7 +106,7 @@ def __and__(self, other): return NotImplemented result = Ciphertext(ctx, ld=self.l.like_empty()) - and_kernel[32, 16, ctx](self.l.read(), other.l.read(), result.l.write()) + and_kernel[32, 16](self.l.read(), other.l.read(), result.l.write()) return result @@ -116,7 +116,7 @@ def set_symbol(self, symbol: str): def decrypt(self): result = Ciphertext(ctx, ld=self.l.like_empty()) - xor_kernel[32, 16, ctx](self.l.read(), result.l.write(), 0x42) + xor_kernel[32, 16](self.l.read(), result.l.write(), 0x42) return Plaintext(self.ctx, ld=result.l) From 22b2d191608a7a8efde7e288050febbb6dbaeaca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 14:26:10 +0200 Subject: [PATCH 105/221] Add ciphertext.like_empty() --- python/cuda_cccl/tests/stf/test_fhe.py | 11 +++++++---- python/cuda_cccl/tests/stf/test_fhe_decorator.py | 11 ++++++----- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index e34bfd42834..c42742e952c 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -83,7 +83,7 @@ def __init__(self, ctx, values=None, ld=None): # ~ operator def __invert__(self): - result = Ciphertext(ctx, values=None, ld=self.l.like_empty()) + result = self.like_empty() with ctx.task(self.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -98,7 +98,7 @@ def __or__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result = Ciphertext(ctx, 
ld=self.l.like_empty()) + result = self.like_empty() with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -114,7 +114,7 @@ def __and__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result = Ciphertext(ctx, ld=self.l.like_empty()) + result = self.like_empty() with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -131,7 +131,8 @@ def set_symbol(self, symbol: str): self.symbol = symbol def decrypt(self): - result = Ciphertext(ctx, ld=self.l.like_empty()) + result = self.like_empty() + with ctx.task(self.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) da = t.get_arg_numba(0) @@ -141,6 +142,8 @@ def decrypt(self): return Plaintext(self.ctx, ld=result.l) + def like_empty(self): + return Ciphertext(self.ctx, ld=self.l.like_empty()) def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: return ~((eA | ~eB) & (~eA | eB)) diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index 024586f743c..8ed6ebbade5 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -84,8 +84,7 @@ def __init__(self, ctx, values=None, ld=None): # ~ operator def __invert__(self): - result = Ciphertext(ctx, values=None, ld=self.l.like_empty()) - + result = self.like_empty() not_kernel[32, 16](self.l.read(), result.l.write()) return result @@ -95,7 +94,7 @@ def __or__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result = Ciphertext(ctx, ld=self.l.like_empty()) + result = self.like_empty() or_kernel[32, 16](self.l.read(), other.l.read(), result.l.write()) return result @@ -105,7 +104,7 @@ def __and__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result = Ciphertext(ctx, ld=self.l.like_empty()) + result = self.like_empty() and_kernel[32, 16](self.l.read(), other.l.read(), result.l.write()) return result @@ -115,11 +114,13 @@ def set_symbol(self, symbol: str): self.symbol = symbol def decrypt(self): - result = Ciphertext(ctx, ld=self.l.like_empty()) + result = self.like_empty() xor_kernel[32, 16](self.l.read(), result.l.write(), 0x42) return Plaintext(self.ctx, ld=result.l) + def like_empty(self): + return Ciphertext(self.ctx, ld=self.l.like_empty()) def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: return ~((eA | ~eB) & (~eA | eB)) From 66bcde3eef60d9c87987987129946e914fc116fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 14:30:42 +0200 Subject: [PATCH 106/221] Removing prints --- cudax/include/cuda/experimental/__stf/internal/task.cuh | 1 - .../cuda/cccl/experimental/stf/_stf_bindings_impl.pyx | 2 +- python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py | 8 ++++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/task.cuh b/cudax/include/cuda/experimental/__stf/internal/task.cuh index 2b9945c4d81..7c2535e6a97 100644 --- a/cudax/include/cuda/experimental/__stf/internal/task.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/task.cuh @@ -369,7 +369,6 @@ public: void enable_capture() { - fprintf(stderr, "task enable capture (generic task)\n"); pimpl->enable_capture = true; } diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 
e2c9fa29e26..49453be4ad3 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -464,7 +464,7 @@ cdef class context: raise RuntimeError("cannot call borrow_from_handle on this context") self._ctx = ctx_handle - print(f"borrowing ... new ctx handle = {ctx_handle} self={self}") + # print(f"borrowing ... new ctx handle = {ctx_handle} self={self}") def __repr__(self): return f"context(handle={self._ctx}, borrowed={self._borrowed})" diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index eaf07a12610..37ed671ae00 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -58,7 +58,7 @@ def __call__(self, *args, **kwargs): dep_items = [] for i, a in enumerate(args): - print(f"got one arg {a} is dep ? {isinstance(a, dep)}") + # print(f"got one arg {a} is dep ? {isinstance(a, dep)}") if isinstance(a, dep): if ctx == None: ld = a.get_ld() @@ -72,13 +72,13 @@ def __call__(self, *args, **kwargs): with ctx.task(*task_args) as t: dev_args = list(args) - print(dev_args) + # print(dev_args) for dep_index, (pos, _) in enumerate(dep_items): - print(f"set arg {dep_index} at position {pos}") + # print(f"set arg {dep_index} at position {pos}") dev_args[pos] = t.get_arg_numba(dep_index) if self._compiled_kernel is None: - print("compile kernel") + # print("compile kernel") self._compiled_kernel = cuda.jit(*self._jit_args, **self._jit_kwargs)( self._pyfunc ) From 84534c8a2724a6b26cdf870ea46f169049126605 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 14:54:22 +0200 Subject: [PATCH 107/221] do not import specific methods --- python/cuda_cccl/tests/stf/test_fhe.py | 11 +++-------- python/cuda_cccl/tests/stf/test_fhe_decorator.py | 10 ++-------- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index c42742e952c..63f2ec8dc02 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -10,12 +10,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -from cuda.cccl.experimental.stf._stf_bindings import ( - context, - data_place, - exec_place, -) - +import cuda.cccl.experimental.stf as cudastf class Plaintext: # Initialize from actual values, or from a logical data @@ -37,7 +32,7 @@ def encrypt(self) -> "Ciphertext": return Ciphertext(self.ctx, values=encrypted) def print_values(self): - with ctx.task(exec_place.host(), self.l.read(data_place.managed())) as t: + with ctx.task(cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed())) as t: nb_stream = cuda.external_stream(t.stream_ptr()) hvalues = t.get_arg_numba(0) print([v for v in hvalues]) @@ -149,7 +144,7 @@ def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: return ~((eA | ~eB) & (~eA | eB)) -ctx = context(use_graph=False) +ctx = cudastf.context(use_graph=False) vA = [3, 3, 2, 2, 17] pA = Plaintext(ctx, vA) diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index 8ed6ebbade5..a84ed8687c6 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -11,12 +11,6 @@ numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 import cuda.cccl.experimental.stf as cudastf -from 
cuda.cccl.experimental.stf._stf_bindings import ( - context, - data_place, - exec_place, -) - class Plaintext: # Initialize from actual values, or from a logical data @@ -38,7 +32,7 @@ def encrypt(self) -> "Ciphertext": return Ciphertext(self.ctx, values=encrypted) def print_values(self): - with ctx.task(exec_place.host(), self.l.read(data_place.managed())) as t: + with ctx.task(cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed())) as t: nb_stream = cuda.external_stream(t.stream_ptr()) hvalues = t.get_arg_numba(0) print([v for v in hvalues]) @@ -126,7 +120,7 @@ def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: return ~((eA | ~eB) & (~eA | eB)) -ctx = context(use_graph=False) +ctx = cudastf.context(use_graph=False) vA = [3, 3, 2, 2, 17] pA = Plaintext(ctx, vA) From acf0cce6035f71407f9bc102df09135b2eba3f92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 14:56:26 +0200 Subject: [PATCH 108/221] fix decorator api --- python/cuda_cccl/tests/stf/test_stencil_decorator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index c20414190db..8e52a72f00a 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -56,7 +56,7 @@ def test_numba2d(): (ny + threads_per_block[1] - 1) // threads_per_block[1], ) - laplacian_5pt_kernel[blocks_per_grid, threads_per_block, ctx]( + laplacian_5pt_kernel[blocks_per_grid, threads_per_block]( lu.read(), lu_out.write(), dx, dy ) From 6a6e84fbe0377ae6b152acd4f34754510039635b Mon Sep 17 00:00:00 2001 From: root Date: Fri, 29 Aug 2025 08:05:28 -0700 Subject: [PATCH 109/221] Add a pytorch experiment --- python/cuda_cccl/tests/stf/test_pytorch.py | 85 ++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 python/cuda_cccl/tests/stf/test_pytorch.py diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py new file mode 100644 index 00000000000..41d1b2f1ca9 --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -0,0 +1,85 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + + +import numba +import numpy as np +import pytest +import torch +from numba import cuda + +numba.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + +from cuda.cccl.experimental.stf._stf_bindings import ( + context, + data_place, + exec_place, + read, + rw, + write, +) + +import torch + +def torch_from_cai(obj): + """ + Convert an object exposing the CUDA Array Interface (__cuda_array_interface__) + into a torch.Tensor (on GPU). Zero-copy if possible. + + Strategy: + 1. If obj has .to_dlpack(), use it directly. + 2. Otherwise, try to wrap with CuPy (which understands CAI) and then use DLPack. + """ + # Path 1: direct DLPack (Numba >=0.53, some other libs) + if hasattr(obj, "to_dlpack"): + return torch.utils.dlpack.from_dlpack(obj.to_dlpack()) + + # Path 2: via CuPy bridge + try: + import cupy as cp + except ImportError as e: + raise RuntimeError( + "Object does not support .to_dlpack and CuPy is not installed. " + "Cannot convert __cuda_array_interface__ to torch.Tensor." 
+ ) from e + + # CuPy knows how to wrap CAI + cupy_arr = cp.asarray(obj) + return torch.utils.dlpack.from_dlpack(cupy_arr.toDlpack()) + + +def test_pytorch(): + n = 1024 * 1024 + X = np.ones(n, dtype=np.float32) + Y = np.ones(n, dtype=np.float32) + Z = np.ones(n, dtype=np.float32) + + ctx = context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) + + with ctx.task(rw(lX)) as t: + sptr = t.stream_ptr() + torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) + with torch.cuda.stream(torch_stream): + # dX = t.get_arg_numba(0) + dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) + tX = torch_from_cai(dX) + # capsule = dX.toDlpack() + # tX = torch.utils.dlpack.from_dlpack(capsule) + tX = tX*2 + pass +# nb_stream = cuda.external_stream(t.stream_ptr()) + + # dX = dX * 2 + pass + + ctx.finalize() + +if __name__ == "__main__": + print("Running CUDASTF examples...") + test_pytorch() + From 297a69bd6a562cbd49595d92e98e35aa61874f0a Mon Sep 17 00:00:00 2001 From: root Date: Fri, 29 Aug 2025 08:17:18 -0700 Subject: [PATCH 110/221] more pytorch test --- python/cuda_cccl/tests/stf/test_pytorch.py | 40 +++++++++++++++++++--- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index 41d1b2f1ca9..6949292c53c 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -65,16 +65,46 @@ def test_pytorch(): sptr = t.stream_ptr() torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) with torch.cuda.stream(torch_stream): - # dX = t.get_arg_numba(0) dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) tX = torch_from_cai(dX) - # capsule = dX.toDlpack() - # tX = torch.utils.dlpack.from_dlpack(capsule) tX = tX*2 pass -# nb_stream = cuda.external_stream(t.stream_ptr()) + pass - # dX = dX * 2 + with ctx.task(lX.read(), lY.write()) as t: + sptr = t.stream_ptr() + torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) + with torch.cuda.stream(torch_stream): + dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) + tX = torch_from_cai(dX) + dY = cuda.from_cuda_array_interface(t.get_arg_cai(1), owner=None, sync=False) + tY = torch_from_cai(dY) + tY = tX*2 + pass + pass + + with ctx.task(lX.read(), lZ.write()) as t: + sptr = t.stream_ptr() + torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) + with torch.cuda.stream(torch_stream): + dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) + tX = torch_from_cai(dX) + dZ = cuda.from_cuda_array_interface(t.get_arg_cai(1), owner=None, sync=False) + tZ = torch_from_cai(dY) + tZ = tX*4 + 1 + pass + pass + + with ctx.task(lY.read(), lZ.rw()) as t: + sptr = t.stream_ptr() + torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) + with torch.cuda.stream(torch_stream): + dY = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) + tY = torch_from_cai(dX) + dZ = cuda.from_cuda_array_interface(t.get_arg_cai(1), owner=None, sync=False) + tZ = torch_from_cai(dY) + tZ = tY*2 - 3 + pass pass ctx.finalize() From 533ca5a67af0d9e8e7e7ff74196cac6efdaf204d Mon Sep 17 00:00:00 2001 From: root Date: Fri, 29 Aug 2025 08:41:01 -0700 Subject: [PATCH 111/221] better interop with pytorch --- .../experimental/stf/_stf_bindings_impl.pyx | 38 +++++++++++++++++++ 
python/cuda_cccl/tests/stf/test_pytorch.py | 19 ++++------ 2 files changed, 45 insertions(+), 12 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 49453be4ad3..185b0e03467 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -160,6 +160,40 @@ class stf_arg_cai: 'stream': self.stream, # CUDA stream for access } +import torch + +class CAIWrapper: + def __init__(self, cai_dict): + self.__cuda_array_interface__ = cai_dict + +def torch_from_cai(obj): + """ + Convert an object exposing the CUDA Array Interface (__cuda_array_interface__) + into a torch.Tensor (on GPU). Zero-copy if possible. + + Strategy: + 1. If obj has .to_dlpack(), use it directly. + 2. Otherwise, try to wrap with CuPy (which understands CAI) and then use DLPack. + """ + # Path 1: direct DLPack (Numba >=0.53, some other libs) + if hasattr(obj, "to_dlpack"): + return torch.utils.dlpack.from_dlpack(obj.to_dlpack()) + + # Path 2: via CuPy bridge + try: + import cupy as cp + except ImportError as e: + raise RuntimeError( + "Object does not support .to_dlpack and CuPy is not installed. " + "Cannot convert __cuda_array_interface__ to torch.Tensor." + ) from e + + #if isinstance(obj, dict) and "__cuda_array_interface__" in obj: + obj = CAIWrapper(obj) # wrap the dict + + cupy_arr = cp.asarray(obj) + return torch.utils.dlpack.from_dlpack(cupy_arr.toDlpack()) + cdef class logical_data: cdef stf_logical_data_handle _ld cdef stf_ctx_handle _ctx @@ -430,6 +464,10 @@ cdef class task: cai = self.get_arg_cai(index) return cuda.from_cuda_array_interface(cai, owner=None, sync=False) + def get_arg_as_tensor(self, index): + cai = self.get_arg_cai(index) + return torch_from_cai(cai) + # ---- context‑manager helpers ------------------------------- def __enter__(self): self.start() diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index 6949292c53c..d5d4c913aea 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -67,6 +67,7 @@ def test_pytorch(): with torch.cuda.stream(torch_stream): dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) tX = torch_from_cai(dX) + # same as tX =t.get_arg_as_tensor(0) tX = tX*2 pass pass @@ -75,10 +76,8 @@ def test_pytorch(): sptr = t.stream_ptr() torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) with torch.cuda.stream(torch_stream): - dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) - tX = torch_from_cai(dX) - dY = cuda.from_cuda_array_interface(t.get_arg_cai(1), owner=None, sync=False) - tY = torch_from_cai(dY) + tX =t.get_arg_as_tensor(0) + tY =t.get_arg_as_tensor(1) tY = tX*2 pass pass @@ -87,10 +86,8 @@ def test_pytorch(): sptr = t.stream_ptr() torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) with torch.cuda.stream(torch_stream): - dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) - tX = torch_from_cai(dX) - dZ = cuda.from_cuda_array_interface(t.get_arg_cai(1), owner=None, sync=False) - tZ = torch_from_cai(dY) + tX =t.get_arg_as_tensor(0) + tZ =t.get_arg_as_tensor(1) tZ = tX*4 + 1 pass pass @@ -99,10 +96,8 @@ def test_pytorch(): sptr = t.stream_ptr() torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) with 
torch.cuda.stream(torch_stream): - dY = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) - tY = torch_from_cai(dX) - dZ = cuda.from_cuda_array_interface(t.get_arg_cai(1), owner=None, sync=False) - tZ = torch_from_cai(dY) + tY =t.get_arg_as_tensor(0) + tZ =t.get_arg_as_tensor(1) tZ = tY*2 - 3 pass pass From 9aa749f038d37068698f4c36824e48a578a560f5 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 29 Aug 2025 08:49:28 -0700 Subject: [PATCH 112/221] remove useless pass --- python/cuda_cccl/tests/stf/test_pytorch.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index d5d4c913aea..f44226f476f 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -69,8 +69,6 @@ def test_pytorch(): tX = torch_from_cai(dX) # same as tX =t.get_arg_as_tensor(0) tX = tX*2 - pass - pass with ctx.task(lX.read(), lY.write()) as t: sptr = t.stream_ptr() @@ -79,8 +77,6 @@ def test_pytorch(): tX =t.get_arg_as_tensor(0) tY =t.get_arg_as_tensor(1) tY = tX*2 - pass - pass with ctx.task(lX.read(), lZ.write()) as t: sptr = t.stream_ptr() @@ -89,8 +85,6 @@ def test_pytorch(): tX =t.get_arg_as_tensor(0) tZ =t.get_arg_as_tensor(1) tZ = tX*4 + 1 - pass - pass with ctx.task(lY.read(), lZ.rw()) as t: sptr = t.stream_ptr() @@ -99,8 +93,6 @@ def test_pytorch(): tY =t.get_arg_as_tensor(0) tZ =t.get_arg_as_tensor(1) tZ = tY*2 - 3 - pass - pass ctx.finalize() From b11aa4b3e53faa7b8c18240d34498463ffe3c837 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 29 Aug 2025 09:28:00 -0700 Subject: [PATCH 113/221] tensor_arguments --- .../cuda/cccl/experimental/stf/_stf_bindings_impl.pyx | 4 ++++ python/cuda_cccl/tests/stf/test_pytorch.py | 6 ++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 185b0e03467..9d04737753e 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -468,6 +468,10 @@ cdef class task: cai = self.get_arg_cai(index) return torch_from_cai(cai) + def tensor_arguments(self): + arg_cnt=len(self._lds_args) + return tuple(self.get_arg_as_tensor(i) for i in range(arg_cnt)) + # ---- context‑manager helpers ------------------------------- def __enter__(self): self.start() diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index f44226f476f..8ca1585dd24 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -82,16 +82,14 @@ def test_pytorch(): sptr = t.stream_ptr() torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) with torch.cuda.stream(torch_stream): - tX =t.get_arg_as_tensor(0) - tZ =t.get_arg_as_tensor(1) + tX, tY = t.tensor_arguments() tZ = tX*4 + 1 with ctx.task(lY.read(), lZ.rw()) as t: sptr = t.stream_ptr() torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) with torch.cuda.stream(torch_stream): - tY =t.get_arg_as_tensor(0) - tZ =t.get_arg_as_tensor(1) + tX, tZ = t.tensor_arguments() tZ = tY*2 - 3 ctx.finalize() From 0af151f2a6a27d95f7201567226dac1faa0176c9 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 29 Aug 2025 10:19:51 -0700 Subject: [PATCH 114/221] simpler code --- python/cuda_cccl/tests/stf/test_pytorch.py | 24 ++++++++-------------- 1 file 
changed, 8 insertions(+), 16 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index 8ca1585dd24..98c0f9cb220 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -62,8 +62,7 @@ def test_pytorch(): lZ = ctx.logical_data(Z) with ctx.task(rw(lX)) as t: - sptr = t.stream_ptr() - torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) + torch_stream = torch.cuda.ExternalStream(t.stream_ptr()) with torch.cuda.stream(torch_stream): dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) tX = torch_from_cai(dX) @@ -71,26 +70,19 @@ def test_pytorch(): tX = tX*2 with ctx.task(lX.read(), lY.write()) as t: - sptr = t.stream_ptr() - torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) + torch_stream = torch.cuda.ExternalStream(t.stream_ptr()) with torch.cuda.stream(torch_stream): tX =t.get_arg_as_tensor(0) tY =t.get_arg_as_tensor(1) tY = tX*2 - with ctx.task(lX.read(), lZ.write()) as t: - sptr = t.stream_ptr() - torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) - with torch.cuda.stream(torch_stream): - tX, tY = t.tensor_arguments() - tZ = tX*4 + 1 + with ctx.task(lX.read(), lZ.write()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + tX, tY = t.tensor_arguments() + tZ = tX*4 + 1 - with ctx.task(lY.read(), lZ.rw()) as t: - sptr = t.stream_ptr() - torch_stream = torch.cuda.ExternalStream(sptr, device=torch.device("cuda:0")) - with torch.cuda.stream(torch_stream): - tX, tZ = t.tensor_arguments() - tZ = tY*2 - 3 + with ctx.task(lY.read(), lZ.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + tX, tZ = t.tensor_arguments() + tZ = tY*2 - 3 ctx.finalize() From 746d308b4a82b682a3dcf2b963107a47a63110d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 29 Aug 2025 21:34:28 +0200 Subject: [PATCH 115/221] pre-commit hooks --- .../cuda/cccl/experimental/stf/decorator.py | 9 ++--- python/cuda_cccl/tests/stf/test_fhe.py | 6 ++- .../cuda_cccl/tests/stf/test_fhe_decorator.py | 6 ++- python/cuda_cccl/tests/stf/test_pytorch.py | 40 ++++++++++--------- 4 files changed, 35 insertions(+), 26 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index 37ed671ae00..c7179d2a6fc 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -43,7 +43,6 @@ def __getitem__(self, cfg): if ctx is not None and not isinstance(ctx, context): raise TypeError("4th item must be an STF context (or None to infer)") - self._launch_cfg = (grid_dim, block_dim, ctx, exec_pl) return self @@ -58,7 +57,7 @@ def __call__(self, *args, **kwargs): dep_items = [] for i, a in enumerate(args): - # print(f"got one arg {a} is dep ? {isinstance(a, dep)}") + # print(f"got one arg {a} is dep ? 
{isinstance(a, dep)}") if isinstance(a, dep): if ctx == None: ld = a.get_ld() @@ -72,13 +71,13 @@ def __call__(self, *args, **kwargs): with ctx.task(*task_args) as t: dev_args = list(args) - # print(dev_args) + # print(dev_args) for dep_index, (pos, _) in enumerate(dep_items): - # print(f"set arg {dep_index} at position {pos}") + # print(f"set arg {dep_index} at position {pos}") dev_args[pos] = t.get_arg_numba(dep_index) if self._compiled_kernel is None: - # print("compile kernel") + # print("compile kernel") self._compiled_kernel = cuda.jit(*self._jit_args, **self._jit_kwargs)( self._pyfunc ) diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index 63f2ec8dc02..9ec86af51c8 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -12,6 +12,7 @@ import cuda.cccl.experimental.stf as cudastf + class Plaintext: # Initialize from actual values, or from a logical data def __init__(self, ctx, values=None, ld=None): @@ -32,7 +33,9 @@ def encrypt(self) -> "Ciphertext": return Ciphertext(self.ctx, values=encrypted) def print_values(self): - with ctx.task(cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed())) as t: + with ctx.task( + cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) + ) as t: nb_stream = cuda.external_stream(t.stream_ptr()) hvalues = t.get_arg_numba(0) print([v for v in hvalues]) @@ -140,6 +143,7 @@ def decrypt(self): def like_empty(self): return Ciphertext(self.ctx, ld=self.l.like_empty()) + def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: return ~((eA | ~eB) & (~eA | eB)) diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index a84ed8687c6..bb369b6f250 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -12,6 +12,7 @@ import cuda.cccl.experimental.stf as cudastf + class Plaintext: # Initialize from actual values, or from a logical data def __init__(self, ctx, values=None, ld=None): @@ -32,7 +33,9 @@ def encrypt(self) -> "Ciphertext": return Ciphertext(self.ctx, values=encrypted) def print_values(self): - with ctx.task(cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed())) as t: + with ctx.task( + cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) + ) as t: nb_stream = cuda.external_stream(t.stream_ptr()) hvalues = t.get_arg_numba(0) print([v for v in hvalues]) @@ -116,6 +119,7 @@ def decrypt(self): def like_empty(self): return Ciphertext(self.ctx, ld=self.l.like_empty()) + def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: return ~((eA | ~eB) & (~eA | eB)) diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index 98c0f9cb220..a487eaca1c1 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -5,7 +5,6 @@ import numba import numpy as np -import pytest import torch from numba import cuda @@ -13,15 +12,10 @@ numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 from cuda.cccl.experimental.stf._stf_bindings import ( - context, - data_place, - exec_place, - read, + context, rw, - write, ) -import torch def torch_from_cai(obj): """ @@ -64,29 +58,37 @@ def test_pytorch(): with ctx.task(rw(lX)) as t: torch_stream = torch.cuda.ExternalStream(t.stream_ptr()) with torch.cuda.stream(torch_stream): - dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) + dX = 
cuda.from_cuda_array_interface( + t.get_arg_cai(0), owner=None, sync=False + ) tX = torch_from_cai(dX) - # same as tX =t.get_arg_as_tensor(0) - tX = tX*2 + # same as tX =t.get_arg_as_tensor(0) + tX = tX * 2 with ctx.task(lX.read(), lY.write()) as t: torch_stream = torch.cuda.ExternalStream(t.stream_ptr()) with torch.cuda.stream(torch_stream): - tX =t.get_arg_as_tensor(0) - tY =t.get_arg_as_tensor(1) - tY = tX*2 - - with ctx.task(lX.read(), lZ.write()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + tX = t.get_arg_as_tensor(0) + tY = t.get_arg_as_tensor(1) + tY = tX * 2 + + with ( + ctx.task(lX.read(), lZ.write()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): tX, tY = t.tensor_arguments() - tZ = tX*4 + 1 + tZ = tX * 4 + 1 - with ctx.task(lY.read(), lZ.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(lY.read(), lZ.rw()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): tX, tZ = t.tensor_arguments() - tZ = tY*2 - 3 + tZ = tY * 2 - 3 ctx.finalize() + if __name__ == "__main__": print("Running CUDASTF examples...") test_pytorch() - From d9195f581410c8e1fb6ea9ddfdae8cc649c0c27e Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 31 Aug 2025 16:50:23 +0200 Subject: [PATCH 116/221] try to remove dependency on torch and have adapters (WIP) --- .../cuda/cccl/experimental/stf/__init__.py | 8 ++++ .../stf/_adapters/numba_bridge.py | 3 ++ .../stf/_adapters/torch_bridge.py | 36 +++++++++++++++++ .../experimental/stf/_stf_bindings_impl.pyx | 40 +++---------------- 4 files changed, 52 insertions(+), 35 deletions(-) create mode 100644 python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py create mode 100644 python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py index 873b31b7dcb..2e63ff6f856 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py @@ -13,3 +13,11 @@ "data_place", "jit", ] + +def has_torch() -> bool: + import importlib.util + return importlib.util.find_spec("torch") is not None + +def has_numba() -> bool: + import importlib.util + return importlib.util.find_spec("numba") is not None diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py new file mode 100644 index 00000000000..ac42377dcd2 --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py @@ -0,0 +1,3 @@ +def cai_to_numba(cai: dict): + from numba import cuda as _cuda + return _cuda.from_cuda_array_interface(cai) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py new file mode 100644 index 00000000000..9bf6feea784 --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +def cai_to_torch(cai: dict): + """ + Convert a __cuda_array_interface__ dict to a torch.Tensor + without making PyTorch a hard dependency of the core extension. + + Strategy (in order): + 1) Try Numba -> DLPack -> torch (fast & common). + 2) Try CuPy -> DLPack -> torch (common on CUDA setups). + 3) Otherwise, error with a clear message. 
+ """ + import torch + + # 1) Numba bridge + try: + from numba import cuda as _cuda + dev_array = _cuda.from_cuda_array_interface(cai) + return torch.utils.dlpack.from_dlpack(dev_array.to_dlpack()) + except Exception: + pass + + # 2) CuPy bridge + try: + import cupy as cp + + class _cai_wrapper: + def __init__(self, d): self.__cuda_array_interface__ = d + + cp_arr = cp.asarray(_cai_wrapper(cai)) + return torch.utils.dlpack.from_dlpack(cp_arr.toDlpack()) + except Exception as e: + raise RuntimeError( + "Could not convert __cuda_array_interface__ to torch.Tensor. " + "Install numba or cupy (or expose a DLPack capsule natively)." + ) from e diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 9d04737753e..be4cce311b9 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -160,40 +160,6 @@ class stf_arg_cai: 'stream': self.stream, # CUDA stream for access } -import torch - -class CAIWrapper: - def __init__(self, cai_dict): - self.__cuda_array_interface__ = cai_dict - -def torch_from_cai(obj): - """ - Convert an object exposing the CUDA Array Interface (__cuda_array_interface__) - into a torch.Tensor (on GPU). Zero-copy if possible. - - Strategy: - 1. If obj has .to_dlpack(), use it directly. - 2. Otherwise, try to wrap with CuPy (which understands CAI) and then use DLPack. - """ - # Path 1: direct DLPack (Numba >=0.53, some other libs) - if hasattr(obj, "to_dlpack"): - return torch.utils.dlpack.from_dlpack(obj.to_dlpack()) - - # Path 2: via CuPy bridge - try: - import cupy as cp - except ImportError as e: - raise RuntimeError( - "Object does not support .to_dlpack and CuPy is not installed. " - "Cannot convert __cuda_array_interface__ to torch.Tensor." - ) from e - - #if isinstance(obj, dict) and "__cuda_array_interface__" in obj: - obj = CAIWrapper(obj) # wrap the dict - - cupy_arr = cp.asarray(obj) - return torch.utils.dlpack.from_dlpack(cupy_arr.toDlpack()) - cdef class logical_data: cdef stf_logical_data_handle _ld cdef stf_ctx_handle _ctx @@ -466,7 +432,11 @@ cdef class task: def get_arg_as_tensor(self, index): cai = self.get_arg_cai(index) - return torch_from_cai(cai) + try: + from cuda.cccl.experimental.stf._adapters.torch_bridge import cai_to_torch + except Exception as e: + raise RuntimeError("PyTorch support is not available") from e + return cai_to_torch(cai) def tensor_arguments(self): arg_cnt=len(self._lds_args) From f5ac828ff7bf6b90cf844f737844f0f9e8ba540c Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 31 Aug 2025 16:51:52 +0200 Subject: [PATCH 117/221] remove unused code --- .../cuda/cccl/experimental/stf/_stf_bindings_impl.pyx | 8 -------- 1 file changed, 8 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index be4cce311b9..3948ff9a4b4 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -410,14 +410,6 @@ cdef class task: cdef CUstream s = stf_task_get_custream(self._t) return s # cast pointer -> Py int - def stream_cdata(self): - """ - Return the raw CUstream as a ctypes void pointer. - This can be passed directly to torch.cuda.Stream(cdata=...). 
- """ - cdef CUstream s = stf_task_get_custream(self._t) - return ctypes.c_void_p( s) - def get_arg(self, index) -> int: cdef void *ptr = stf_task_get(self._t, index) return ptr From 454a5dac1af2d32e724d181174d8e48b7ccc23ab Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 31 Aug 2025 18:13:22 +0200 Subject: [PATCH 118/221] cleanups --- .../experimental/stf/_stf_bindings_impl.pyx | 9 +++++--- python/cuda_cccl/tests/stf/test_numba.py | 22 ++++--------------- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 3948ff9a4b4..d4cda1107c4 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -13,9 +13,8 @@ from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t from libc.stdint cimport uintptr_t from libc.string cimport memset, memcpy +# TODO remove that dependency import numpy as np -from numba import cuda - from cpython.buffer cimport ( Py_buffer, PyBUF_SIMPLE, PyBUF_ANY_CONTIGUOUS, @@ -420,7 +419,11 @@ cdef class task: def get_arg_numba(self, index): cai = self.get_arg_cai(index) - return cuda.from_cuda_array_interface(cai, owner=None, sync=False) + try: + from cuda.cccl.experimental.stf._adapters.numba_bridge import cai_to_numba + except Exception as e: + raise RuntimeError("numba support is not available") from e + return cai_to_numba(cai) def get_arg_as_tensor(self, index): cai = self.get_arg_cai(index) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index a52276295ac..ed7105386ea 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -44,7 +44,7 @@ def test_numba_graph(): nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) scale[32, 64, nb_stream](2.0, dX) - pass + ctx.finalize() @@ -54,7 +54,7 @@ def test_numba(): Y = np.ones(n, dtype=np.float32) Z = np.ones(n, dtype=np.float32) - ctx = context(use_graph=True) + ctx = context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) @@ -64,7 +64,6 @@ def test_numba(): dX = t.get_arg_numba(0) # dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) scale[32, 64, nb_stream](2.0, dX) - pass with ctx.task(read(lX), rw(lY)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -72,21 +71,18 @@ def test_numba(): dX = t.get_arg_numba(0) dY = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dY) - pass with ctx.task(read(lX), rw(lZ)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dZ) - pass with ctx.task(read(lY), rw(lZ)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dY = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dY, dZ) - pass ctx.finalize() @@ -145,7 +141,6 @@ def test_numba2d(): laplacian_5pt_kernel[blocks_per_grid, threads_per_block, nb_stream]( du, du_out, dx, dy ) - pass ctx.finalize() @@ -183,7 +178,6 @@ def test_numba_exec_place(): # dX = t.get_arg_numba(0) dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) scale[32, 64, nb_stream](2.0, dX) - pass with ctx.task(exec_place.device(0), lX.read(), lY.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -191,7 +185,6 @@ def test_numba_exec_place(): dX = t.get_arg_numba(0) dY = 
t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dY) - pass with ctx.task( exec_place.device(0), lX.read(data_place.managed()), lZ.rw(data_place.managed()) @@ -200,14 +193,12 @@ def test_numba_exec_place(): dX = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dZ) - pass with ctx.task(exec_place.device(0), lY.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dY = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dY, dZ) - pass def test_numba_places(): @@ -228,7 +219,6 @@ def test_numba_places(): nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) scale[32, 64, nb_stream](2.0, dX) - pass with ctx.task(lX.read(), lY.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) @@ -236,24 +226,20 @@ def test_numba_places(): dX = t.get_arg_numba(0) dY = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dY) - pass with ctx.task(exec_place.device(1), lX.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dZ) - pass with ctx.task(lY.read(), lZ.rw(data_place.device(1))) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dY = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dY, dZ) - pass - if __name__ == "__main__": print("Running CUDASTF examples...") - # test_numba_graph() - test_numba() + test_numba_graph() + # test_numba() From ccfbb6b016b60afe7d2a543f11fb19c6aa6faedd Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 31 Aug 2025 18:23:02 +0200 Subject: [PATCH 119/221] fix numba adapter --- .../cuda/cccl/experimental/stf/_adapters/numba_bridge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py index ac42377dcd2..7e411e91f12 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py @@ -1,3 +1,3 @@ def cai_to_numba(cai: dict): - from numba import cuda as _cuda - return _cuda.from_cuda_array_interface(cai) + from numba import cuda + return cuda.from_cuda_array_interface(cai, owner=None, sync=False) From c6e7c07ad3641cfa96f24e8a8b83a4417eeed5db Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 31 Aug 2025 18:26:48 +0200 Subject: [PATCH 120/221] skip torch test if torch is not available --- python/cuda_cccl/tests/stf/test_pytorch.py | 36 ++-------------------- 1 file changed, 3 insertions(+), 33 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index a487eaca1c1..f5bf448b430 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -5,7 +5,8 @@ import numba import numpy as np -import torch +import pytest +torch = pytest.importorskip("torch") from numba import cuda numba.config.CUDA_ENABLE_PYNVJITLINK = 1 @@ -17,33 +18,6 @@ ) -def torch_from_cai(obj): - """ - Convert an object exposing the CUDA Array Interface (__cuda_array_interface__) - into a torch.Tensor (on GPU). Zero-copy if possible. - - Strategy: - 1. If obj has .to_dlpack(), use it directly. - 2. Otherwise, try to wrap with CuPy (which understands CAI) and then use DLPack. 
- """ - # Path 1: direct DLPack (Numba >=0.53, some other libs) - if hasattr(obj, "to_dlpack"): - return torch.utils.dlpack.from_dlpack(obj.to_dlpack()) - - # Path 2: via CuPy bridge - try: - import cupy as cp - except ImportError as e: - raise RuntimeError( - "Object does not support .to_dlpack and CuPy is not installed. " - "Cannot convert __cuda_array_interface__ to torch.Tensor." - ) from e - - # CuPy knows how to wrap CAI - cupy_arr = cp.asarray(obj) - return torch.utils.dlpack.from_dlpack(cupy_arr.toDlpack()) - - def test_pytorch(): n = 1024 * 1024 X = np.ones(n, dtype=np.float32) @@ -58,11 +32,7 @@ def test_pytorch(): with ctx.task(rw(lX)) as t: torch_stream = torch.cuda.ExternalStream(t.stream_ptr()) with torch.cuda.stream(torch_stream): - dX = cuda.from_cuda_array_interface( - t.get_arg_cai(0), owner=None, sync=False - ) - tX = torch_from_cai(dX) - # same as tX =t.get_arg_as_tensor(0) + tX = t.tensor_arguments() tX = tX * 2 with ctx.task(lX.read(), lY.write()) as t: From 842a6516d64d68e1d0cac06971f2ecaf2a5dca6d Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 31 Aug 2025 19:05:00 +0200 Subject: [PATCH 121/221] add dot vertex even in the low level api --- .../cuda/experimental/__stf/graph/graph_task.cuh | 11 ++++++----- .../experimental/__stf/stream/stream_task.cuh | 16 ++++++---------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh index 4896faa28c2..c9426f71c53 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh @@ -108,6 +108,12 @@ public: cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal)); } + auto& dot = ctx.get_dot(); + if (dot.is_tracing()) + { + dot.template add_vertex(*this); + } + return *this; } @@ -598,11 +604,6 @@ public: clear(); }; - if (dot.is_tracing()) - { - dot.template add_vertex(*this); - } - constexpr bool fun_invocable_stream_deps = ::std::is_invocable_v; constexpr bool fun_invocable_stream_non_void_deps = reserved::is_applicable_v>; diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh index ae5dd3fb77b..4f89ff3d191 100644 --- a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh @@ -203,6 +203,12 @@ public: insert_dependencies(stream_grid); } + auto& dot = ctx.get_dot(); + if (dot->is_tracing()) + { + dot->template add_vertex(*this); + } + return *this; } @@ -308,11 +314,6 @@ public: clear(); }; - if (dot->is_tracing()) - { - dot->template add_vertex(*this); - } - // Default for the first argument is a `cudaStream_t`. 
if constexpr (::std::is_invocable_v) { @@ -575,11 +576,6 @@ public: clear(); }; - if (dot->is_tracing()) - { - dot->template add_vertex(*this); - } - if constexpr (::std::is_invocable_v) { // Invoke passing this task's stream as the first argument, followed by the slices From 00c649cd068f3cc072e2c904cdf91b7b3098991a Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 31 Aug 2025 19:11:02 +0200 Subject: [PATCH 122/221] fix types --- cudax/include/cuda/experimental/__stf/graph/graph_task.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh index c9426f71c53..d243cb32634 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh @@ -108,7 +108,7 @@ public: cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal)); } - auto& dot = ctx.get_dot(); + auto& dot = *ctx.get_dot(); if (dot.is_tracing()) { dot.template add_vertex(*this); From b0fc18dbb4a70ea81e33ab8d0c6e16f58ab60489 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 31 Aug 2025 19:11:28 +0200 Subject: [PATCH 123/221] pre-commit hooks --- python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py | 4 ++++ .../cuda/cccl/experimental/stf/_adapters/numba_bridge.py | 1 + .../cuda/cccl/experimental/stf/_adapters/torch_bridge.py | 5 ++++- python/cuda_cccl/tests/stf/test_numba.py | 1 + python/cuda_cccl/tests/stf/test_pytorch.py | 2 +- 5 files changed, 11 insertions(+), 2 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py index 2e63ff6f856..6ca687dfcb3 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py @@ -14,10 +14,14 @@ "jit", ] + def has_torch() -> bool: import importlib.util + return importlib.util.find_spec("torch") is not None + def has_numba() -> bool: import importlib.util + return importlib.util.find_spec("numba") is not None diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py index 7e411e91f12..32b160ba879 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py @@ -1,3 +1,4 @@ def cai_to_numba(cai: dict): from numba import cuda + return cuda.from_cuda_array_interface(cai, owner=None, sync=False) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py index 9bf6feea784..eda137fb577 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py @@ -1,5 +1,6 @@ from __future__ import annotations + def cai_to_torch(cai: dict): """ Convert a __cuda_array_interface__ dict to a torch.Tensor @@ -15,6 +16,7 @@ def cai_to_torch(cai: dict): # 1) Numba bridge try: from numba import cuda as _cuda + dev_array = _cuda.from_cuda_array_interface(cai) return torch.utils.dlpack.from_dlpack(dev_array.to_dlpack()) except Exception: @@ -25,7 +27,8 @@ def cai_to_torch(cai: dict): import cupy as cp class _cai_wrapper: - def __init__(self, d): self.__cuda_array_interface__ = d + def __init__(self, d): + self.__cuda_array_interface__ = d cp_arr = 
cp.asarray(_cai_wrapper(cai)) return torch.utils.dlpack.from_dlpack(cp_arr.toDlpack()) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index ed7105386ea..35fb749c68c 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -239,6 +239,7 @@ def test_numba_places(): dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dY, dZ) + if __name__ == "__main__": print("Running CUDASTF examples...") test_numba_graph() diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index f5bf448b430..8c1349b89e5 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -6,8 +6,8 @@ import numba import numpy as np import pytest + torch = pytest.importorskip("torch") -from numba import cuda numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 From 04cc07ac73a750b1b4598594dbb75e1dae08e6fa Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Sun, 31 Aug 2025 19:29:29 +0200 Subject: [PATCH 124/221] dot add_vertex is done in start() now --- .../cuda/experimental/__stf/internal/cuda_kernel_scope.cuh | 6 ------ .../cuda/experimental/__stf/internal/host_launch_scope.cuh | 5 ----- cudax/include/cuda/experimental/__stf/internal/launch.cuh | 5 ----- .../cuda/experimental/__stf/internal/parallel_for_scope.cuh | 5 ----- 4 files changed, 21 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh index a5443643f40..38011a1e844 100644 --- a/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/cuda_kernel_scope.cuh @@ -376,12 +376,6 @@ public: } } - auto& dot = *ctx.get_dot(); - if (dot.is_tracing()) - { - dot.template add_vertex(t); - } - return *this; } diff --git a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh index 9783dc53f38..df02b67f20c 100644 --- a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh @@ -139,11 +139,6 @@ public: t.clear(); }; - if (dot.is_tracing()) - { - dot.template add_vertex(t); - } - auto payload = [&]() { if constexpr (called_from_launch) { diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh index 6c42032a949..b0e34edc253 100644 --- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh @@ -360,11 +360,6 @@ public: nvtx_range nr(t.get_symbol().c_str()); t.start(); - if (dot.is_tracing()) - { - dot.template add_vertex(t); - } - int device; cudaEvent_t start_event, end_event; diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh index 6c66e918015..7bf0a819415 100644 --- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh @@ -583,11 +583,6 @@ public: } } - if (dot.is_tracing()) - { - dot.template add_vertex(t); - } - static constexpr bool need_reduction = (deps_ops_t::does_work || ...); # if __NVCOMPILER From bce25b8f9e220b39afe8803c97e92edf9cacb2a3 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 1 Sep 2025 11:47:04 +0200 Subject: [PATCH 125/221] Start to implement the FDTD example in pytorch --- .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 python/cuda_cccl/tests/stf/test_fdtd_pytorch.py diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py new file mode 100644 index 00000000000..1a0c9483f6b --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -0,0 +1,112 @@ +import math +from typing import Tuple, Optional + +import torch + +def fdtd_3d_pytorch( + size_x: int = 100, + size_y: int = 100, + size_z: int = 100, + timesteps: int = 10, + output_freq: int = 0, + dx: float = 0.01, + dy: float = 0.01, + dz: float = 0.01, + epsilon0: float = 8.85e-12, + mu0: float = 1.256e-6, + device: Optional[torch.device] = None, + dtype: torch.dtype = torch.float64, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + if device is None: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # allocate fields + shape = (size_x, size_y, size_z) + ex = torch.zeros(shape, dtype=dtype, device=device) + ey = torch.zeros_like(ex) + ez = torch.zeros_like(ex) + + hx = torch.zeros_like(ex) + hy = torch.zeros_like(ex) + hz = torch.zeros_like(ex) + + epsilon = torch.full(shape, float(epsilon0), dtype=dtype, device=device) + mu = torch.full(shape, float(mu0), dtype=dtype, device=device) + + # CFL (same formula as example) + dt = 0.25 * min(dx, dy, dz) * math.sqrt(epsilon0 * mu0) + + # Es (interior) = [1..N-2] along all dims -> enables i-1, j-1, k-1 + i_es, j_es, k_es = slice(1, -1), slice(1, -1), slice(1, -1) + i_es_m, j_es_m, k_es_m = slice(0, -2), slice(0, -2), slice(0, -2) + + # Hs (base) = [0..N-2] along all dims -> enables i+1, j+1, k+1 + i_hs, j_hs, k_hs = slice(0, -1), slice(0, -1), slice(0, -1) + i_hs_p, j_hs_p, k_hs_p = slice(1, None), slice(1, None), slice(1, None) + + # source location (single cell at center) + cx, cy, cz = size_x // 2, size_y // 2, size_z // 2 + + def source(t: float, x: float, y: float, z: float) -> float: + # sin(k*x - omega*t) with f = 1e9 Hz + pi = math.pi + freq = 1.0e9 + omega = 2.0 * pi * freq + wavelength = 3.0e8 / freq + k = 2.0 * pi / wavelength + return math.sin(k * x - omega * t) + + for n in range(int(timesteps)): + # ------------------------- + # update electric fields (Es) + # Ex(i,j,k) += (dt/(ε*dx)) * [(Hz(i,j,k)-Hz(i,j-1,k)) - (Hy(i,j,k)-Hy(i,j,k-1))] + ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dx)) * ( + (hz[i_es, j_es, k_es] - hz[i_es, j_es_m, k_es]) + - (hy[i_es, j_es, k_es] - hy[i_es, j_es, k_es_m]) + ) + + # Ey(i,j,k) += (dt/(ε*dy)) * [(Hx(i,j,k)-Hx(i,j,k-1)) - (Hz(i,j,k)-Hz(i-1,j,k))] + ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dy)) * ( + (hx[i_es, j_es, k_es] - hx[i_es, j_es, k_es_m]) + - (hz[i_es, j_es, k_es] - hz[i_es_m, j_es, k_es]) + ) + + # Ez(i,j,k) += (dt/(ε*dz)) * [(Hy(i,j,k)-Hy(i-1,j,k)) - (Hx(i,j,k)-Hx(i,j-1,k))] + ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dz)) * ( + (hy[i_es, j_es, k_es] - hy[i_es_m, j_es, k_es]) + - (hx[i_es, j_es, k_es] - hx[i_es, j_es_m, k_es]) + ) + + # source at center cell + ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) + + # ------------------------- + # update magnetic fields (Hs) + # Hx(i,j,k) -= (dt/(μ*dy)) * 
[(Ez(i,j+1,k)-Ez(i,j,k)) - (Ey(i,j,k+1)-Ey(i,j,k))] + hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dy)) * ( + (ez[i_hs, j_hs_p, k_hs] - ez[i_hs, j_hs, k_hs]) + - (ey[i_hs, j_hs, k_hs_p] - ey[i_hs, j_hs, k_hs]) + ) + + # Hy(i,j,k) -= (dt/(μ*dz)) * [(Ex(i,j,k+1)-Ex(i,j,k)) - (Ez(i+1,j,k)-Ez(i,j,k))] + hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dz)) * ( + (ex[i_hs, j_hs, k_hs_p] - ex[i_hs, j_hs, k_hs]) + - (ez[i_hs_p, j_hs, k_hs] - ez[i_hs, j_hs, k_hs]) + ) + + # Hz(i,j,k) -= (dt/(μ*dx)) * [(Ey(i+1,j,k)-Ey(i,j,k)) - (Ex(i,j+1,k)-Ex(i,j,k))] + hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dx)) * ( + (ey[i_hs_p, j_hs, k_hs] - ey[i_hs, j_hs, k_hs]) + - (ex[i_hs, j_hs_p, k_hs] - ex[i_hs, j_hs, k_hs]) + ) + + if output_freq > 0 and (n % output_freq) == 0: + print(f"{n}\t{ez[cx, cy, cz].item():.6e}") + + return ex, ey, ez, hx, hy, hz + + +if __name__ == "__main__": + # quick check + ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=1000, output_freq=5) + print("done; Ez(center) =", ez[50, 50, 50].item()) From d9c5f1194e3d0f35d1871e94c0dc2654d711841f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 1 Sep 2025 12:02:48 +0200 Subject: [PATCH 126/221] Start to port in STF version of pytorch --- .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 106 ++++++++++++------ 1 file changed, 69 insertions(+), 37 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 1a0c9483f6b..f8cc500a026 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -1,6 +1,11 @@ import math from typing import Tuple, Optional +from cuda.cccl.experimental.stf._stf_bindings import ( + context, + rw, +) + import torch def fdtd_3d_pytorch( @@ -17,21 +22,32 @@ def fdtd_3d_pytorch( device: Optional[torch.device] = None, dtype: torch.dtype = torch.float64, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - if device is None: - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + ctx = context() # allocate fields shape = (size_x, size_y, size_z) - ex = torch.zeros(shape, dtype=dtype, device=device) - ey = torch.zeros_like(ex) - ez = torch.zeros_like(ex) + ex_ = torch.zeros(shape, dtype=dtype, device=device) + ey_ = torch.zeros_like(ex_) + ez_ = torch.zeros_like(ex_) + + hx_ = torch.zeros_like(ex_) + hy_ = torch.zeros_like(ex_) + hz_ = torch.zeros_like(ex_) - hx = torch.zeros_like(ex) - hy = torch.zeros_like(ex) - hz = torch.zeros_like(ex) + epsilon_ = torch.full(shape, float(epsilon0), dtype=dtype, device=device) + mu_ = torch.full(shape, float(mu0), dtype=dtype, device=device) - epsilon = torch.full(shape, float(epsilon0), dtype=dtype, device=device) - mu = torch.full(shape, float(mu0), dtype=dtype, device=device) + lex = ctx.logical_data(ex_) + ley = ctx.logical_data(ey_) + lez = ctx.logical_data(ez_) + + lhx = ctx.logical_data(hx_) + lhy = ctx.logical_data(hy_) + lhz = ctx.logical_data(hz_) + + lepsilon = ctx.logical_data(epsilon_) + lmu = ctx.logical_data(mu_) # CFL (same formula as example) dt = 0.25 * min(dx, dy, dz) * math.sqrt(epsilon0 * mu0) @@ -60,48 +76,64 @@ def source(t: float, x: float, y: float, z: float) -> float: # ------------------------- # update electric fields (Es) # Ex(i,j,k) += (dt/(ε*dx)) * [(Hz(i,j,k)-Hz(i,j-1,k)) - (Hy(i,j,k)-Hy(i,j,k-1))] - ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + (dt / 
(epsilon[i_es, j_es, k_es] * dx)) * ( - (hz[i_es, j_es, k_es] - hz[i_es, j_es_m, k_es]) - - (hy[i_es, j_es, k_es] - hy[i_es, j_es, k_es_m]) - ) + with ctx.task(lex.rw(), lhy.read(), lhz.read(), lepsilon.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + ex, hy, hz, epsilon = t.tensor_arguments() + ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dx)) * ( + (hz[i_es, j_es, k_es] - hz[i_es, j_es_m, k_es]) + - (hy[i_es, j_es, k_es] - hy[i_es, j_es, k_es_m]) + ) # Ey(i,j,k) += (dt/(ε*dy)) * [(Hx(i,j,k)-Hx(i,j,k-1)) - (Hz(i,j,k)-Hz(i-1,j,k))] - ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dy)) * ( - (hx[i_es, j_es, k_es] - hx[i_es, j_es, k_es_m]) - - (hz[i_es, j_es, k_es] - hz[i_es_m, j_es, k_es]) - ) + with ctx.task(ley.rw(), lhx.read(), lhz.read(), lepsilon.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + ey, hx, hz, epsilon = t.tensor_arguments() + ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dy)) * ( + (hx[i_es, j_es, k_es] - hx[i_es, j_es, k_es_m]) + - (hz[i_es, j_es, k_es] - hz[i_es_m, j_es, k_es]) + ) # Ez(i,j,k) += (dt/(ε*dz)) * [(Hy(i,j,k)-Hy(i-1,j,k)) - (Hx(i,j,k)-Hx(i,j-1,k))] - ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dz)) * ( - (hy[i_es, j_es, k_es] - hy[i_es_m, j_es, k_es]) - - (hx[i_es, j_es, k_es] - hx[i_es, j_es_m, k_es]) - ) + with ctx.task(lez.rw(), lhx.read(), lhy.read(), lepsilon.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + ez, hx, hy, epsilon = t.tensor_arguments() + ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dz)) * ( + (hy[i_es, j_es, k_es] - hy[i_es_m, j_es, k_es]) + - (hx[i_es, j_es, k_es] - hx[i_es, j_es_m, k_es]) + ) # source at center cell - ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) + with ctx.task(lez.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + ez = t.tensor_arguments() + ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) # ------------------------- # update magnetic fields (Hs) # Hx(i,j,k) -= (dt/(μ*dy)) * [(Ez(i,j+1,k)-Ez(i,j,k)) - (Ey(i,j,k+1)-Ey(i,j,k))] - hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dy)) * ( - (ez[i_hs, j_hs_p, k_hs] - ez[i_hs, j_hs, k_hs]) - - (ey[i_hs, j_hs, k_hs_p] - ey[i_hs, j_hs, k_hs]) - ) + with ctx.task(lhx.rw(), ley.read(), lez.read(), lmu.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + hx, ey, ez, mu = t.tensor_arguments() + hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dy)) * ( + (ez[i_hs, j_hs_p, k_hs] - ez[i_hs, j_hs, k_hs]) + - (ey[i_hs, j_hs, k_hs_p] - ey[i_hs, j_hs, k_hs]) + ) # Hy(i,j,k) -= (dt/(μ*dz)) * [(Ex(i,j,k+1)-Ex(i,j,k)) - (Ez(i+1,j,k)-Ez(i,j,k))] - hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dz)) * ( - (ex[i_hs, j_hs, k_hs_p] - ex[i_hs, j_hs, k_hs]) - - (ez[i_hs_p, j_hs, k_hs] - ez[i_hs, j_hs, k_hs]) - ) + with ctx.task(lhy.rw(), lex.read(), lez.read(), lmu.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + hy, ex, ez, mu = t.tensor_arguments() + hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dz)) * ( + (ex[i_hs, j_hs, k_hs_p] - ex[i_hs, j_hs, k_hs]) + - (ez[i_hs_p, j_hs, k_hs] - ez[i_hs, j_hs, k_hs]) + ) # Hz(i,j,k) -= (dt/(μ*dx)) * [(Ey(i+1,j,k)-Ey(i,j,k)) - (Ex(i,j+1,k)-Ex(i,j,k))] - 
hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dx)) * ( - (ey[i_hs_p, j_hs, k_hs] - ey[i_hs, j_hs, k_hs]) - - (ex[i_hs, j_hs_p, k_hs] - ex[i_hs, j_hs, k_hs]) - ) + with ctx.task(lhz.rw(), lex.read(), ley.read(), lmu.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + hz, ex, ey, mu = t.tensor_arguments() + hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dx)) * ( + (ey[i_hs_p, j_hs, k_hs] - ey[i_hs, j_hs, k_hs]) + - (ex[i_hs, j_hs_p, k_hs] - ex[i_hs, j_hs, k_hs]) + ) + +#  if output_freq > 0 and (n % output_freq) == 0: +#  print(f"{n}\t{ez[cx, cy, cz].item():.6e}") - if output_freq > 0 and (n % output_freq) == 0: - print(f"{n}\t{ez[cx, cy, cz].item():.6e}") + ctx.finalize() return ex, ey, ez, hx, hy, hz From 70fa5d8c001b51e0d6de180d22ffc9b3fa8d9c3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 1 Sep 2025 13:54:51 +0200 Subject: [PATCH 127/221] Adapt the FDTD example to use STF constructs and add methods to initialize a logical data by its shape --- .../experimental/stf/_stf_bindings_impl.pyx | 21 +++++- .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 67 +++++++++++-------- 2 files changed, 59 insertions(+), 29 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index d4cda1107c4..3b4ba4db39b 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -12,6 +12,7 @@ from cpython.bytes cimport PyBytes_FromStringAndSize from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t from libc.stdint cimport uintptr_t from libc.string cimport memset, memcpy +import math # for math.prod # TODO remove that dependency import numpy as np @@ -168,7 +169,7 @@ cdef class logical_data: cdef int _ndim cdef size_t _len - def __cinit__(self, context ctx=None, object buf=None): + def __cinit__(self, context ctx=None, object buf=None, shape=None, dtype=None): if ctx is None or buf is None: # allow creation via __new__ (eg. 
in like_empty) self._ld = NULL @@ -242,6 +243,21 @@ cdef class logical_data: return out + @staticmethod + def init_by_shape(context ctx, shape, dtype): + """ + Create a new logical_data from a shape and a dtype + """ + cdef logical_data out = logical_data.__new__(logical_data) + out._ctx = ctx._ctx + out._dtype = np.dtype(dtype) + out._shape = shape + out._ndim = len(shape) + out._len = math.prod(shape) * out._dtype.itemsize + stf_logical_data_empty(ctx._ctx, out._len, &out._ld) + + return out + def borrow_ctx_handle(self): ctx = context(borrowed=True) ctx.borrow_from_handle(self._ctx) @@ -499,6 +515,9 @@ cdef class context: """ return logical_data(self, buf) + def logical_data_by_shape(self, shape, dtype): + return logical_data.init_by_shape(self, shape, dtype) + def task(self, *args): """ Create a `task` diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index f8cc500a026..fa7d49d976b 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -1,5 +1,6 @@ import math from typing import Tuple, Optional +import numpy as np from cuda.cccl.experimental.stf._stf_bindings import ( context, @@ -8,6 +9,11 @@ import torch +def init_field(ctx, ld, value): + with ctx.task(ld.write()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + field = t.get_arg_as_tensor(0) + field[:,:,:] = value + def fdtd_3d_pytorch( size_x: int = 100, size_y: int = 100, @@ -27,28 +33,33 @@ def fdtd_3d_pytorch( # allocate fields shape = (size_x, size_y, size_z) - ex_ = torch.zeros(shape, dtype=dtype, device=device) - ey_ = torch.zeros_like(ex_) - ez_ = torch.zeros_like(ex_) - - hx_ = torch.zeros_like(ex_) - hy_ = torch.zeros_like(ex_) - hz_ = torch.zeros_like(ex_) - - epsilon_ = torch.full(shape, float(epsilon0), dtype=dtype, device=device) - mu_ = torch.full(shape, float(mu0), dtype=dtype, device=device) - - lex = ctx.logical_data(ex_) - ley = ctx.logical_data(ey_) - lez = ctx.logical_data(ez_) - - lhx = ctx.logical_data(hx_) - lhy = ctx.logical_data(hy_) - lhz = ctx.logical_data(hz_) - - lepsilon = ctx.logical_data(epsilon_) - lmu = ctx.logical_data(mu_) - +# ex_ = torch.zeros(shape, dtype=dtype, device=device) + lex = ctx.logical_data_by_shape(shape, np.float64) + ley = ctx.logical_data_by_shape(shape, np.float64) + lez = ctx.logical_data_by_shape(shape, np.float64) + + # epsilon_ = torch.full(shape, float(epsilon0), np.float64=np.float64, device=device) + # mu_ = torch.full(shape, float(mu0), np.float64=np.float64, device=device) + + lhx = ctx.logical_data_by_shape(shape, np.float64) + lhy = ctx.logical_data_by_shape(shape, np.float64) + lhz = ctx.logical_data_by_shape(shape, np.float64) + + # lepsilon = ctx.logical_data() + # lmu = ctx.logical_data(mu_) + lepsilon = ctx.logical_data_by_shape(shape, np.float64) + lmu = ctx.logical_data_by_shape(shape, np.float64) + + # TODO ctx.full(...) 
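    # Sketch for the TODO above: a ctx.full(...)-style helper could combine the
    # two building blocks introduced in this patch (logical_data_by_shape and
    # init_field); the helper name is hypothetical and not part of the bindings.
    #
    #     def full_logical_data(ctx, shape, dtype, value):
    #         ld = ctx.logical_data_by_shape(shape, dtype)
    #         init_field(ctx, ld, value)
    #         return ld
    #
    #     lepsilon = full_logical_data(ctx, shape, np.float64, epsilon0)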
+ init_field(ctx, lex, float(0.0)) + init_field(ctx, ley, float(0.0)) + init_field(ctx, lez, float(0.0)) + init_field(ctx, lhx, float(0.0)) + init_field(ctx, lhy, float(0.0)) + init_field(ctx, lhz, float(0.0)) + init_field(ctx, lepsilon, float(epsilon0)) + init_field(ctx, lmu, float(mu0)) + # CFL (same formula as example) dt = 0.25 * min(dx, dy, dz) * math.sqrt(epsilon0 * mu0) @@ -99,10 +110,10 @@ def source(t: float, x: float, y: float, z: float) -> float: - (hx[i_es, j_es, k_es] - hx[i_es, j_es_m, k_es]) ) - # source at center cell - with ctx.task(lez.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): - ez = t.tensor_arguments() - ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) +  # source at center cell +  with ctx.task(lez.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): +  ez = t.get_arg_as_tensor(0) +  ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) # ------------------------- # update magnetic fields (Hs) @@ -140,5 +151,5 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # quick check - ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=1000, output_freq=5) - print("done; Ez(center) =", ez[50, 50, 50].item()) + ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=20, output_freq=5) + #  print("done; Ez(center) =", ez[50, 50, 50].item()) From 5587a8d256f39324b49aab6317062f85de93b0b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 1 Sep 2025 14:05:04 +0200 Subject: [PATCH 128/221] format issue --- python/cuda_cccl/tests/stf/test_fdtd_pytorch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index fa7d49d976b..2434e60abf1 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -38,8 +38,8 @@ def fdtd_3d_pytorch( ley = ctx.logical_data_by_shape(shape, np.float64) lez = ctx.logical_data_by_shape(shape, np.float64) - # epsilon_ = torch.full(shape, float(epsilon0), np.float64=np.float64, device=device) - # mu_ = torch.full(shape, float(mu0), np.float64=np.float64, device=device) + # epsilon_ = torch.full(shape, float(epsilon0), dtype=np.float64, device=device) + # mu_ = torch.full(shape, float(mu0), dtype=np.float64, device=device) lhx = ctx.logical_data_by_shape(shape, np.float64) lhy = ctx.logical_data_by_shape(shape, np.float64) @@ -59,7 +59,7 @@ def fdtd_3d_pytorch( init_field(ctx, lhz, float(0.0)) init_field(ctx, lepsilon, float(epsilon0)) init_field(ctx, lmu, float(mu0)) - + # CFL (same formula as example) dt = 0.25 * min(dx, dy, dz) * math.sqrt(epsilon0 * mu0) From 5ea524360b8c146bc496034d387154dc75d9baca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 1 Sep 2025 14:08:51 +0200 Subject: [PATCH 129/221] charset issue --- python/cuda_cccl/tests/stf/test_fdtd_pytorch.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 2434e60abf1..2049c8ada49 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -110,10 +110,10 @@ def source(t: float, x: float, y: float, z: float) -> float: - (hx[i_es, j_es, k_es] - hx[i_es, j_es_m, k_es]) ) -  # source at center cell -  with ctx.task(lez.rw()) as t, 
torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): -  ez = t.get_arg_as_tensor(0) -  ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) + # source at center cell + with ctx.task(lez.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + ez = t.get_arg_as_tensor(0) + ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) # ------------------------- # update magnetic fields (Hs) From f7fbd346e412f86a5597a30a6776b99edb1b6dc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 1 Sep 2025 14:30:23 +0200 Subject: [PATCH 130/221] rank agnostic method to init --- python/cuda_cccl/tests/stf/test_fdtd_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 2049c8ada49..bd6ee8a9c30 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -12,7 +12,7 @@ def init_field(ctx, ld, value): with ctx.task(ld.write()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): field = t.get_arg_as_tensor(0) - field[:,:,:] = value + field.fill_(value) def fdtd_3d_pytorch( size_x: int = 100, From aec2d711c2dac0c5e0c34258aa798fd230f6e486 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 1 Sep 2025 14:34:40 +0200 Subject: [PATCH 131/221] use .zero_() to blank fields --- python/cuda_cccl/tests/stf/test_fdtd_pytorch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index bd6ee8a9c30..86712c4754a 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -12,7 +12,10 @@ def init_field(ctx, ld, value): with ctx.task(ld.write()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): field = t.get_arg_as_tensor(0) - field.fill_(value) + if value == 0: + field.zero_() + else: + field.fill_(value) def fdtd_3d_pytorch( size_x: int = 100, From eb71880f77ae294217b13f5e112e2d4d19b44cdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 1 Sep 2025 14:41:54 +0200 Subject: [PATCH 132/221] print values --- python/cuda_cccl/tests/stf/test_fdtd_pytorch.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 86712c4754a..788cb699140 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -144,8 +144,11 @@ def source(t: float, x: float, y: float, z: float) -> float: - (ex[i_hs, j_hs_p, k_hs] - ex[i_hs, j_hs, k_hs]) ) -#  if output_freq > 0 and (n % output_freq) == 0: -#  print(f"{n}\t{ez[cx, cy, cz].item():.6e}") + if output_freq > 0 and (n % output_freq) == 0: + with ctx.task(lez.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + ez = t.get_arg_as_tensor(0) + print(f"{n}\t{ez[cx, cy, cz].item():.6e}") + pass ctx.finalize() @@ -154,5 +157,5 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # quick check - ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=20, output_freq=5) + ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=200, output_freq=5) #  print("done; Ez(center) =", ez[50, 50, 50].item()) From aaf6ec6bba1c7c5ddb8388145772986efe966f56 Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 1 Sep 2025 15:38:07 +0200 Subject: [PATCH 133/221] Experiment to display output as an image --- .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 46 +++++++++++++++++-- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 788cb699140..59f1583cd07 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -1,6 +1,9 @@ import math from typing import Tuple, Optional import numpy as np +import matplotlib.pyplot as plt +from matplotlib.colors import SymLogNorm, LogNorm +from typing import Literal, Optional from cuda.cccl.experimental.stf._stf_bindings import ( context, @@ -9,6 +12,38 @@ import torch +Plane = Literal["xy", "xz", "yz"] + +def show_slice(t3d, plane="xy", index=None): + # grab a 2D view + if plane == "xy": + idx = t3d.shape[2] // 2 if index is None else index + slice2d = t3d[:, :, idx] + elif plane == "xz": + idx = t3d.shape[1] // 2 if index is None else index + slice2d = t3d[:, idx, :] + elif plane == "yz": + idx = t3d.shape[0] // 2 if index is None else index + slice2d = t3d[idx, :, :] + else: + raise ValueError("plane must be 'xy', 'xz' or 'yz'") + + # move to cpu numpy array + arr = slice2d.detach().cpu().numpy() + + # imshow = "imshow" not "imread" + plt.imshow( + arr, + origin="lower", + cmap="seismic", + vmin=-1e-2, vmax=1e-2 +# norm=SymLogNorm(linthresh=1e-8, vmin=-1e-0, vmax=1e-0) +# norm=LogNorm(vmin=1e-12, vmax=1e-6) + ) + # plt.colorbar() + plt.show(block=False) + plt.pause(0.01) + def init_field(ctx, ld, value): with ctx.task(ld.write()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): field = t.get_arg_as_tensor(0) @@ -18,9 +53,9 @@ def init_field(ctx, ld, value): field.fill_(value) def fdtd_3d_pytorch( - size_x: int = 100, - size_y: int = 100, - size_z: int = 100, + size_x: int = 150, + size_y: int = 150, + size_z: int = 150, timesteps: int = 10, output_freq: int = 0, dx: float = 0.01, @@ -75,7 +110,7 @@ def fdtd_3d_pytorch( i_hs_p, j_hs_p, k_hs_p = slice(1, None), slice(1, None), slice(1, None) # source location (single cell at center) - cx, cy, cz = size_x // 2, size_y // 2, size_z // 2 + cx, cy, cz = size_x // 2, size_y // 10, size_z // 2 def source(t: float, x: float, y: float, z: float) -> float: # sin(k*x - omega*t) with f = 1e9 Hz @@ -148,6 +183,7 @@ def source(t: float, x: float, y: float, z: float) -> float: with ctx.task(lez.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): ez = t.get_arg_as_tensor(0) print(f"{n}\t{ez[cx, cy, cz].item():.6e}") + show_slice(ez, plane="xy") pass ctx.finalize() @@ -157,5 +193,5 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # quick check - ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=200, output_freq=5) + ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=1000, output_freq=5) #  print("done; Ez(center) =", ez[50, 50, 50].item()) From ae4c6d6c77930bd3ea9ee6912b09bfbc76ebb6b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 09:21:02 +0200 Subject: [PATCH 134/221] Use non blocking API --- .../cuda/cccl/experimental/stf/_adapters/torch_bridge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py index 
eda137fb577..945597fb360 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py @@ -17,7 +17,7 @@ def cai_to_torch(cai: dict): try: from numba import cuda as _cuda - dev_array = _cuda.from_cuda_array_interface(cai) + dev_array = _cuda.from_cuda_array_interface(cai, owner=None, sync=False) return torch.utils.dlpack.from_dlpack(dev_array.to_dlpack()) except Exception: pass From 9029fda7684f639d60ba323265fa8e191ba78569 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 13:07:29 +0200 Subject: [PATCH 135/221] remove dead code --- c/experimental/stf/CMakeLists.txt | 6 ------ 1 file changed, 6 deletions(-) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index f151e8bf766..11599edaec7 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -36,12 +36,6 @@ target_link_libraries(cccl.c.experimental.stf PRIVATE CUDA::cuda_driver CCCL::cudax ) -# target_compile_definitions(cccl.c.experimental.stf PUBLIC CCCL_C_EXPERIMENTAL=1) -# target_compile_definitions(cccl.c.experimental.stf PRIVATE -# NVRTC_GET_TYPE_NAME=1 -# CUB_DISABLE_CDP=1 -# CUB_DEFINE_RUNTIME_POLICIES -# ) target_compile_options(cccl.c.experimental.stf PRIVATE $<$:--expt-relaxed-constexpr>) target_compile_options(cccl.c.experimental.stf PRIVATE $<$:--extended-lambda>) From ce7a33bdf3ddba5ddd4a6b8698a6832eb08649ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 13:08:18 +0200 Subject: [PATCH 136/221] remove dead code --- c/experimental/stf/src/stf.cu | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index d2abedc66d6..cec00fbca6f 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -273,27 +273,6 @@ void stf_cuda_kernel_start(stf_cuda_kernel_handle k) k->k.start(); } -#if 0 -// -// template -// void configure_raw(Fun func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, int arg_cnt, const void** args) -void stf_cuda_kernel_add_desc(stf_cuda_kernel_handle k, const void *func, dim3 gridDim_, dim3 blockDim_, size_t sharedMem_, int arg_cnt, const void** args) -{ - /* We convert the function to a CUfunction because this code is a shared - * library which cannot launch kernels using cudaLaunchKernel directly, or we - * will get invalid device function. 
*/ - //CUfunction cufunc; - //cudaGetFuncBySymbol(&cufunc, (void *)func); - CUkernel cukernel; - cudaGetKernel(&cukernel, (void *)func); - - cuda_kernel_desc desc; - desc.configure_raw(cukernel, gridDim_, blockDim_, sharedMem_, arg_cnt, args); - - k->k.add_kernel_desc(mv(desc)); -} -#endif - void stf_cuda_kernel_add_desc_cufunc( stf_cuda_kernel_handle k, CUfunction cufunc, From cbde742266e2544a787d76104356fa5bdc405a1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 13:24:39 +0200 Subject: [PATCH 137/221] minor cleanup --- c/experimental/stf/src/stf.cu | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index cec00fbca6f..0a92d86b677 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -80,18 +80,6 @@ void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_ *to = new stf_logical_data_handle_t{ld_typed}; } -// void stf_logical_data_like_empty(stf_ctx_handle ctx, const stf_logical_data_handle from, stf_logical_data_handle* to) -// { -// assert(ctx); -// assert(from); -// assert(to); -// -// auto ld_typed = ctx->ctx.logical_data(from->ld.shape()); -// -// // Stored in its untyped version -// *to = new stf_logical_data_handle_t{ld_typed}; -// } - void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) { assert(ctx); @@ -305,4 +293,5 @@ void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t) assert(t); delete t; } -} + +} // extern "C" From c91e814304f9b063acc229d1fd6e8e5cf04a0e33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 13:58:39 +0200 Subject: [PATCH 138/221] clang-format --- .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 111 ++++++++++++------ 1 file changed, 76 insertions(+), 35 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 59f1583cd07..ccac389d6a6 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -1,19 +1,17 @@ import math -from typing import Tuple, Optional -import numpy as np +from typing import Literal, Optional, Tuple + import matplotlib.pyplot as plt -from matplotlib.colors import SymLogNorm, LogNorm -from typing import Literal, Optional +import numpy as np +import torch from cuda.cccl.experimental.stf._stf_bindings import ( context, - rw, ) -import torch - Plane = Literal["xy", "xz", "yz"] + def show_slice(t3d, plane="xy", index=None): # grab a 2D view if plane == "xy": @@ -36,22 +34,28 @@ def show_slice(t3d, plane="xy", index=None): arr, origin="lower", cmap="seismic", - vmin=-1e-2, vmax=1e-2 -# norm=SymLogNorm(linthresh=1e-8, vmin=-1e-0, vmax=1e-0) -# norm=LogNorm(vmin=1e-12, vmax=1e-6) + vmin=-1e-2, + vmax=1e-2, + # norm=SymLogNorm(linthresh=1e-8, vmin=-1e-0, vmax=1e-0) + # norm=LogNorm(vmin=1e-12, vmax=1e-6) ) - # plt.colorbar() + # plt.colorbar() plt.show(block=False) plt.pause(0.01) + def init_field(ctx, ld, value): - with ctx.task(ld.write()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(ld.write()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): field = t.get_arg_as_tensor(0) if value == 0: field.zero_() else: field.fill_(value) + def fdtd_3d_pytorch( size_x: int = 150, size_y: int = 150, @@ -65,30 +69,31 @@ def fdtd_3d_pytorch( mu0: float = 1.256e-6, device: Optional[torch.device] = None, dtype: torch.dtype = torch.float64, 
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - +) -> Tuple[ + torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor +]: ctx = context() # allocate fields shape = (size_x, size_y, size_z) -# ex_ = torch.zeros(shape, dtype=dtype, device=device) + # ex_ = torch.zeros(shape, dtype=dtype, device=device) lex = ctx.logical_data_by_shape(shape, np.float64) ley = ctx.logical_data_by_shape(shape, np.float64) lez = ctx.logical_data_by_shape(shape, np.float64) - # epsilon_ = torch.full(shape, float(epsilon0), dtype=np.float64, device=device) - # mu_ = torch.full(shape, float(mu0), dtype=np.float64, device=device) + # epsilon_ = torch.full(shape, float(epsilon0), dtype=np.float64, device=device) + # mu_ = torch.full(shape, float(mu0), dtype=np.float64, device=device) lhx = ctx.logical_data_by_shape(shape, np.float64) lhy = ctx.logical_data_by_shape(shape, np.float64) lhz = ctx.logical_data_by_shape(shape, np.float64) - # lepsilon = ctx.logical_data() - # lmu = ctx.logical_data(mu_) + # lepsilon = ctx.logical_data() + # lmu = ctx.logical_data(mu_) lepsilon = ctx.logical_data_by_shape(shape, np.float64) lmu = ctx.logical_data_by_shape(shape, np.float64) - # TODO ctx.full(...) + # TODO ctx.full(...) init_field(ctx, lex, float(0.0)) init_field(ctx, ley, float(0.0)) init_field(ctx, lez, float(0.0)) @@ -125,62 +130,98 @@ def source(t: float, x: float, y: float, z: float) -> float: # ------------------------- # update electric fields (Es) # Ex(i,j,k) += (dt/(ε*dx)) * [(Hz(i,j,k)-Hz(i,j-1,k)) - (Hy(i,j,k)-Hy(i,j,k-1))] - with ctx.task(lex.rw(), lhy.read(), lhz.read(), lepsilon.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(lex.rw(), lhy.read(), lhz.read(), lepsilon.read()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): ex, hy, hz, epsilon = t.tensor_arguments() - ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dx)) * ( + ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + ( + dt / (epsilon[i_es, j_es, k_es] * dx) + ) * ( (hz[i_es, j_es, k_es] - hz[i_es, j_es_m, k_es]) - (hy[i_es, j_es, k_es] - hy[i_es, j_es, k_es_m]) ) # Ey(i,j,k) += (dt/(ε*dy)) * [(Hx(i,j,k)-Hx(i,j,k-1)) - (Hz(i,j,k)-Hz(i-1,j,k))] - with ctx.task(ley.rw(), lhx.read(), lhz.read(), lepsilon.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(ley.rw(), lhx.read(), lhz.read(), lepsilon.read()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): ey, hx, hz, epsilon = t.tensor_arguments() - ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dy)) * ( + ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + ( + dt / (epsilon[i_es, j_es, k_es] * dy) + ) * ( (hx[i_es, j_es, k_es] - hx[i_es, j_es, k_es_m]) - (hz[i_es, j_es, k_es] - hz[i_es_m, j_es, k_es]) ) # Ez(i,j,k) += (dt/(ε*dz)) * [(Hy(i,j,k)-Hy(i-1,j,k)) - (Hx(i,j,k)-Hx(i,j-1,k))] - with ctx.task(lez.rw(), lhx.read(), lhy.read(), lepsilon.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(lez.rw(), lhx.read(), lhy.read(), lepsilon.read()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): ez, hx, hy, epsilon = t.tensor_arguments() - ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + (dt / (epsilon[i_es, j_es, k_es] * dz)) * ( + ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + ( + dt / (epsilon[i_es, j_es, k_es] * dz) + ) * ( (hy[i_es, j_es, k_es] - hy[i_es_m, j_es, 
k_es]) - (hx[i_es, j_es, k_es] - hx[i_es, j_es_m, k_es]) ) # source at center cell - with ctx.task(lez.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(lez.rw()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): ez = t.get_arg_as_tensor(0) ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) # ------------------------- # update magnetic fields (Hs) # Hx(i,j,k) -= (dt/(μ*dy)) * [(Ez(i,j+1,k)-Ez(i,j,k)) - (Ey(i,j,k+1)-Ey(i,j,k))] - with ctx.task(lhx.rw(), ley.read(), lez.read(), lmu.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(lhx.rw(), ley.read(), lez.read(), lmu.read()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): hx, ey, ez, mu = t.tensor_arguments() - hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dy)) * ( + hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - ( + dt / (mu[i_hs, j_hs, k_hs] * dy) + ) * ( (ez[i_hs, j_hs_p, k_hs] - ez[i_hs, j_hs, k_hs]) - (ey[i_hs, j_hs, k_hs_p] - ey[i_hs, j_hs, k_hs]) ) # Hy(i,j,k) -= (dt/(μ*dz)) * [(Ex(i,j,k+1)-Ex(i,j,k)) - (Ez(i+1,j,k)-Ez(i,j,k))] - with ctx.task(lhy.rw(), lex.read(), lez.read(), lmu.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(lhy.rw(), lex.read(), lez.read(), lmu.read()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): hy, ex, ez, mu = t.tensor_arguments() - hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dz)) * ( + hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - ( + dt / (mu[i_hs, j_hs, k_hs] * dz) + ) * ( (ex[i_hs, j_hs, k_hs_p] - ex[i_hs, j_hs, k_hs]) - (ez[i_hs_p, j_hs, k_hs] - ez[i_hs, j_hs, k_hs]) ) # Hz(i,j,k) -= (dt/(μ*dx)) * [(Ey(i+1,j,k)-Ey(i,j,k)) - (Ex(i,j+1,k)-Ex(i,j,k))] - with ctx.task(lhz.rw(), lex.read(), ley.read(), lmu.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(lhz.rw(), lex.read(), ley.read(), lmu.read()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): hz, ex, ey, mu = t.tensor_arguments() - hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - (dt / (mu[i_hs, j_hs, k_hs] * dx)) * ( + hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - ( + dt / (mu[i_hs, j_hs, k_hs] * dx) + ) * ( (ey[i_hs_p, j_hs, k_hs] - ey[i_hs, j_hs, k_hs]) - (ex[i_hs, j_hs_p, k_hs] - ex[i_hs, j_hs, k_hs]) ) if output_freq > 0 and (n % output_freq) == 0: - with ctx.task(lez.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())): + with ( + ctx.task(lez.read()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): ez = t.get_arg_as_tensor(0) print(f"{n}\t{ez[cx, cy, cz].item():.6e}") show_slice(ez, plane="xy") @@ -194,4 +235,4 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # quick check ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=1000, output_freq=5) - #  print("done; Ez(center) =", ez[50, 50, 50].item()) +# print("done; Ez(center) =", ez[50, 50, 50].item()) From 3fe6178b90d39853cd26cfd910aa54fe8de2eccc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 15:12:06 +0200 Subject: [PATCH 139/221] Add a C library for CUDASTF (to be used in the python bindings) --- c/CMakeLists.txt | 1 + c/experimental/stf/CMakeLists.txt | 52 +++ .../stf/include/cccl/c/experimental/stf/stf.h | 224 +++++++++++++ c/experimental/stf/src/stf.cu | 297 ++++++++++++++++++ 
c/experimental/stf/test/CMakeLists.txt | 39 +++ c/experimental/stf/test/test_ctx.cpp | 21 ++ c/experimental/stf/test/test_cuda_kernel.cu | 90 ++++++ c/experimental/stf/test/test_logical_data.cpp | 39 +++ c/experimental/stf/test/test_places.cpp | 81 +++++ c/experimental/stf/test/test_task.cpp | 78 +++++ c/experimental/stf/test/test_token.cpp | 78 +++++ 11 files changed, 1000 insertions(+) create mode 100644 c/experimental/stf/CMakeLists.txt create mode 100644 c/experimental/stf/include/cccl/c/experimental/stf/stf.h create mode 100644 c/experimental/stf/src/stf.cu create mode 100644 c/experimental/stf/test/CMakeLists.txt create mode 100644 c/experimental/stf/test/test_ctx.cpp create mode 100644 c/experimental/stf/test/test_cuda_kernel.cu create mode 100644 c/experimental/stf/test/test_logical_data.cpp create mode 100644 c/experimental/stf/test/test_places.cpp create mode 100644 c/experimental/stf/test/test_task.cpp create mode 100644 c/experimental/stf/test/test_token.cpp diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index 7f1dbf4507b..364494da7a0 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -1 +1,2 @@ add_subdirectory(parallel) +add_subdirectory(experimental/stf/) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt new file mode 100644 index 00000000000..11599edaec7 --- /dev/null +++ b/c/experimental/stf/CMakeLists.txt @@ -0,0 +1,52 @@ +cmake_minimum_required(VERSION 3.21) + +project(CCCL_C_EXPERIMENTAL_STF LANGUAGES CUDA CXX C) + +option(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING "Build cccl.experimental.c.stf tests." OFF) + +# FIXME Ideally this would be handled by presets and install rules, but for now +# consumers may override this to control the target location of cccl.c.experimental.stf. +set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY "" CACHE PATH "Override output directory for the cccl.c.experimental.stf library") +mark_as_advanced(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY) + +file(GLOB_RECURSE srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + "src/*.cu" "src/*.cuh" +) + +add_library(cccl.c.experimental.stf SHARED ${srcs}) +set_property(TARGET cccl.c.experimental.stf PROPERTY POSITION_INDEPENDENT_CODE ON) +cccl_configure_target(cccl.c.experimental.stf DIALECT 17) + +# Override the properties set by cccl_configure_target: +if (CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY) + set_target_properties(cccl.c.experimental.stf PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY}" + ARCHIVE_OUTPUT_DIRECTORY "${CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY}" + ) +endif() + +find_package(CUDAToolkit REQUIRED) +set_target_properties(cccl.c.experimental.stf PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) +target_link_libraries(cccl.c.experimental.stf PRIVATE + CUDA::cudart_static + CUDA::nvrtc + CUDA::nvJitLink + CUDA::cuda_driver + CCCL::cudax +) + +target_compile_options(cccl.c.experimental.stf PRIVATE $<$:--expt-relaxed-constexpr>) +target_compile_options(cccl.c.experimental.stf PRIVATE $<$:--extended-lambda>) + +target_include_directories(cccl.c.experimental.stf PUBLIC "include") +target_include_directories(cccl.c.experimental.stf PRIVATE "src") + +if (CCCL_C_Parallel_ENABLE_TESTING) + add_subdirectory(test) +endif() + +# if (CCCL_C_Parallel_ENABLE_HEADER_TESTING) +# include(cmake/CParallelHeaderTesting.cmake) +# endif() diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h new file mode 100644 index 
00000000000..6f2f903e6c8 --- /dev/null +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -0,0 +1,224 @@ +#include +#include +#include + +// TODO use CCCL_C_EXTERN_C_BEGIN/CCCL_C_EXTERN_C_END +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum stf_access_mode +{ + STF_NONE = 0, + STF_READ = 1 << 0, + STF_WRITE = 1 << 1, + STF_RW = STF_READ | STF_WRITE +} stf_access_mode; + +struct stf_exec_place_device +{ + int dev_id; +}; + +struct stf_exec_place_host +{ + char dummy; /* dummy to keep it standard C which does not allow empty structs */ +}; + +typedef enum stf_exec_place_kind +{ + STF_EXEC_PLACE_DEVICE, + STF_EXEC_PLACE_HOST +} stf_exec_place_kind; + +struct stf_exec_place +{ + enum stf_exec_place_kind kind; + union + { + struct stf_exec_place_device device; + struct stf_exec_place_host host; + } u; +}; + +static inline struct stf_exec_place make_device_place(int dev_id) +{ + struct stf_exec_place p; + p.kind = STF_EXEC_PLACE_DEVICE; + p.u.device.dev_id = dev_id; + return p; +} + +static inline struct stf_exec_place make_host_place() +{ + struct stf_exec_place p; + p.kind = STF_EXEC_PLACE_HOST; + p.u.host.dummy = 0; /* to avoid uninitialized memory warnings */ + return p; +} + +typedef struct stf_exec_place_device stf_exec_place_device; +typedef struct stf_exec_place_host stf_exec_place_host; +typedef union stf_exec_place_u stf_exec_place_u; +typedef struct stf_exec_place stf_exec_place; + +struct stf_data_place_device +{ + int dev_id; +}; + +struct stf_data_place_host +{ + char dummy; /* dummy to keep it standard C which does not allow empty structs */ +}; + +struct stf_data_place_managed +{ + char dummy; /* dummy to keep it standard C which does not allow empty structs */ +}; + +struct stf_data_place_affine +{ + char dummy; /* dummy to keep it standard C which does not allow empty structs */ +}; + +typedef enum stf_data_place_kind +{ + STF_DATA_PLACE_DEVICE, + STF_DATA_PLACE_HOST, + STF_DATA_PLACE_MANAGED, + STF_DATA_PLACE_AFFINE +} stf_data_place_kind; + +struct stf_data_place +{ + enum stf_data_place_kind kind; + union + { + struct stf_data_place_device device; + struct stf_data_place_host host; + struct stf_data_place_managed managed; + struct stf_data_place_affine affine; + } u; +}; + +static inline struct stf_data_place make_device_data_place(int dev_id) +{ + struct stf_data_place p; + p.kind = STF_DATA_PLACE_DEVICE; + p.u.device.dev_id = dev_id; + return p; +} + +static inline struct stf_data_place make_host_data_place() +{ + struct stf_data_place p; + p.kind = STF_DATA_PLACE_HOST; + p.u.host.dummy = 0; /* to avoid uninitialized memory warnings */ + return p; +} + +static inline struct stf_data_place make_managed_data_place() +{ + struct stf_data_place p; + p.kind = STF_DATA_PLACE_MANAGED; + p.u.managed.dummy = 0; /* to avoid uninitialized memory warnings */ + return p; +} + +static inline struct stf_data_place make_affine_data_place() +{ + struct stf_data_place p; + p.kind = STF_DATA_PLACE_AFFINE; + p.u.affine.dummy = 0; /* to avoid uninitialized memory warnings */ + return p; +} + +typedef struct stf_data_place_device stf_data_place_device; +typedef struct stf_data_place_host stf_data_place_host; +typedef struct stf_data_place_managed stf_data_place_managed; +typedef struct stf_data_place_affine stf_data_place_affine; +typedef union stf_data_place_u stf_data_place_u; +typedef struct stf_data_place stf_data_place; + +typedef struct stf_ctx_handle_t* stf_ctx_handle; + +void stf_ctx_create(stf_ctx_handle* ctx); +// TODO stf_ctx_create_with_flags and an enum 
instead ? +void stf_ctx_create_graph(stf_ctx_handle* ctx); +void stf_ctx_finalize(stf_ctx_handle ctx); + +// TODO stf_ctx_set_mode() + define enum with GRAPH, STREAM, ... +// TODO stf_ctx_is_graph() + +cudaStream_t stf_fence(stf_ctx_handle ctx); + +typedef struct stf_logical_data_handle_t* stf_logical_data_handle; + +void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); +void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); +void stf_logical_data_destroy(stf_logical_data_handle ld); +void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle* to); + +// TODO +// void stf_logical_data_wait(stf_logical_data_handle ld); + +void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld); + +typedef struct stf_task_handle_t* stf_task_handle; + +void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t); +void stf_task_set_exec_place(stf_task_handle t, struct stf_exec_place* exec_p); +void stf_task_set_symbol(stf_task_handle t, const char* symbol); +void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m); +void stf_task_add_dep_with_dplace( + stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, struct stf_data_place* data_p); +void stf_task_start(stf_task_handle t); +void stf_task_end(stf_task_handle t); +CUstream stf_task_get_custream(stf_task_handle t); +void* stf_task_get(stf_task_handle t, int submitted_index); +void stf_task_destroy(stf_task_handle t); +void stf_task_enable_capture(stf_task_handle t); + +typedef struct stf_cuda_kernel_handle_t* stf_cuda_kernel_handle; + +void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k); +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, struct stf_exec_place* exec_p); +void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol); +void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); +void stf_cuda_kernel_start(stf_cuda_kernel_handle k); + +void stf_cuda_kernel_add_desc_cufunc( + stf_cuda_kernel_handle k, + CUfunction cufunc, + dim3 gridDim_, + dim3 blockDim_, + size_t sharedMem_, + int arg_cnt, + const void** args); + +/* Convert CUDA kernel address to CUfunction because we may use them from a + * shared library where this would be invalid in the runtime API. */ +static inline void stf_cuda_kernel_add_desc( + stf_cuda_kernel_handle k, + const void* func, + dim3 gridDim_, + dim3 blockDim_, + size_t sharedMem_, + int arg_cnt, + const void** args) +{ + CUfunction cufunc; + [[maybe_unused]] cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); + assert(res == cudaSuccess); + + stf_cuda_kernel_add_desc_cufunc(k, cufunc, gridDim_, blockDim_, sharedMem_, arg_cnt, args); +} + +void* stf_cuda_kernel_get_arg(stf_cuda_kernel_handle k, int index); +void stf_cuda_kernel_end(stf_cuda_kernel_handle k); +void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t); + +#ifdef __cplusplus +} +#endif diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu new file mode 100644 index 00000000000..0a92d86b677 --- /dev/null +++ b/c/experimental/stf/src/stf.cu @@ -0,0 +1,297 @@ +#include +// #include +#include + +using namespace cuda::experimental::stf; + +extern "C" { + +struct stf_ctx_handle_t +{ + context ctx; +}; + +struct stf_logical_data_handle_t +{ + // XXX should we always store a logical_data> instead ? 
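  // Sketch of the trade-off raised above, assuming the slice<char> view used
  // elsewhere in this file: with the untyped handle, accessors re-type the
  // data at each use site, e.g. stf_task_get() below does
  //   auto s = t->t.get<slice<char>>(index);
  // whereas storing a logical_data<slice<char>> here would fix the element
  // type once at creation, at the cost of generality.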
+ logical_data_untyped ld; +}; + +struct stf_task_handle_t +{ + context::unified_task<> t; +}; + +void stf_ctx_create(stf_ctx_handle* ctx) +{ + assert(ctx); + *ctx = new stf_ctx_handle_t{context{}}; +} + +void stf_ctx_create_graph(stf_ctx_handle* ctx) +{ + assert(ctx); + *ctx = new stf_ctx_handle_t{context{graph_ctx()}}; +} + +void stf_ctx_finalize(stf_ctx_handle ctx) +{ + ctx->ctx.finalize(); + assert(ctx); + delete ctx; +} + +cudaStream_t stf_fence(stf_ctx_handle ctx) +{ + assert(ctx); + return ctx->ctx.fence(); +} + +void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) +{ + assert(ld); + assert(ctx); + + // Create a slice logical data + auto ld_typed = ctx->ctx.logical_data(make_slice((char*) addr, sz)); + + // Stored in its untyped version + *ld = new stf_logical_data_handle_t{ld_typed}; +} + +void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) +{ + assert(ld); + ld->ld.set_symbol(symbol); +} + +void stf_logical_data_destroy(stf_logical_data_handle ld) +{ + assert(ld); + delete ld; +} + +void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle* to) +{ + assert(ctx); + assert(to); + + auto ld_typed = ctx->ctx.logical_data(shape_of>(length)); + *to = new stf_logical_data_handle_t{ld_typed}; +} + +void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) +{ + assert(ctx); + assert(ld); + + *ld = new stf_logical_data_handle_t{ctx->ctx.token()}; +} + +/* Convert the C-API stf_exec_place to a C++ exec_place object */ +exec_place to_exec_place(struct stf_exec_place* exec_p) +{ + if (exec_p->kind == STF_EXEC_PLACE_HOST) + { + return exec_place::host(); + } + + assert(exec_p->kind == STF_EXEC_PLACE_DEVICE); + return exec_place::device(exec_p->u.device.dev_id); +} + +/* Convert the C-API stf_data_place to a C++ data_place object */ +data_place to_data_place(struct stf_data_place* data_p) +{ + assert(data_p); + + if (data_p->kind == STF_DATA_PLACE_HOST) + { + return data_place::host(); + } + + if (data_p->kind == STF_DATA_PLACE_MANAGED) + { + return data_place::managed(); + } + + if (data_p->kind == STF_DATA_PLACE_AFFINE) + { + return data_place::affine(); + } + + assert(data_p->kind == STF_DATA_PLACE_DEVICE); + return data_place::device(data_p->u.device.dev_id); +} + +void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) +{ + assert(t); + assert(ctx); + + *t = new stf_task_handle_t{ctx->ctx.task()}; +} + +void stf_task_set_exec_place(stf_task_handle t, struct stf_exec_place* exec_p) +{ + assert(t); + t->t.set_exec_place(to_exec_place(exec_p)); +} + +void stf_task_set_symbol(stf_task_handle t, const char* symbol) +{ + assert(t); + assert(symbol); + + t->t.set_symbol(symbol); +} + +void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m) +{ + assert(t); + assert(ld); + + t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m))); +} + +void stf_task_add_dep_with_dplace( + stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, struct stf_data_place* data_p) +{ + assert(t); + assert(ld); + assert(data_p); + + t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m), to_data_place(data_p))); +} + +void* stf_task_get(stf_task_handle t, int index) +{ + assert(t); + auto s = t->t.template get>(index); + return (void*) s.data_handle(); +} + +void stf_task_start(stf_task_handle t) +{ + assert(t); + t->t.start(); +} + +void stf_task_end(stf_task_handle t) +{ + assert(t); + t->t.end(); +} + +void stf_task_enable_capture(stf_task_handle t) +{ + assert(t); + 
t->t.enable_capture(); +} + +CUstream stf_task_get_custream(stf_task_handle t) +{ + assert(t); + return (CUstream) t->t.get_stream(); +} + +void stf_task_destroy(stf_task_handle t) +{ + assert(t); + delete t; +} + +/** + * Low level example of cuda_kernel(_chain) + * auto t = ctx.cuda_kernel_chain(); + t.add_deps(lX.read()); + t.add_deps(lY.rw()); + t->*[&]() { + auto dX = t.template get>(0); + auto dY = t.template get>(1); + return std::vector { + { axpy, 16, 128, 0, alpha, dX, dY }, + { axpy, 16, 128, 0, beta, dX, dY }, + { axpy, 16, 128, 0, gamma, dX, dY } + }; + }; + + * + */ +struct stf_cuda_kernel_handle_t +{ + // return type of ctx.cuda_kernel() + using kernel_type = decltype(::std::declval().cuda_kernel()); + kernel_type k; +}; + +void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k) +{ + assert(k); + assert(ctx); + + *k = new stf_cuda_kernel_handle_t{ctx->ctx.cuda_kernel()}; +} + +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, struct stf_exec_place* exec_p) +{ + assert(k); + k->k.set_exec_place(to_exec_place(exec_p)); +} + +void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol) +{ + assert(k); + assert(symbol); + + k->k.set_symbol(symbol); +} + +void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m) +{ + assert(k); + assert(ld); + + k->k.add_deps(task_dep_untyped(ld->ld, access_mode(m))); +} + +void stf_cuda_kernel_start(stf_cuda_kernel_handle k) +{ + assert(k); + k->k.start(); +} + +void stf_cuda_kernel_add_desc_cufunc( + stf_cuda_kernel_handle k, + CUfunction cufunc, + dim3 gridDim_, + dim3 blockDim_, + size_t sharedMem_, + int arg_cnt, + const void** args) +{ + cuda_kernel_desc desc; + desc.configure_raw(cufunc, gridDim_, blockDim_, sharedMem_, arg_cnt, args); + + k->k.add_kernel_desc(mv(desc)); +} + +void* stf_cuda_kernel_get_arg(stf_cuda_kernel_handle k, int index) +{ + auto s = k->k.template get>(index); + return (void*) s.data_handle(); +} + +void stf_cuda_kernel_end(stf_cuda_kernel_handle k) +{ + assert(k); + k->k.end(); +} + +void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t) +{ + assert(t); + delete t; +} + +} // extern "C" diff --git a/c/experimental/stf/test/CMakeLists.txt b/c/experimental/stf/test/CMakeLists.txt new file mode 100644 index 00000000000..f5613253a81 --- /dev/null +++ b/c/experimental/stf/test/CMakeLists.txt @@ -0,0 +1,39 @@ +cccl_get_c2h() + +function(cccl_c_experimental_stf_add_test target_name_var source) + string(REGEX REPLACE "test_([^.]*)" "cccl.c.experimental.stf.test.\\1" target_name "${source}") + set(target_name_var ${target_name} PARENT_SCOPE) + + add_executable(${target_name} "${source}") + cccl_configure_target(${target_name} DIALECT 20) + + set_target_properties(${target_name} PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) + target_link_libraries(${target_name} PRIVATE + cccl.c.experimental.stf + CUDA::cudart_static + CUDA::nvrtc + cccl.c2h.main + cccl.compiler_interface_cpp20 + CUDA::cuda_driver + CCCL::cudax + ) + + target_compile_definitions(${target_name} PRIVATE + TEST_CUB_PATH="-I${CCCL_SOURCE_DIR}/cub" + TEST_THRUST_PATH="-I${CCCL_SOURCE_DIR}/thrust" + TEST_LIBCUDACXX_PATH="-I${CCCL_SOURCE_DIR}/libcudacxx/include" + TEST_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}" + ) + + add_test(NAME ${target_name} COMMAND ${target_name}) +endfunction() + +file(GLOB test_srcs + RELATIVE "${CMAKE_CURRENT_LIST_DIR}" + CONFIGURE_DEPENDS + *.cu *.cpp +) + +foreach(test_src IN LISTS test_srcs) + cccl_c_experimental_stf_add_test(test_target "${test_src}") 
+endforeach() diff --git a/c/experimental/stf/test/test_ctx.cpp b/c/experimental/stf/test/test_ctx.cpp new file mode 100644 index 00000000000..86225ad91c7 --- /dev/null +++ b/c/experimental/stf/test/test_ctx.cpp @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +C2H_TEST("basic stf context", "[context]") +{ + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + stf_ctx_finalize(ctx); +} diff --git a/c/experimental/stf/test/test_cuda_kernel.cu b/c/experimental/stf/test/test_cuda_kernel.cu new file mode 100644 index 00000000000..b5ba66b0f3a --- /dev/null +++ b/c/experimental/stf/test/test_cuda_kernel.cu @@ -0,0 +1,90 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +__global__ void axpy(int cnt, double a, const double* x, double* y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + for (int i = tid; i < cnt; i += nthreads) + { + y[i] += a * x[i]; + } +} + +double X0(int i) +{ + return sin((double) i); +} + +double Y0(int i) +{ + return cos((double) i); +} + +C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lX, lY; + + double *X, *Y; + X = (double*) malloc(N * sizeof(double)); + Y = (double*) malloc(N * sizeof(double)); + + for (size_t i = 0; i < N; i++) + { + X[i] = X0(i); + Y[i] = Y0(i); + } + + const double alpha = 3.14; + + stf_logical_data(ctx, &lX, X, N * sizeof(double)); + stf_logical_data(ctx, &lY, Y, N * sizeof(double)); + + stf_logical_data_set_symbol(lX, "X"); + stf_logical_data_set_symbol(lY, "Y"); + + stf_cuda_kernel_handle k; + stf_cuda_kernel_create(ctx, &k); + stf_cuda_kernel_set_symbol(k, "axpy"); + stf_cuda_kernel_add_dep(k, lX, STF_READ); + stf_cuda_kernel_add_dep(k, lY, STF_RW); + stf_cuda_kernel_start(k); + double* dX = (double*) stf_cuda_kernel_get_arg(k, 0); + double* dY = (double*) stf_cuda_kernel_get_arg(k, 1); + const void* args[4] = {&N, &alpha, &dX, &dY}; + stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); + stf_cuda_kernel_end(k); + stf_cuda_kernel_destroy(k); + + stf_logical_data_destroy(lX); + stf_logical_data_destroy(lY); + + stf_ctx_finalize(ctx); + + for (size_t i = 0; i < N; i++) + { + assert(fabs(Y[i] - (Y0(i) + alpha * X0(i))) < 0.0001); + assert(fabs(X[i] - X0(i)) < 0.0001); + } + + free(X); + free(Y); +} diff --git a/c/experimental/stf/test/test_logical_data.cpp b/c/experimental/stf/test/test_logical_data.cpp new file mode 100644 index 00000000000..168ca8dabbc --- /dev/null +++ b/c/experimental/stf/test/test_logical_data.cpp @@ -0,0 +1,39 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +C2H_TEST("basic stf logical_data", "[logical_data]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lA, lB; + + float *A, *B; + A = (float*) malloc(N * sizeof(float)); + B = (float*) malloc(N * sizeof(float)); + + stf_logical_data(ctx, &lA, A, N * sizeof(float)); + stf_logical_data(ctx, &lB, B, N * sizeof(float)); + + stf_logical_data_destroy(lA); + stf_logical_data_destroy(lB); + + stf_ctx_finalize(ctx); + + free(A); + free(B); +} diff --git a/c/experimental/stf/test/test_places.cpp b/c/experimental/stf/test/test_places.cpp new file mode 100644 index 00000000000..eeba229c758 --- /dev/null +++ b/c/experimental/stf/test/test_places.cpp @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +C2H_TEST("empty stf tasks", "[task]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lX, lY, lZ; + + float *X, *Y, *Z; + X = (float*) malloc(N * sizeof(float)); + Y = (float*) malloc(N * sizeof(float)); + Z = (float*) malloc(N * sizeof(float)); + + stf_logical_data(ctx, &lX, X, N * sizeof(float)); + stf_logical_data(ctx, &lY, Y, N * sizeof(float)); + stf_logical_data(ctx, &lZ, Z, N * sizeof(float)); + + stf_logical_data_set_symbol(lX, "X"); + stf_logical_data_set_symbol(lY, "Y"); + stf_logical_data_set_symbol(lZ, "Z"); + + stf_task_handle t1; + stf_task_create(ctx, &t1); + stf_task_set_symbol(t1, "T1"); + stf_task_add_dep(t1, lX, STF_RW); + stf_task_start(t1); + stf_task_end(t1); + + stf_task_handle t2; + stf_task_create(ctx, &t2); + stf_task_set_symbol(t2, "T2"); + stf_task_add_dep(t2, lX, STF_READ); + stf_task_add_dep(t2, lY, STF_RW); + stf_task_start(t2); + stf_task_end(t2); + + stf_task_handle t3; + stf_task_create(ctx, &t3); + stf_task_set_symbol(t3, "T3"); + auto e_place_dev0 = make_device_place(0); + stf_task_set_exec_place(t3, &e_place_dev0); + stf_task_add_dep(t3, lX, STF_READ); + stf_task_add_dep(t3, lZ, STF_RW); + stf_task_start(t3); + stf_task_end(t3); + + stf_task_handle t4; + stf_task_create(ctx, &t4); + stf_task_set_symbol(t4, "T4"); + stf_task_add_dep(t4, lY, STF_READ); + auto d_place_dev0 = make_device_data_place(0); + stf_task_add_dep_with_dplace(t4, lZ, STF_RW, &d_place_dev0); + stf_task_start(t4); + stf_task_end(t4); + + stf_logical_data_destroy(lX); + stf_logical_data_destroy(lY); + stf_logical_data_destroy(lZ); + + stf_ctx_finalize(ctx); + + free(X); + free(Y); + free(Z); +} diff --git a/c/experimental/stf/test/test_task.cpp b/c/experimental/stf/test/test_task.cpp new file mode 100644 
index 00000000000..80266f6b381 --- /dev/null +++ b/c/experimental/stf/test/test_task.cpp @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +C2H_TEST("empty stf tasks", "[task]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lX, lY, lZ; + + float *X, *Y, *Z; + X = (float*) malloc(N * sizeof(float)); + Y = (float*) malloc(N * sizeof(float)); + Z = (float*) malloc(N * sizeof(float)); + + stf_logical_data(ctx, &lX, X, N * sizeof(float)); + stf_logical_data(ctx, &lY, Y, N * sizeof(float)); + stf_logical_data(ctx, &lZ, Z, N * sizeof(float)); + + stf_logical_data_set_symbol(lX, "X"); + stf_logical_data_set_symbol(lY, "Y"); + stf_logical_data_set_symbol(lZ, "Z"); + + stf_task_handle t1; + stf_task_create(ctx, &t1); + stf_task_set_symbol(t1, "T1"); + stf_task_add_dep(t1, lX, STF_RW); + stf_task_start(t1); + stf_task_end(t1); + + stf_task_handle t2; + stf_task_create(ctx, &t2); + stf_task_set_symbol(t2, "T2"); + stf_task_add_dep(t2, lX, STF_READ); + stf_task_add_dep(t2, lY, STF_RW); + stf_task_start(t2); + stf_task_end(t2); + + stf_task_handle t3; + stf_task_create(ctx, &t3); + stf_task_set_symbol(t3, "T3"); + stf_task_add_dep(t3, lX, STF_READ); + stf_task_add_dep(t3, lZ, STF_RW); + stf_task_start(t3); + stf_task_end(t3); + + stf_task_handle t4; + stf_task_create(ctx, &t4); + stf_task_set_symbol(t4, "T4"); + stf_task_add_dep(t4, lY, STF_READ); + stf_task_add_dep(t4, lZ, STF_RW); + stf_task_start(t4); + stf_task_end(t4); + + stf_logical_data_destroy(lX); + stf_logical_data_destroy(lY); + stf_logical_data_destroy(lZ); + + stf_ctx_finalize(ctx); + + free(X); + free(Y); + free(Z); +} diff --git a/c/experimental/stf/test/test_token.cpp b/c/experimental/stf/test/test_token.cpp new file mode 100644 index 00000000000..ccd7f0a9e2c --- /dev/null +++ b/c/experimental/stf/test/test_token.cpp @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +C2H_TEST("stf token", "[token]") +{ + size_t N = 1000000; + + stf_ctx_handle ctx; + stf_ctx_create(&ctx); + + stf_logical_data_handle lX, lY, lZ; + + float *X, *Y, *Z; + X = (float*) malloc(N * sizeof(float)); + Y = (float*) malloc(N * sizeof(float)); + Z = (float*) malloc(N * sizeof(float)); + + stf_token(ctx, &lX); + stf_token(ctx, &lY); + stf_token(ctx, &lZ); + + stf_logical_data_set_symbol(lX, "X"); + stf_logical_data_set_symbol(lY, "Y"); + stf_logical_data_set_symbol(lZ, "Z"); + + stf_task_handle t1; + stf_task_create(ctx, &t1); + stf_task_set_symbol(t1, "T1"); + stf_task_add_dep(t1, lX, STF_RW); + stf_task_start(t1); + stf_task_end(t1); + + stf_task_handle t2; + stf_task_create(ctx, &t2); + stf_task_set_symbol(t2, "T2"); + stf_task_add_dep(t2, lX, STF_READ); + stf_task_add_dep(t2, lY, STF_RW); + stf_task_start(t2); + stf_task_end(t2); + + stf_task_handle t3; + stf_task_create(ctx, &t3); + stf_task_set_symbol(t3, "T3"); + stf_task_add_dep(t3, lX, STF_READ); + stf_task_add_dep(t3, lZ, STF_RW); + stf_task_start(t3); + stf_task_end(t3); + + stf_task_handle t4; + stf_task_create(ctx, &t4); + stf_task_set_symbol(t4, "T4"); + stf_task_add_dep(t4, lY, STF_READ); + stf_task_add_dep(t4, lZ, STF_RW); + stf_task_start(t4); + stf_task_end(t4); + + stf_logical_data_destroy(lX); + stf_logical_data_destroy(lY); + stf_logical_data_destroy(lZ); + + stf_ctx_finalize(ctx); + + free(X); + free(Y); + free(Z); +} From 522b630d7e652060970cc9efce3bbe22145985ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 15:25:39 +0200 Subject: [PATCH 140/221] remove dead code --- c/experimental/stf/CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 11599edaec7..40f3fafffd7 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -46,7 +46,3 @@ target_include_directories(cccl.c.experimental.stf PRIVATE "src") if (CCCL_C_Parallel_ENABLE_TESTING) add_subdirectory(test) endif() - -# if (CCCL_C_Parallel_ENABLE_HEADER_TESTING) -# include(cmake/CParallelHeaderTesting.cmake) -# endif() From 43153141a20a1be102cc0c258a3566930fe229da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 15:29:28 +0200 Subject: [PATCH 141/221] do define and use CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING --- CMakePresets.json | 15 +++++++++++++-- c/experimental/stf/CMakeLists.txt | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index b39ab345fc9..63f06ffa281 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -87,7 +87,8 @@ "cudax_ENABLE_DIALECT_CPP17": true, "cudax_ENABLE_DIALECT_CPP20": true, "CCCL_C_Parallel_ENABLE_TESTING": true, - "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true + "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true, + "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true } }, { @@ -270,7 +271,17 @@ "cacheVariables": { "CCCL_ENABLE_C": true, "CCCL_C_Parallel_ENABLE_TESTING": true, - "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true + "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true, + "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true + } + }, + { + "name": "cccl-c-stf", + "displayName": "CCCL C CUDASTF Library", + "inherits": "base", + "cacheVariables": { + "CCCL_ENABLE_C": true, + "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true } }, { diff --git 
a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 40f3fafffd7..58e78427727 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -43,6 +43,6 @@ target_compile_options(cccl.c.experimental.stf PRIVATE $<$ Date: Tue, 2 Sep 2025 15:36:47 +0200 Subject: [PATCH 142/221] Add CUDASTF C lib to tests --- ci/build_cccl_c_stf.sh | 15 +++++++++++++++ ci/matrix.yaml | 5 +++++ ci/test_cccl_c_stf.sh | 13 +++++++++++++ 3 files changed, 33 insertions(+) create mode 100755 ci/build_cccl_c_stf.sh create mode 100755 ci/test_cccl_c_stf.sh diff --git a/ci/build_cccl_c_stf.sh b/ci/build_cccl_c_stf.sh new file mode 100755 index 00000000000..9fcc8559287 --- /dev/null +++ b/ci/build_cccl_c_stf.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -euo pipefail + +source "$(dirname "${BASH_SOURCE[0]}")/build_common.sh" + +print_environment_details + +PRESET="cccl-c-stf" + +CMAKE_OPTIONS="" + +configure_and_build_preset "CCCL C CUDASTF Library" "$PRESET" "$CMAKE_OPTIONS" + +print_time_summary diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 3f003633eda..14529c9fcfd 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -54,6 +54,7 @@ workflows: - {jobs: ['test'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'} # Python and c/parallel jobs: - {jobs: ['test'], project: ['cccl_c_parallel'], gpu: ['rtx2080', 'l4', 'h100']} + - {jobs: ['test'], project: ['cccl_c_stf'], gpu: ['rtx2080', 'l4', 'h100']} # TODO Just need this line once cccl.parallel tests pass on 12.5 and 12.6: # - {jobs: ['test'], project: 'python', ctk: ['12.5', 'curr'], py_version: ['3.10', '3.13'], gpu: 'l4'} # These two can be removed once the above is working: @@ -115,6 +116,7 @@ workflows: - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang'], gpu: 'rtx2080'} # Python and c/parallel jobs: - {jobs: ['test'], project: ['cccl_c_parallel'], gpu: ['rtx2080', 'l4', 'h100']} + - {jobs: ['test'], project: ['cccl_c_stf'], gpu: ['rtx2080', 'l4', 'h100']} # TODO Just need this line once cccl.parallel tests pass on 12.5 and 12.6: # - {jobs: ['test'], project: 'python', ctk: ['12.5', '12.6', '12.8', '12.9'], py_version: ['3.10', '3.11', '3.12', '3.13'], gpu: 'l4'} # These two can be removed once the above is working. 
@@ -328,6 +330,9 @@ projects: cccl_c_parallel: name: 'CCCL C Parallel' stds: [20] + cccl_c_stf: + name: 'CCCL C CUDASTF' + stds: [20] # testing -> Runner with GPU is in a nv-gh-runners testing pool gpus: diff --git a/ci/test_cccl_c_stf.sh b/ci/test_cccl_c_stf.sh new file mode 100755 index 00000000000..090e341292a --- /dev/null +++ b/ci/test_cccl_c_stf.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +source "$(dirname "${BASH_SOURCE[0]}")/build_common.sh" + +print_environment_details + +./build_cccl_c_stf.sh "$@" + +PRESET="cccl-c-stf" + +test_preset "CCCL C CUDASTF Library" ${PRESET} + +print_time_summary From c87cdaa6414c1d7fb27e692553c79f8d51ffe4e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 15:50:29 +0200 Subject: [PATCH 143/221] Add missing headers --- .../stf/include/cccl/c/experimental/stf/stf.h | 16 ++++++++++++++++ c/experimental/stf/src/stf.cu | 10 ++++++++++ 2 files changed, 26 insertions(+) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 6f2f903e6c8..9ab82e1b213 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -1,3 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#ifndef CCCL_C_EXPERIMENTAL +# error "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this notice." +#endif // !CCCL_C_EXPERIMENTAL + #include #include #include diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 0a92d86b677..e300d427105 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+// +//===----------------------------------------------------------------------===// + #include // #include #include From 02a9eb6c5f50166f7816a3be03e069d9c1dd315d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 15:53:38 +0200 Subject: [PATCH 144/221] use snake_case --- .../stf/include/cccl/c/experimental/stf/stf.h | 14 +++++++------- c/experimental/stf/src/stf.cu | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 9ab82e1b213..7768c132feb 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -207,9 +207,9 @@ void stf_cuda_kernel_start(stf_cuda_kernel_handle k); void stf_cuda_kernel_add_desc_cufunc( stf_cuda_kernel_handle k, CUfunction cufunc, - dim3 gridDim_, - dim3 blockDim_, - size_t sharedMem_, + dim3 grid_dim_, + dim3 block_dim_, + size_t shared_mem_, int arg_cnt, const void** args); @@ -218,9 +218,9 @@ void stf_cuda_kernel_add_desc_cufunc( static inline void stf_cuda_kernel_add_desc( stf_cuda_kernel_handle k, const void* func, - dim3 gridDim_, - dim3 blockDim_, - size_t sharedMem_, + dim3 grid_dim_, + dim3 block_dim_, + size_t shared_mem_, int arg_cnt, const void** args) { @@ -228,7 +228,7 @@ static inline void stf_cuda_kernel_add_desc( [[maybe_unused]] cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); assert(res == cudaSuccess); - stf_cuda_kernel_add_desc_cufunc(k, cufunc, gridDim_, blockDim_, sharedMem_, arg_cnt, args); + stf_cuda_kernel_add_desc_cufunc(k, cufunc, grid_dim_, block_dim_, shared_mem_, arg_cnt, args); } void* stf_cuda_kernel_get_arg(stf_cuda_kernel_handle k, int index); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index e300d427105..6531f0d6289 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -274,14 +274,14 @@ void stf_cuda_kernel_start(stf_cuda_kernel_handle k) void stf_cuda_kernel_add_desc_cufunc( stf_cuda_kernel_handle k, CUfunction cufunc, - dim3 gridDim_, - dim3 blockDim_, - size_t sharedMem_, + dim3 grid_dim_, + dim3 block_dim_, + size_t shared_mem_, int arg_cnt, const void** args) { cuda_kernel_desc desc; - desc.configure_raw(cufunc, gridDim_, blockDim_, sharedMem_, arg_cnt, args); + desc.configure_raw(cufunc, grid_dim_, block_dim_, shared_mem_, arg_cnt, args); k->k.add_kernel_desc(mv(desc)); } From 232133b9600258881ea6e2eaf88e311470260fe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 16:26:46 +0200 Subject: [PATCH 145/221] Do define CCCL_C_EXPERIMENTAL=1 --- c/experimental/stf/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 58e78427727..85f9bdb4c34 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -29,6 +29,7 @@ endif() find_package(CUDAToolkit REQUIRED) set_target_properties(cccl.c.experimental.stf PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) +target_compile_definitions(cccl.c.experimental.stf PUBLIC CCCL_C_EXPERIMENTAL=1) target_link_libraries(cccl.c.experimental.stf PRIVATE CUDA::cudart_static CUDA::nvrtc From b60eb6b1006cc93c8c519271fa85adf5fbfc5e5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 16:49:36 +0200 Subject: [PATCH 146/221] Do not do redundant tests --- CMakePresets.json | 4 +++- 1 file changed, 3 
insertions(+), 1 deletion(-) diff --git a/CMakePresets.json b/CMakePresets.json index 63f06ffa281..876b02acc6d 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -272,7 +272,7 @@ "CCCL_ENABLE_C": true, "CCCL_C_Parallel_ENABLE_TESTING": true, "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true, - "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true + "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": false } }, { @@ -281,6 +281,8 @@ "inherits": "base", "cacheVariables": { "CCCL_ENABLE_C": true, + "CCCL_C_Parallel_ENABLE_TESTING": false, + "CCCL_C_Parallel_ENABLE_HEADER_TESTING": false, "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true } }, From c4c99f01474ac6e46466438257d1d5c29cfa27c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 16:54:46 +0200 Subject: [PATCH 147/221] Add a project to ci/inspect_changes.sh --- ci/inspect_changes.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/inspect_changes.sh b/ci/inspect_changes.sh index 4ccf4a6fc70..7c7dfab4580 100755 --- a/ci/inspect_changes.sh +++ b/ci/inspect_changes.sh @@ -63,6 +63,7 @@ declare -A dependencies=( [stdpar]="cccl libcudacxx cub thrust" [python]="cccl libcudacxx cub cccl_c_parallel" [cccl_c_parallel]="cccl libcudacxx cub thrust c2h" + [cccl_c_stf]="cccl libcudacxx cudax c2h" [c2h]="cccl libcudacxx cub thrust" [nvbench_helper]="cccl libcudacxx cub thrust" ) @@ -77,6 +78,7 @@ declare -A project_names=( [stdpar]="stdpar" [python]="python" [cccl_c_parallel]="CCCL C Parallel Library" + [cccl_c_stf]="CCCL C CUDASTF Library" [c2h]="Catch2Helper" [nvbench_helper]="NVBench Helper" ) From 2f5925b166962cf7236ad2ee789cf5b738d7cd50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 16:59:50 +0200 Subject: [PATCH 148/221] missing changes in previous commit --- ci/inspect_changes.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/inspect_changes.sh b/ci/inspect_changes.sh index 7c7dfab4580..f48b82270bd 100755 --- a/ci/inspect_changes.sh +++ b/ci/inspect_changes.sh @@ -47,6 +47,7 @@ subprojects=( stdpar python cccl_c_parallel + cccl_c_stf c2h nvbench_helper ) @@ -92,6 +93,7 @@ declare -A project_names=( declare -A project_dirs=( [packaging]='("examples" "test/cmake")' [cccl_c_parallel]='("c/parallel")' + [cccl_c_stf]='("c/experimental/stf")' [stdpar]='("test/stdpar")' ) From 3417075f27c17d76b1378713d08ab8d55dffd282 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 2 Sep 2025 17:11:31 +0200 Subject: [PATCH 149/221] add presets --- CMakePresets.json | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CMakePresets.json b/CMakePresets.json index 876b02acc6d..1fe5025ba4d 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -439,6 +439,10 @@ "name": "cccl-c-parallel", "configurePreset": "cccl-c-parallel" }, + { + "name": "cccl-c-stf", + "configurePreset": "cccl-c-stf" + }, { "name": "packaging", "configurePreset": "packaging" @@ -732,6 +736,11 @@ "configurePreset": "cccl-c-parallel", "inherits": "base" }, + { + "name": "cccl-c-stf", + "configurePreset": "cccl-c-stf", + "inherits": "base" + }, { "name": "packaging", "configurePreset": "packaging", From 8c05034efad0460969f701bd92aa1b13a9072b7c Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Tue, 2 Sep 2025 15:32:04 +0000 Subject: [PATCH 150/221] Add override matrix --- ci/matrix.yaml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 14529c9fcfd..d1ed0fb08b1 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -8,6 +8,11 @@ 
workflows: # - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'clang16']} # override: + # Python and c/parallel jobs: + - {jobs: ['test'], project: ['cccl_c_parallel'], gpu: ['l4']} + - {jobs: ['test'], project: ['cccl_c_stf'], gpu: ['l4']} + - {jobs: ['test_py_headers', 'test_py_coop', 'test_py_examples'], ctk: ['12.5', 'curr'], project: 'python', py_version: ['3.10', '3.13'], gpu: 'l4'} + - {jobs: ['test_py_par'], ctk: ['12.8', 'curr'], project: 'python', py_version: ['3.10', '3.13'], gpu: 'l4'} pull_request: # Old CTK/compiler @@ -54,7 +59,7 @@ workflows: - {jobs: ['test'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'} # Python and c/parallel jobs: - {jobs: ['test'], project: ['cccl_c_parallel'], gpu: ['rtx2080', 'l4', 'h100']} - - {jobs: ['test'], project: ['cccl_c_stf'], gpu: ['rtx2080', 'l4', 'h100']} + - {jobs: ['test'], project: ['cccl_c_stf'], gpu: ['rtx2080', 'l4', 'h100']} # TODO Just need this line once cccl.parallel tests pass on 12.5 and 12.6: # - {jobs: ['test'], project: 'python', ctk: ['12.5', 'curr'], py_version: ['3.10', '3.13'], gpu: 'l4'} # These two can be removed once the above is working: @@ -116,7 +121,7 @@ workflows: - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang'], gpu: 'rtx2080'} # Python and c/parallel jobs: - {jobs: ['test'], project: ['cccl_c_parallel'], gpu: ['rtx2080', 'l4', 'h100']} - - {jobs: ['test'], project: ['cccl_c_stf'], gpu: ['rtx2080', 'l4', 'h100']} + - {jobs: ['test'], project: ['cccl_c_stf'], gpu: ['rtx2080', 'l4', 'h100']} # TODO Just need this line once cccl.parallel tests pass on 12.5 and 12.6: # - {jobs: ['test'], project: 'python', ctk: ['12.5', '12.6', '12.8', '12.9'], py_version: ['3.10', '3.11', '3.12', '3.13'], gpu: 'l4'} # These two can be removed once the above is working. 
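The cccl-c-stf presets and the ci/build_cccl_c_stf.sh / ci/test_cccl_c_stf.sh scripts introduced above are all driven through CMake's preset workflow; a minimal local equivalent — only a sketch, assuming the configure, build and test presets named cccl-c-stf from the CMakePresets.json hunks above and a CUDA toolkit on the path — looks like:

  cmake --preset cccl-c-stf           # configure with CCCL_ENABLE_C and CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING
  cmake --build --preset cccl-c-stf   # build cccl.c.experimental.stf and the c/experimental/stf tests
  ctest --preset cccl-c-stf           # run the test executables registered via add_test

build_cccl_c_stf.sh performs the configure-and-build step through the shared build_common.sh helpers, and test_cccl_c_stf.sh builds first and then runs the matching test preset.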
From 20faa8ff744f9320871e32cc70d711b04766ffd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 3 Sep 2025 08:34:20 +0200 Subject: [PATCH 151/221] Properly define structs with a typedef and remove superfluous struct keywords --- .../stf/include/cccl/c/experimental/stf/stf.h | 80 ++++++++----------- c/experimental/stf/src/stf.cu | 10 +-- 2 files changed, 39 insertions(+), 51 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 7768c132feb..357baff4ed8 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -31,15 +31,15 @@ typedef enum stf_access_mode STF_RW = STF_READ | STF_WRITE } stf_access_mode; -struct stf_exec_place_device +typedef struct stf_exec_place_device { int dev_id; -}; +} stf_exec_place_device; -struct stf_exec_place_host +typedef struct stf_exec_place_host { char dummy; /* dummy to keep it standard C which does not allow empty structs */ -}; +} stf_exec_place_host; typedef enum stf_exec_place_kind { @@ -47,56 +47,51 @@ typedef enum stf_exec_place_kind STF_EXEC_PLACE_HOST } stf_exec_place_kind; -struct stf_exec_place +typedef struct stf_exec_place { enum stf_exec_place_kind kind; union { - struct stf_exec_place_device device; - struct stf_exec_place_host host; + stf_exec_place_device device; + stf_exec_place_host host; } u; -}; +} stf_exec_place; -static inline struct stf_exec_place make_device_place(int dev_id) +static inline stf_exec_place make_device_place(int dev_id) { - struct stf_exec_place p; + stf_exec_place p; p.kind = STF_EXEC_PLACE_DEVICE; p.u.device.dev_id = dev_id; return p; } -static inline struct stf_exec_place make_host_place() +static inline stf_exec_place make_host_place() { - struct stf_exec_place p; + stf_exec_place p; p.kind = STF_EXEC_PLACE_HOST; p.u.host.dummy = 0; /* to avoid uninitialized memory warnings */ return p; } -typedef struct stf_exec_place_device stf_exec_place_device; -typedef struct stf_exec_place_host stf_exec_place_host; -typedef union stf_exec_place_u stf_exec_place_u; -typedef struct stf_exec_place stf_exec_place; - -struct stf_data_place_device +typedef struct stf_data_place_device { int dev_id; -}; +} stf_data_place_device; -struct stf_data_place_host +typedef struct stf_data_place_host { char dummy; /* dummy to keep it standard C which does not allow empty structs */ -}; +} stf_data_place_host; -struct stf_data_place_managed +typedef struct stf_data_place_managed { char dummy; /* dummy to keep it standard C which does not allow empty structs */ -}; +} stf_data_place_managed; -struct stf_data_place_affine +typedef struct stf_data_place_affine { char dummy; /* dummy to keep it standard C which does not allow empty structs */ -}; +} stf_data_place_affine; typedef enum stf_data_place_kind { @@ -106,21 +101,21 @@ typedef enum stf_data_place_kind STF_DATA_PLACE_AFFINE } stf_data_place_kind; -struct stf_data_place +typedef struct stf_data_place { enum stf_data_place_kind kind; union { - struct stf_data_place_device device; - struct stf_data_place_host host; - struct stf_data_place_managed managed; - struct stf_data_place_affine affine; + stf_data_place_device device; + stf_data_place_host host; + stf_data_place_managed managed; + stf_data_place_affine affine; } u; -}; +} stf_data_place; -static inline struct stf_data_place make_device_data_place(int dev_id) +static inline stf_data_place make_device_data_place(int dev_id) { - struct 
stf_data_place p; + stf_data_place p; p.kind = STF_DATA_PLACE_DEVICE; p.u.device.dev_id = dev_id; return p; @@ -128,7 +123,7 @@ static inline struct stf_data_place make_device_data_place(int dev_id) static inline struct stf_data_place make_host_data_place() { - struct stf_data_place p; + stf_data_place p; p.kind = STF_DATA_PLACE_HOST; p.u.host.dummy = 0; /* to avoid uninitialized memory warnings */ return p; @@ -136,7 +131,7 @@ static inline struct stf_data_place make_host_data_place() static inline struct stf_data_place make_managed_data_place() { - struct stf_data_place p; + stf_data_place p; p.kind = STF_DATA_PLACE_MANAGED; p.u.managed.dummy = 0; /* to avoid uninitialized memory warnings */ return p; @@ -144,19 +139,12 @@ static inline struct stf_data_place make_managed_data_place() static inline struct stf_data_place make_affine_data_place() { - struct stf_data_place p; + stf_data_place p; p.kind = STF_DATA_PLACE_AFFINE; p.u.affine.dummy = 0; /* to avoid uninitialized memory warnings */ return p; } -typedef struct stf_data_place_device stf_data_place_device; -typedef struct stf_data_place_host stf_data_place_host; -typedef struct stf_data_place_managed stf_data_place_managed; -typedef struct stf_data_place_affine stf_data_place_affine; -typedef union stf_data_place_u stf_data_place_u; -typedef struct stf_data_place stf_data_place; - typedef struct stf_ctx_handle_t* stf_ctx_handle; void stf_ctx_create(stf_ctx_handle* ctx); @@ -184,11 +172,11 @@ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld); typedef struct stf_task_handle_t* stf_task_handle; void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t); -void stf_task_set_exec_place(stf_task_handle t, struct stf_exec_place* exec_p); +void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p); void stf_task_set_symbol(stf_task_handle t, const char* symbol); void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m); void stf_task_add_dep_with_dplace( - stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, struct stf_data_place* data_p); + stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, stf_data_place* data_p); void stf_task_start(stf_task_handle t); void stf_task_end(stf_task_handle t); CUstream stf_task_get_custream(stf_task_handle t); @@ -199,7 +187,7 @@ void stf_task_enable_capture(stf_task_handle t); typedef struct stf_cuda_kernel_handle_t* stf_cuda_kernel_handle; void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k); -void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, struct stf_exec_place* exec_p); +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p); void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol); void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); void stf_cuda_kernel_start(stf_cuda_kernel_handle k); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 6531f0d6289..60a24710829 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -99,7 +99,7 @@ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) } /* Convert the C-API stf_exec_place to a C++ exec_place object */ -exec_place to_exec_place(struct stf_exec_place* exec_p) +exec_place to_exec_place(stf_exec_place* exec_p) { if (exec_p->kind == STF_EXEC_PLACE_HOST) { @@ -111,7 +111,7 @@ exec_place to_exec_place(struct stf_exec_place* exec_p) } /* Convert the C-API stf_data_place to a C++ 
data_place object */ -data_place to_data_place(struct stf_data_place* data_p) +data_place to_data_place(stf_data_place* data_p) { assert(data_p); @@ -142,7 +142,7 @@ void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) *t = new stf_task_handle_t{ctx->ctx.task()}; } -void stf_task_set_exec_place(stf_task_handle t, struct stf_exec_place* exec_p) +void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p) { assert(t); t->t.set_exec_place(to_exec_place(exec_p)); @@ -165,7 +165,7 @@ void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_ } void stf_task_add_dep_with_dplace( - stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, struct stf_data_place* data_p) + stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, stf_data_place* data_p) { assert(t); assert(ld); @@ -243,7 +243,7 @@ void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k) *k = new stf_cuda_kernel_handle_t{ctx->ctx.cuda_kernel()}; } -void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, struct stf_exec_place* exec_p) +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p) { assert(k); k->k.set_exec_place(to_exec_place(exec_p)); From 8c5e760326a6c2731f4c5b8b34ef0e42c63d37db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 3 Sep 2025 09:29:55 +0200 Subject: [PATCH 152/221] fix previous merge --- ci/matrix.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 3d9c82c5ce2..c03ffe5e9cc 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -11,8 +11,6 @@ workflows: # Python and c/parallel jobs: - {jobs: ['test'], project: ['cccl_c_parallel'], gpu: ['l4']} - {jobs: ['test'], project: ['cccl_c_stf'], gpu: ['l4']} - - {jobs: ['test_py_headers', 'test_py_coop', 'test_py_examples'], ctk: ['12.5', 'curr'], project: 'python', py_version: ['3.10', '3.13'], gpu: 'l4'} - - {jobs: ['test_py_par'], ctk: ['12.8', 'curr'], project: 'python', py_version: ['3.10', '3.13'], gpu: 'l4'} pull_request: # Old CTK: Oldest/newest supported host compilers: From 78dc197d8bd9f284f0827433b6024459946691c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 3 Sep 2025 10:18:00 +0200 Subject: [PATCH 153/221] Change tensor_arguments to return an element instead of a tuple of one element, and use this feature in examples. 
Also add the equivalent for NUMBA and use it in examples --- .../cccl/experimental/stf/_stf_bindings_impl.pyx | 8 ++++++++ python/cuda_cccl/tests/stf/test_fdtd_pytorch.py | 6 +++--- python/cuda_cccl/tests/stf/test_fhe.py | 16 +++++----------- python/cuda_cccl/tests/stf/test_fhe_decorator.py | 2 +- python/cuda_cccl/tests/stf/test_numba.py | 12 +++++------- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 3b4ba4db39b..ceca943e155 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -441,6 +441,12 @@ cdef class task: raise RuntimeError("numba support is not available") from e return cai_to_numba(cai) + def numba_arguments(self): + arg_cnt=len(self._lds_args) + if arg_cnt == 1: + return self.get_arg_numba(0) + return tuple(self.get_arg_numba(i) for i in range(arg_cnt)) + def get_arg_as_tensor(self, index): cai = self.get_arg_cai(index) try: @@ -451,6 +457,8 @@ cdef class task: def tensor_arguments(self): arg_cnt=len(self._lds_args) + if arg_cnt == 1: + return self.get_arg_as_tensor(0) return tuple(self.get_arg_as_tensor(i) for i in range(arg_cnt)) # ---- context‑manager helpers ------------------------------- diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index ccac389d6a6..3d1d6b0d2df 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -49,7 +49,7 @@ def init_field(ctx, ld, value): ctx.task(ld.write()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), ): - field = t.get_arg_as_tensor(0) + field = t.tensor_arguments() if value == 0: field.zero_() else: @@ -173,7 +173,7 @@ def source(t: float, x: float, y: float, z: float) -> float: ctx.task(lez.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), ): - ez = t.get_arg_as_tensor(0) + ez = t.tensor_arguments() ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) # ------------------------- @@ -222,7 +222,7 @@ def source(t: float, x: float, y: float, z: float) -> float: ctx.task(lez.read()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), ): - ez = t.get_arg_as_tensor(0) + ez = t.tensor_arguments() print(f"{n}\t{ez[cx, cy, cz].item():.6e}") show_slice(ez, plane="xy") pass diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index 9ec86af51c8..e2d38308341 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -37,7 +37,7 @@ def print_values(self): cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) ) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - hvalues = t.get_arg_numba(0) + hvalues = t.numba_arguments() print([v for v in hvalues]) @@ -85,8 +85,7 @@ def __invert__(self): with ctx.task(self.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - da = t.get_arg_numba(0) - dresult = t.get_arg_numba(1) + da, dresult = t.numba_arguments() not_kernel[32, 16, nb_stream](da, dresult) return result @@ -100,9 +99,7 @@ def __or__(self, other): with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - da = t.get_arg_numba(0) - db = t.get_arg_numba(1) - dresult = t.get_arg_numba(2) + da, db, dresult 
= t.numba_arguments() or_kernel[32, 16, nb_stream](da, db, dresult) return result @@ -117,9 +114,7 @@ def __and__(self, other): with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) nb_stream.synchronize() - da = t.get_arg_numba(0) - db = t.get_arg_numba(1) - dresult = t.get_arg_numba(2) + da, db, dresult = t.numba_arguments() and_kernel[32, 16, nb_stream](da, db, dresult) return result @@ -133,8 +128,7 @@ def decrypt(self): with ctx.task(self.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - da = t.get_arg_numba(0) - dresult = t.get_arg_numba(1) + da, dresult = t.numba_arguments() # reverse the toy XOR "encryption" xor_kernel[32, 16, nb_stream](da, dresult, 0x42) diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index bb369b6f250..8adbf5454ed 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -37,7 +37,7 @@ def print_values(self): cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) ) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - hvalues = t.get_arg_numba(0) + hvalues = t.numba_arguments() print([v for v in hvalues]) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 35fb749c68c..a992d26d7d2 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -42,7 +42,7 @@ def test_numba_graph(): lX = ctx.logical_data(X) with ctx.task(rw(lX)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - dX = t.get_arg_numba(0) + dX = t.numba_arguments() scale[32, 64, nb_stream](2.0, dX) ctx.finalize() @@ -61,7 +61,7 @@ def test_numba(): with ctx.task(rw(lX)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - dX = t.get_arg_numba(0) + dX = t.numba_arguments() # dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) scale[32, 64, nb_stream](2.0, dX) @@ -74,14 +74,12 @@ def test_numba(): with ctx.task(read(lX), rw(lZ)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - dX = t.get_arg_numba(0) - dZ = t.get_arg_numba(1) + dX, dZ = t.numba_arguments() axpy[32, 64, nb_stream](2.0, dX, dZ) with ctx.task(read(lY), rw(lZ)) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - dY = t.get_arg_numba(0) - dZ = t.get_arg_numba(1) + dY, dZ = t.numba_arguments() axpy[32, 64, nb_stream](2.0, dY, dZ) ctx.finalize() @@ -217,7 +215,7 @@ def test_numba_places(): with ctx.task(lX.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - dX = t.get_arg_numba(0) + dX = t.numba_arguments() scale[32, 64, nb_stream](2.0, dX) with ctx.task(lX.read(), lY.rw()) as t: From 2eb2ace7025cc3ca6142ce664168c5de0b2c340e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 3 Sep 2025 16:23:26 +0200 Subject: [PATCH 154/221] Remove intermediate structures and use opaque pointers instead --- .../stf/include/cccl/c/experimental/stf/stf.h | 8 +- c/experimental/stf/src/stf.cu | 157 +++++++++++------- 2 files changed, 100 insertions(+), 65 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 357baff4ed8..58f6f3c8492 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -145,7 +145,7 @@ static inline struct stf_data_place make_affine_data_place() return p; } 
-typedef struct stf_ctx_handle_t* stf_ctx_handle; +typedef void* stf_ctx_handle; void stf_ctx_create(stf_ctx_handle* ctx); // TODO stf_ctx_create_with_flags and an enum instead ? @@ -157,7 +157,7 @@ void stf_ctx_finalize(stf_ctx_handle ctx); cudaStream_t stf_fence(stf_ctx_handle ctx); -typedef struct stf_logical_data_handle_t* stf_logical_data_handle; +typedef void* stf_logical_data_handle; void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); @@ -169,7 +169,7 @@ void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld); -typedef struct stf_task_handle_t* stf_task_handle; +typedef void* stf_task_handle; void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t); void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p); @@ -184,7 +184,7 @@ void* stf_task_get(stf_task_handle t, int submitted_index); void stf_task_destroy(stf_task_handle t); void stf_task_enable_capture(stf_task_handle t); -typedef struct stf_cuda_kernel_handle_t* stf_cuda_kernel_handle; +typedef void* stf_cuda_kernel_handle; void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k); void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 60a24710829..a2303a1f5a0 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -16,69 +16,60 @@ using namespace cuda::experimental::stf; extern "C" { -struct stf_ctx_handle_t -{ - context ctx; -}; - -struct stf_logical_data_handle_t -{ - // XXX should we always store a logical_data> instead ? 
- logical_data_untyped ld; -}; - -struct stf_task_handle_t -{ - context::unified_task<> t; -}; - void stf_ctx_create(stf_ctx_handle* ctx) { assert(ctx); - *ctx = new stf_ctx_handle_t{context{}}; + *ctx = new context{}; } void stf_ctx_create_graph(stf_ctx_handle* ctx) { assert(ctx); - *ctx = new stf_ctx_handle_t{context{graph_ctx()}}; + *ctx = new context{graph_ctx()}; } void stf_ctx_finalize(stf_ctx_handle ctx) { - ctx->ctx.finalize(); assert(ctx); - delete ctx; + auto* context_ptr = static_cast(ctx); + context_ptr->finalize(); + delete context_ptr; } cudaStream_t stf_fence(stf_ctx_handle ctx) { assert(ctx); - return ctx->ctx.fence(); + auto* context_ptr = static_cast(ctx); + return context_ptr->fence(); } void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) { - assert(ld); assert(ctx); + assert(ld); - // Create a slice logical data - auto ld_typed = ctx->ctx.logical_data(make_slice((char*) addr, sz)); + auto* context_ptr = static_cast(ctx); + auto ld_typed = context_ptr->logical_data(make_slice((char*) addr, sz)); - // Stored in its untyped version - *ld = new stf_logical_data_handle_t{ld_typed}; + // Store the logical_data_untyped directly as opaque pointer + *ld = new logical_data_untyped{ld_typed}; } void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) { assert(ld); - ld->ld.set_symbol(symbol); + assert(symbol); + + auto* ld_ptr = static_cast(ld); + ld_ptr->set_symbol(symbol); } void stf_logical_data_destroy(stf_logical_data_handle ld) { assert(ld); - delete ld; + + auto* ld_ptr = static_cast(ld); + delete ld_ptr; } void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle* to) @@ -86,8 +77,9 @@ void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_ assert(ctx); assert(to); - auto ld_typed = ctx->ctx.logical_data(shape_of>(length)); - *to = new stf_logical_data_handle_t{ld_typed}; + auto* context_ptr = static_cast(ctx); + auto ld_typed = context_ptr->logical_data(shape_of>(length)); + *to = new logical_data_untyped{ld_typed}; } void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) @@ -95,7 +87,8 @@ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) assert(ctx); assert(ld); - *ld = new stf_logical_data_handle_t{ctx->ctx.token()}; + auto* context_ptr = static_cast(ctx); + *ld = new logical_data_untyped{context_ptr->token()}; } /* Convert the C-API stf_exec_place to a C++ exec_place object */ @@ -136,16 +129,20 @@ data_place to_data_place(stf_data_place* data_p) void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) { - assert(t); assert(ctx); + assert(t); - *t = new stf_task_handle_t{ctx->ctx.task()}; + auto* context_ptr = static_cast(ctx); + *t = new context::unified_task<>{context_ptr->task()}; } void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p) { assert(t); - t->t.set_exec_place(to_exec_place(exec_p)); + assert(exec_p); + + auto* task_ptr = static_cast*>(t); + task_ptr->set_exec_place(to_exec_place(exec_p)); } void stf_task_set_symbol(stf_task_handle t, const char* symbol) @@ -153,7 +150,8 @@ void stf_task_set_symbol(stf_task_handle t, const char* symbol) assert(t); assert(symbol); - t->t.set_symbol(symbol); + auto* task_ptr = static_cast*>(t); + task_ptr->set_symbol(symbol); } void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m) @@ -161,7 +159,9 @@ void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_ assert(t); assert(ld); - 
t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m))); + auto* task_ptr = static_cast*>(t); + auto* ld_ptr = static_cast(ld); + task_ptr->add_deps(task_dep_untyped(*ld_ptr, access_mode(m))); } void stf_task_add_dep_with_dplace( @@ -171,44 +171,58 @@ void stf_task_add_dep_with_dplace( assert(ld); assert(data_p); - t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m), to_data_place(data_p))); + auto* task_ptr = static_cast*>(t); + auto* ld_ptr = static_cast(ld); + task_ptr->add_deps(task_dep_untyped(*ld_ptr, access_mode(m), to_data_place(data_p))); } void* stf_task_get(stf_task_handle t, int index) { assert(t); - auto s = t->t.template get>(index); + + auto* task_ptr = static_cast*>(t); + auto s = task_ptr->template get>(index); return (void*) s.data_handle(); } void stf_task_start(stf_task_handle t) { assert(t); - t->t.start(); + + auto* task_ptr = static_cast*>(t); + task_ptr->start(); } void stf_task_end(stf_task_handle t) { assert(t); - t->t.end(); + + auto* task_ptr = static_cast*>(t); + task_ptr->end(); } void stf_task_enable_capture(stf_task_handle t) { assert(t); - t->t.enable_capture(); + + auto* task_ptr = static_cast*>(t); + task_ptr->enable_capture(); } CUstream stf_task_get_custream(stf_task_handle t) { assert(t); - return (CUstream) t->t.get_stream(); + + auto* task_ptr = static_cast*>(t); + return (CUstream) task_ptr->get_stream(); } void stf_task_destroy(stf_task_handle t) { assert(t); - delete t; + + auto* task_ptr = static_cast*>(t); + delete task_ptr; } /** @@ -228,25 +242,24 @@ void stf_task_destroy(stf_task_handle t) * */ -struct stf_cuda_kernel_handle_t -{ - // return type of ctx.cuda_kernel() - using kernel_type = decltype(::std::declval().cuda_kernel()); - kernel_type k; -}; - void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k) { - assert(k); assert(ctx); + assert(k); - *k = new stf_cuda_kernel_handle_t{ctx->ctx.cuda_kernel()}; + auto* context_ptr = static_cast(ctx); + using kernel_type = decltype(context_ptr->cuda_kernel()); + *k = new kernel_type{context_ptr->cuda_kernel()}; } void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p) { assert(k); - k->k.set_exec_place(to_exec_place(exec_p)); + assert(exec_p); + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + kernel_ptr->set_exec_place(to_exec_place(exec_p)); } void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol) @@ -254,7 +267,9 @@ void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol) assert(k); assert(symbol); - k->k.set_symbol(symbol); + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + kernel_ptr->set_symbol(symbol); } void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m) @@ -262,13 +277,19 @@ void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle l assert(k); assert(ld); - k->k.add_deps(task_dep_untyped(ld->ld, access_mode(m))); + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + auto* ld_ptr = static_cast(ld); + kernel_ptr->add_deps(task_dep_untyped(*ld_ptr, access_mode(m))); } void stf_cuda_kernel_start(stf_cuda_kernel_handle k) { assert(k); - k->k.start(); + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + kernel_ptr->start(); } void stf_cuda_kernel_add_desc_cufunc( @@ -280,28 +301,42 @@ void stf_cuda_kernel_add_desc_cufunc( int 
arg_cnt, const void** args) { + assert(k); + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + cuda_kernel_desc desc; desc.configure_raw(cufunc, grid_dim_, block_dim_, shared_mem_, arg_cnt, args); - - k->k.add_kernel_desc(mv(desc)); + kernel_ptr->add_kernel_desc(mv(desc)); } void* stf_cuda_kernel_get_arg(stf_cuda_kernel_handle k, int index) { - auto s = k->k.template get>(index); + assert(k); + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + auto s = kernel_ptr->template get>(index); return (void*) s.data_handle(); } void stf_cuda_kernel_end(stf_cuda_kernel_handle k) { assert(k); - k->k.end(); + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + kernel_ptr->end(); } void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t) { assert(t); - delete t; + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(t); + delete kernel_ptr; } } // extern "C" From 655706726aa8e1b396b0792a6b20fd59c84f65b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 3 Sep 2025 16:50:22 +0200 Subject: [PATCH 155/221] Automatically generated documentation --- .../stf/include/cccl/c/experimental/stf/stf.h | 974 +++++++++++++++++- 1 file changed, 936 insertions(+), 38 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 58f6f3c8492..7e012830408 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -8,6 +8,61 @@ // //===----------------------------------------------------------------------===// +//! \file stf.h +//! \brief CUDA STF (Sequential Task Flow) C Interface +//! +//! \details +//! This header provides a C interface to the CUDA STF C++ library, enabling +//! task-based parallel programming with automatic data movement and dependency management. +//! +//! The Sequential Task Flow programming model involves defining logical data and +//! submitting tasks that operate on this data. STF automatically deduces dependencies +//! between tasks and orchestrates both computation and data movement to ensure +//! efficient execution with maximum concurrency. +//! +//! \par Key Concepts: +//! - **Logical Data**: Abstract handles for data that may exist in multiple locations +//! - **Tasks**: Operations that consume and produce logical data with specified access modes +//! - **Dependencies**: Automatically inferred from data access patterns (RAW, WAR, WAW) +//! - **Execution Places**: Specify where tasks run (CPU, specific GPU devices) +//! - **Data Places**: Specify where data should be located in memory hierarchy +//! +//! \par Basic Usage Pattern: +//! \code +//! // 1. Create STF context +//! stf_ctx_handle ctx; +//! stf_ctx_create(&ctx); +//! +//! // 2. Create logical data from arrays +//! float X[1024], Y[1024]; +//! stf_logical_data_handle lX, lY; +//! stf_logical_data(ctx, &lX, X, sizeof(X)); +//! stf_logical_data(ctx, &lY, Y, sizeof(Y)); +//! +//! // 3. Create and configure task +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! stf_task_add_dep(task, lX, STF_READ); // X is read-only +//! stf_task_add_dep(task, lY, STF_RW); // Y is read-write +//! +//! // 4. Execute task +//! stf_task_start(task); +//! CUstream stream = stf_task_get_custream(task); +//! float* x_ptr = (float*)stf_task_get(task, 0); +//! 
float* y_ptr = (float*)stf_task_get(task, 1); +//! // ... launch CUDA operations using stream ... +//! stf_task_end(task); +//! +//! // 5. Cleanup +//! stf_ctx_finalize(ctx); +//! stf_task_destroy(task); +//! stf_logical_data_destroy(lX); +//! stf_logical_data_destroy(lY); +//! \endcode +//! +//! \warning This API is experimental and subject to change. +//! Define CCCL_C_EXPERIMENTAL to acknowledge this. + #pragma once #ifndef CCCL_C_EXPERIMENTAL @@ -18,45 +73,76 @@ #include #include -// TODO use CCCL_C_EXTERN_C_BEGIN/CCCL_C_EXTERN_C_END #ifdef __cplusplus extern "C" { #endif +//! \defgroup AccessMode Data Access Modes +//! \brief Specifies how tasks access logical data +//! \{ + +//! \brief Data access mode for task dependencies +//! +//! Specifies how a task will access logical data, which determines +//! synchronization requirements and concurrency opportunities. typedef enum stf_access_mode { - STF_NONE = 0, - STF_READ = 1 << 0, - STF_WRITE = 1 << 1, - STF_RW = STF_READ | STF_WRITE + STF_NONE = 0, //!< No access (invalid) + STF_READ = 1 << 0, //!< Read-only access - allows concurrent readers + STF_WRITE = 1 << 1, //!< Write-only access - requires exclusive access + STF_RW = STF_READ | STF_WRITE //!< Read-write access - requires exclusive access } stf_access_mode; +//! \} + +//! \defgroup ExecPlace Execution Places +//! \brief Specify where tasks should execute +//! \{ + +//! \brief Device execution place configuration typedef struct stf_exec_place_device { - int dev_id; + int dev_id; //!< CUDA device ID (0-based) } stf_exec_place_device; +//! \brief Host execution place configuration typedef struct stf_exec_place_host { - char dummy; /* dummy to keep it standard C which does not allow empty structs */ + char dummy; //!< Dummy field for standard C compatibility } stf_exec_place_host; +//! \brief Execution place type discriminator typedef enum stf_exec_place_kind { - STF_EXEC_PLACE_DEVICE, - STF_EXEC_PLACE_HOST + STF_EXEC_PLACE_DEVICE, //!< Task executes on CUDA device + STF_EXEC_PLACE_HOST //!< Task executes on host (CPU) } stf_exec_place_kind; +//! \brief Execution place specification +//! +//! Tagged union specifying where a task should execute. +//! Use helper functions make_device_place() and make_host_place() to create. typedef struct stf_exec_place { - enum stf_exec_place_kind kind; + enum stf_exec_place_kind kind; //!< Type of execution place union { - stf_exec_place_device device; - stf_exec_place_host host; - } u; + stf_exec_place_device device; //!< Device configuration (when kind == STF_EXEC_PLACE_DEVICE) + stf_exec_place_host host; //!< Host configuration (when kind == STF_EXEC_PLACE_HOST) + } u; //!< Configuration union } stf_exec_place; +//! \brief Create execution place for CUDA device +//! +//! \param dev_id CUDA device index (0-based) +//! \return Execution place configured for specified device +//! +//! \par Example: +//! \code +//! // Execute task on device 1 +//! stf_exec_place place = make_device_place(1); +//! stf_task_set_exec_place(task, &place); +//! \endcode static inline stf_exec_place make_device_place(int dev_id) { stf_exec_place p; @@ -65,6 +151,16 @@ static inline stf_exec_place make_device_place(int dev_id) return p; } +//! \brief Create execution place for host (CPU) +//! +//! \return Execution place configured for host execution +//! +//! \par Example: +//! \code +//! // Execute task on host +//! stf_exec_place place = make_host_place(); +//! stf_task_set_exec_place(task, &place); +//! 
\endcode static inline stf_exec_place make_host_place() { stf_exec_place p; @@ -73,46 +169,74 @@ static inline stf_exec_place make_host_place() return p; } +//! \} + +//! \defgroup DataPlace Data Places +//! \brief Specify where logical data should be located +//! \{ + +//! \brief Device data place configuration typedef struct stf_data_place_device { - int dev_id; + int dev_id; //!< CUDA device ID for data placement } stf_data_place_device; +//! \brief Host data place configuration typedef struct stf_data_place_host { - char dummy; /* dummy to keep it standard C which does not allow empty structs */ + char dummy; //!< Dummy field for standard C compatibility } stf_data_place_host; +//! \brief Managed memory data place configuration typedef struct stf_data_place_managed { - char dummy; /* dummy to keep it standard C which does not allow empty structs */ + char dummy; //!< Dummy field for standard C compatibility } stf_data_place_managed; +//! \brief Affine data place configuration +//! +//! Affine placement means data follows the execution location automatically. typedef struct stf_data_place_affine { - char dummy; /* dummy to keep it standard C which does not allow empty structs */ + char dummy; //!< Dummy field for standard C compatibility } stf_data_place_affine; +//! \brief Data place type discriminator typedef enum stf_data_place_kind { - STF_DATA_PLACE_DEVICE, - STF_DATA_PLACE_HOST, - STF_DATA_PLACE_MANAGED, - STF_DATA_PLACE_AFFINE + STF_DATA_PLACE_DEVICE, //!< Data on specific device memory + STF_DATA_PLACE_HOST, //!< Data on host (CPU) memory + STF_DATA_PLACE_MANAGED, //!< Data in CUDA managed (unified) memory + STF_DATA_PLACE_AFFINE //!< Data follows execution place (default) } stf_data_place_kind; +//! \brief Data placement specification +//! +//! Tagged union specifying where logical data should be located. +//! Use helper functions to create (make_device_data_place(), etc.). typedef struct stf_data_place { - enum stf_data_place_kind kind; + enum stf_data_place_kind kind; //!< Type of data placement union { - stf_data_place_device device; - stf_data_place_host host; - stf_data_place_managed managed; - stf_data_place_affine affine; - } u; + stf_data_place_device device; //!< Device placement configuration + stf_data_place_host host; //!< Host placement configuration + stf_data_place_managed managed; //!< Managed memory configuration + stf_data_place_affine affine; //!< Affine placement configuration + } u; //!< Configuration union } stf_data_place; +//! \brief Create data place for specific CUDA device +//! +//! \param dev_id CUDA device index (0-based) +//! \return Data place configured for device memory +//! +//! \par Example: +//! \code +//! // Force data to device 1 even if task runs elsewhere +//! stf_data_place dplace = make_device_data_place(1); +//! stf_task_add_dep_with_dplace(task, data, STF_READ, &dplace); +//! \endcode static inline stf_data_place make_device_data_place(int dev_id) { stf_data_place p; @@ -121,6 +245,16 @@ static inline stf_data_place make_device_data_place(int dev_id) return p; } +//! \brief Create data place for host memory +//! +//! \return Data place configured for host (CPU) memory +//! +//! \par Example: +//! \code +//! // Keep data on host even for device tasks (sparse access) +//! stf_data_place dplace = make_host_data_place(); +//! stf_task_add_dep_with_dplace(task, data, STF_READ, &dplace); +//! 
\endcode static inline struct stf_data_place make_host_data_place() { stf_data_place p; @@ -129,6 +263,18 @@ static inline struct stf_data_place make_host_data_place() return p; } +//! +//! \brief Create data place for CUDA managed memory +//! +//! \return Data place configured for managed (unified) memory +//! +//! \par Example: +//! \code +//! // Use managed memory for flexible access patterns +//! stf_data_place dplace = make_managed_data_place(); +//! stf_task_add_dep_with_dplace(task, data, STF_RW, &dplace); +//! \endcode + static inline struct stf_data_place make_managed_data_place() { stf_data_place p; @@ -137,6 +283,18 @@ static inline struct stf_data_place make_managed_data_place() return p; } +//! +//! \brief Create affine data place (follows execution location) +//! +//! \return Data place configured for affine placement (default behavior) +//! +//! \par Example: +//! \code +//! // Explicitly specify default behavior +//! stf_data_place dplace = make_affine_data_place(); +//! stf_task_add_dep_with_dplace(task, data, STF_RW, &dplace); +//! \endcode + static inline struct stf_data_place make_affine_data_place() { stf_data_place p; @@ -145,53 +303,708 @@ static inline struct stf_data_place make_affine_data_place() return p; } +//! \} + +//! \defgroup Handles Opaque Handles +//! \brief Opaque handle types for STF objects +//! \{ + +//! +//! \brief Opaque handle for STF context +//! +//! Context stores the state of the STF library and serves as entry point for all API calls. +//! Must be created with stf_ctx_create() or stf_ctx_create_graph() and destroyed with stf_ctx_finalize(). + typedef void* stf_ctx_handle; +//! +//! \brief Opaque handle for logical data +//! +//! Represents abstract data that may exist in multiple memory locations. +//! Created with stf_logical_data() or stf_logical_data_empty() and destroyed with stf_logical_data_destroy(). + +typedef void* stf_logical_data_handle; + +//! +//! \brief Opaque handle for task +//! +//! Represents a computational task that operates on logical data. +//! Created with stf_task_create() and destroyed with stf_task_destroy(). + +typedef void* stf_task_handle; + +//! +//! \brief Opaque handle for CUDA kernel task +//! +//! Specialized task optimized for CUDA kernel execution. +//! Created with stf_cuda_kernel_create() and destroyed with stf_cuda_kernel_destroy(). + +typedef void* stf_cuda_kernel_handle; + +//! \} + +//! \defgroup Context Context Management +//! \brief Create, configure, and finalize STF contexts +//! \{ + +//! +//! \brief Create STF context with stream backend +//! +//! Creates a new STF context using the default stream-based backend. +//! Tasks are executed eagerly using CUDA streams and events. +//! +//! \param[out] ctx Pointer to receive context handle +//! +//! \pre ctx must not be NULL +//! \post *ctx contains valid context handle that must be finalized with stf_ctx_finalize() +//! +//! \par Example: +//! \code +//! stf_ctx_handle ctx; +//! stf_ctx_create(&ctx); +//! // ... use context ... +//! stf_ctx_finalize(ctx); +//! \endcode +//! +//! \see stf_ctx_create_graph(), stf_ctx_finalize() + void stf_ctx_create(stf_ctx_handle* ctx); -// TODO stf_ctx_create_with_flags and an enum instead ? + +//! +//! \brief Create STF context with graph backend +//! +//! Creates a new STF context using the CUDA graph backend. +//! Tasks are captured into CUDA graphs and launched when needed, +//! potentially providing better performance for repeated patterns. +//! +//! \param[out] ctx Pointer to receive context handle +//! +//! 
\pre ctx must not be NULL +//! \post *ctx contains valid context handle that must be finalized with stf_ctx_finalize() +//! +//! \note Graph backend has restrictions on stream synchronization within tasks +//! +//! \par Example: +//! \code +//! stf_ctx_handle ctx; +//! stf_ctx_create_graph(&ctx); +//! // ... use context ... +//! stf_ctx_finalize(ctx); +//! \endcode +//! +//! \see stf_ctx_create(), stf_ctx_finalize() + void stf_ctx_create_graph(stf_ctx_handle* ctx); + +//! +//! \brief Finalize STF context +//! +//! Waits for all pending operations to complete, performs write-back +//! of modified data to host, and releases all associated resources. +//! +//! \param ctx Context handle to finalize +//! +//! \pre ctx must be valid context handle +//! \post All pending operations completed, resources released, ctx becomes invalid +//! +//! \note This function blocks until all asynchronous operations complete +//! +//! \par Example: +//! \code +//! stf_ctx_handle ctx; +//! stf_ctx_create(&ctx); +//! // ... submit tasks ... +//! stf_ctx_finalize(ctx); // Blocks until completion +//! \endcode +//! +//! \see stf_ctx_create(), stf_ctx_create_graph(), stf_fence() + void stf_ctx_finalize(stf_ctx_handle ctx); -// TODO stf_ctx_set_mode() + define enum with GRAPH, STREAM, ... -// TODO stf_ctx_is_graph() +//! +//! \brief Get synchronization fence for context +//! +//! Returns a CUDA stream that will be signaled when all pending +//! operations in the context complete. Provides non-blocking +//! alternative to stf_ctx_finalize() for synchronization queries. +//! +//! \param ctx Context handle +//! \return CUDA stream for synchronization +//! +//! \pre ctx must be valid context handle +//! +//! \par Example: +//! \code +//! stf_ctx_handle ctx; +//! stf_ctx_create(&ctx); +//! // ... submit tasks ... +//! +//! cudaStream_t fence = stf_fence(ctx); +//! cudaStreamSynchronize(fence); // Wait for completion +//! stf_ctx_finalize(ctx); +//! \endcode +//! +//! \see stf_ctx_finalize() cudaStream_t stf_fence(stf_ctx_handle ctx); -typedef void* stf_logical_data_handle; +//! \} + +//! \defgroup LogicalData Logical Data Management +//! \brief Create and manage abstract data handles +//! \{ + +//! +//! \brief Create logical data from existing memory buffer +//! +//! Creates logical data handle from an existing host memory buffer. +//! STF takes ownership of data management during task execution. +//! +//! \param ctx Context handle +//! \param[out] ld Pointer to receive logical data handle +//! \param addr Pointer to existing data buffer +//! \param sz Size of data in bytes +//! +//! \pre ctx must be valid context handle +//! \pre ld must not be NULL +//! \pre addr must not be NULL +//! \pre sz must be greater than 0 +//! \post *ld contains valid logical data handle +//! +//! \note Original data pointer should not be accessed during task execution +//! \note Data will be written back when logical data is destroyed or context finalized +//! +//! \par Example: +//! \code +//! float data[1024]; +//! stf_logical_data_handle ld; +//! stf_logical_data(ctx, &ld, data, sizeof(data)); +//! // ... use in tasks ... +//! stf_logical_data_destroy(ld); +//! \endcode +//! +//! \see stf_logical_data_empty(), stf_logical_data_destroy() void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); + +//! +//! \brief Set symbolic name for logical data +//! +//! Associates a human-readable name with logical data for debugging +//! and task graph visualization. +//! +//! \param ld Logical data handle +//! 
\param symbol Null-terminated string name +//! +//! \pre ld must be valid logical data handle +//! \pre symbol must not be NULL +//! +//! \note Symbol appears in DOT graph output when CUDASTF_DOT_FILE is set +//! +//! \par Example: +//! \code +//! stf_logical_data_handle ld; +//! stf_logical_data(ctx, &ld, data, size); +//! stf_logical_data_set_symbol(ld, "input_matrix"); +//! \endcode +//! +//! \see stf_task_set_symbol() + void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); + +//! +//! \brief Destroy logical data handle +//! +//! Destroys logical data handle and releases associated resources. +//! Triggers write-back to host if data was modified. +//! +//! \param ld Logical data handle to destroy +//! +//! \pre ld must be valid logical data handle +//! \post ld becomes invalid, resources released +//! +//! \note Must be called for every created logical data handle +//! +//! \par Example: +//! \code +//! stf_logical_data_handle ld; +//! stf_logical_data(ctx, &ld, data, size); +//! // ... use in tasks ... +//! stf_logical_data_destroy(ld); // Cleanup +//! \endcode +//! +//! \see stf_logical_data(), stf_logical_data_empty() + void stf_logical_data_destroy(stf_logical_data_handle ld); + +//! +//! \brief Create empty logical data (temporary) +//! +//! Creates logical data of specified size without backing host memory. +//! Useful for temporary buffers in multi-stage computations. +//! +//! \param ctx Context handle +//! \param length Size in bytes +//! \param[out] to Pointer to receive logical data handle +//! +//! \pre ctx must be valid context handle +//! \pre length must be greater than 0 +//! \pre to must not be NULL +//! \post *to contains valid logical data handle +//! +//! \note First access must be write-only (STF_WRITE) +//! \note No write-back occurs since there's no host backing +//! +//! \par Example: +//! \code +//! stf_logical_data_handle temp; +//! stf_logical_data_empty(ctx, 1024 * sizeof(float), &temp); +//! +//! // First access must be write-only +//! stf_task_add_dep(task, temp, STF_WRITE); +//! \endcode +//! +//! \see stf_logical_data(), stf_logical_data_destroy() + void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle* to); -// TODO -// void stf_logical_data_wait(stf_logical_data_handle ld); +//! +//! \brief Create synchronization token +//! +//! Creates a logical data handle for synchronization purposes only. +//! Contains no actual data but can be used to enforce execution order. +//! +//! \param ctx Context handle +//! \param[out] ld Pointer to receive token handle +//! +//! \pre ctx must be valid context handle +//! \pre ld must not be NULL +//! \post *ld contains valid token handle +//! +//! \note More efficient than using dummy data for synchronization +//! \note Can be accessed with any access mode +//! +//! \par Example: +//! \code +//! stf_logical_data_handle sync_token; +//! stf_token(ctx, &sync_token); +//! +//! // Task 1 signals completion +//! stf_task_add_dep(task1, sync_token, STF_WRITE); +//! +//! // Task 2 waits for task1 +//! stf_task_add_dep(task2, sync_token, STF_READ); +//! \endcode +//! +//! \see stf_logical_data(), stf_logical_data_destroy() void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld); -typedef void* stf_task_handle; +//! \} + +//! \defgroup TaskManagement Task Management +//! \brief Create, configure, and execute computational tasks +//! \{ + +//! +//! \brief Create new task +//! +//! Creates a new task within the specified context. Task is created +//! 
but not configured or executed. Use other stf_task_* functions +//! to configure execution place, add dependencies, and execute. +//! +//! \param ctx Context handle +//! \param[out] t Pointer to receive task handle +//! +//! \pre ctx must be valid context handle +//! \pre t must not be NULL +//! \post *t contains valid task handle +//! +//! \par Example: +//! \code +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! // ... configure task ... +//! stf_task_destroy(task); +//! \endcode +//! +//! \see stf_task_destroy(), stf_task_set_exec_place(), stf_task_add_dep() void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t); + +//! +//! \brief Set task execution place +//! +//! Specifies where the task should execute (device or host). +//! If not called, defaults to current device. +//! +//! \param t Task handle +//! \param exec_p Pointer to execution place specification +//! +//! \pre t must be valid task handle +//! \pre exec_p must not be NULL +//! \pre Must be called before stf_task_start() +//! +//! \par Example: +//! \code +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! +//! // Execute on device 1 +//! stf_exec_place place = make_device_place(1); +//! stf_task_set_exec_place(task, &place); +//! \endcode +//! +//! \see make_device_place(), make_host_place() + void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p); + +//! +//! \brief Set symbolic name for task +//! +//! Associates a human-readable name with task for debugging +//! and task graph visualization. +//! +//! \param t Task handle +//! \param symbol Null-terminated string name +//! +//! \pre t must be valid task handle +//! \pre symbol must not be NULL +//! +//! \note Symbol appears in DOT graph output when CUDASTF_DOT_FILE is set +//! +//! \par Example: +//! \code +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! stf_task_set_symbol(task, "matrix_multiply"); +//! \endcode +//! +//! \see stf_logical_data_set_symbol() + void stf_task_set_symbol(stf_task_handle t, const char* symbol); + +//! +//! \brief Add data dependency to task +//! +//! Adds a data dependency with specified access mode. Order of calls +//! determines index for stf_task_get(). Dependencies determine +//! automatic task synchronization. +//! +//! \param t Task handle +//! \param ld Logical data handle +//! \param m Access mode (STF_READ, STF_WRITE, STF_RW) +//! +//! \pre t must be valid task handle +//! \pre ld must be valid logical data handle +//! \pre m must be valid access mode +//! +//! \par Example: +//! \code +//! stf_task_add_dep(task, input_data, STF_READ); // Index 0 +//! stf_task_add_dep(task, output_data, STF_WRITE); // Index 1 +//! stf_task_add_dep(task, temp_data, STF_RW); // Index 2 +//! \endcode +//! +//! \see stf_task_add_dep_with_dplace(), stf_task_get() + void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m); + +//! +//! \brief Add data dependency with explicit data placement +//! +//! Adds data dependency with specified access mode and explicit +//! data placement. Overrides default affine placement. +//! +//! \param t Task handle +//! \param ld Logical data handle +//! \param m Access mode (STF_READ, STF_WRITE, STF_RW) +//! \param data_p Pointer to data place specification +//! +//! \pre t must be valid task handle +//! \pre ld must be valid logical data handle +//! \pre m must be valid access mode +//! \pre data_p must not be NULL +//! +//! \par Example: +//! \code +//! // Force data to device 0 even if task runs elsewhere +//! 
stf_data_place dplace = make_device_data_place(0); +//! stf_task_add_dep_with_dplace(task, ld, STF_READ, &dplace); +//! \endcode +//! +//! \see stf_task_add_dep(), make_device_data_place(), make_host_data_place() + void stf_task_add_dep_with_dplace( stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, stf_data_place* data_p); + +//! +//! \brief Begin task execution +//! +//! Starts task execution. After this call, use stf_task_get_custream() +//! and stf_task_get() to access CUDA stream and data pointers. +//! +//! \param t Task handle +//! +//! \pre t must be valid task handle +//! \pre Task dependencies must already be configured +//! \post Task is executing, stream and data available +//! +//! \par Example: +//! \code +//! // Configure task first +//! stf_task_add_dep(task, data, STF_RW); +//! +//! // Start execution +//! stf_task_start(task); +//! +//! // Now can access stream and data +//! CUstream stream = stf_task_get_custream(task); +//! float* ptr = (float*)stf_task_get(task, 0); +//! \endcode +//! +//! \see stf_task_end(), stf_task_get_custream(), stf_task_get() + void stf_task_start(stf_task_handle t); + +//! +//! \brief End task execution +//! +//! Ends task execution. Call after all CUDA operations are +//! submitted to the task stream. +//! +//! \param t Task handle +//! +//! \pre t must be valid task handle +//! \pre stf_task_start() must have been called +//! \post Task execution ended, may continue asynchronously +//! +//! \par Example: +//! \code +//! stf_task_start(task); +//! CUstream stream = stf_task_get_custream(task); +//! +//! // Launch operations +//! my_kernel<<>>(args...); +//! +//! stf_task_end(task); // Operations may still be running +//! \endcode +//! +//! \see stf_task_start() + void stf_task_end(stf_task_handle t); + +//! +//! \brief Get CUDA stream for task +//! +//! Returns CUDA stream associated with the task. All CUDA operations +//! within task must use this stream for proper synchronization. +//! +//! \param t Task handle +//! \return CUDA stream for launching operations +//! +//! \pre t must be valid task handle +//! \pre stf_task_start() must have been called +//! +//! \par Example: +//! \code +//! stf_task_start(task); +//! CUstream stream = stf_task_get_custream(task); +//! +//! // Launch kernel using this stream +//! my_kernel<<>>(args...); +//! \endcode +//! +//! \see stf_task_start(), stf_task_get() + CUstream stf_task_get_custream(stf_task_handle t); + +//! +//! \brief Get data pointer for task dependency +//! +//! Returns pointer to logical data instance for specified dependency. +//! Index corresponds to order of stf_task_add_dep() calls. +//! +//! \param t Task handle +//! \param submitted_index Dependency index (0-based) +//! \return Pointer to data (cast to appropriate type) +//! +//! \pre t must be valid task handle +//! \pre stf_task_start() must have been called +//! \pre submitted_index must be valid dependency index +//! \post Pointer valid until stf_task_end() +//! +//! \par Example: +//! \code +//! // Dependencies added in this order: +//! stf_task_add_dep(task, input, STF_READ); // Index 0 +//! stf_task_add_dep(task, output, STF_WRITE); // Index 1 +//! +//! stf_task_start(task); +//! +//! // Get data pointers +//! const float* in = (const float*)stf_task_get(task, 0); +//! float* out = (float*)stf_task_get(task, 1); +//! \endcode +//! +//! \see stf_task_add_dep(), stf_task_start() + void* stf_task_get(stf_task_handle t, int submitted_index); + +//! +//! \brief Destroy task handle +//! +//! 
Destroys task handle and releases associated resources. +//! Task should be completed before destruction. +//! +//! \param t Task handle to destroy +//! +//! \pre t must be valid task handle +//! \post t becomes invalid, resources released +//! +//! \note Must be called for every created task +//! +//! \par Example: +//! \code +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! // ... configure and execute task ... +//! stf_task_destroy(task); +//! \endcode +//! +//! \see stf_task_create() + void stf_task_destroy(stf_task_handle t); + +//! +//! \brief Enable graph capture for task (advanced) +//! +//! Enables graph capture optimization for the task. +//! Advanced feature typically not needed for basic usage. +//! +//! \param t Task handle +//! +//! \pre t must be valid task handle +//! +//! \note Used internally for CUDA graph backend optimization + void stf_task_enable_capture(stf_task_handle t); -typedef void* stf_cuda_kernel_handle; +//! \} + +//! \defgroup CUDAKernel CUDA Kernel Interface +//! \brief Optimized interface for CUDA kernel execution +//! \{ + +//! +//! \brief Create CUDA kernel task +//! +//! Creates a specialized task optimized for CUDA kernel execution. +//! More efficient than generic tasks for repeated kernel launches, +//! especially with CUDA graph backend. +//! +//! \param ctx Context handle +//! \param[out] k Pointer to receive kernel handle +//! +//! \pre ctx must be valid context handle +//! \pre k must not be NULL +//! \post *k contains valid kernel handle +//! +//! \par Example: +//! \code +//! stf_cuda_kernel_handle kernel; +//! stf_cuda_kernel_create(ctx, &kernel); +//! // ... configure kernel ... +//! stf_cuda_kernel_destroy(kernel); +//! \endcode +//! +//! \see stf_cuda_kernel_destroy(), stf_task_create() void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k); + +//! +//! \brief Set kernel execution place +//! +//! Specifies where the CUDA kernel should execute. +//! +//! \param k Kernel handle +//! \param exec_p Pointer to execution place specification +//! +//! \pre k must be valid kernel handle +//! \pre exec_p must not be NULL +//! +//! \see make_device_place(), stf_task_set_exec_place() + void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p); + +//! +//! \brief Set symbolic name for kernel +//! +//! Associates human-readable name with kernel for debugging. +//! +//! \param k Kernel handle +//! \param symbol Null-terminated string name +//! +//! \pre k must be valid kernel handle +//! \pre symbol must not be NULL +//! +//! \see stf_task_set_symbol(), stf_logical_data_set_symbol() + void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol); + +//! +//! \brief Add data dependency to kernel +//! +//! Adds data dependency with specified access mode for kernel execution. +//! +//! \param k Kernel handle +//! \param ld Logical data handle +//! \param m Access mode (STF_READ, STF_WRITE, STF_RW) +//! +//! \pre k must be valid kernel handle +//! \pre ld must be valid logical data handle +//! \pre m must be valid access mode +//! +//! \see stf_task_add_dep() + void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); + +//! +//! \brief Start kernel execution +//! +//! Begins kernel execution phase. After this, add kernel descriptions +//! with stf_cuda_kernel_add_desc(). +//! +//! \param k Kernel handle +//! +//! \pre k must be valid kernel handle +//! \pre Dependencies must already be configured +//! +//! 
\see stf_cuda_kernel_add_desc(), stf_cuda_kernel_end() + void stf_cuda_kernel_start(stf_cuda_kernel_handle k); +//! +//! \brief Add CUDA kernel launch description (driver API) +//! +//! Adds kernel launch specification using CUDA driver API function handle. +//! This is the low-level interface used internally. +//! +//! \param k Kernel handle +//! \param cufunc CUDA driver API function handle +//! \param grid_dim_ CUDA grid dimensions +//! \param block_dim_ CUDA block dimensions +//! \param shared_mem_ Shared memory size in bytes +//! \param arg_cnt Number of kernel arguments +//! \param args Array of pointers to kernel arguments +//! +//! \pre k must be valid kernel handle +//! \pre stf_cuda_kernel_start() must have been called +//! \pre cufunc must be valid CUfunction +//! \pre args must contain arg_cnt valid argument pointers +//! +//! \see stf_cuda_kernel_add_desc() + void stf_cuda_kernel_add_desc_cufunc( stf_cuda_kernel_handle k, CUfunction cufunc, @@ -201,8 +1014,46 @@ void stf_cuda_kernel_add_desc_cufunc( int arg_cnt, const void** args); -/* Convert CUDA kernel address to CUfunction because we may use them from a - * shared library where this would be invalid in the runtime API. */ +//! +//! \brief Add CUDA kernel launch description +//! +//! Adds kernel launch specification using runtime API function pointer. +//! Automatically converts to driver API internally. +//! +//! \param k Kernel handle +//! \param func Pointer to __global__ function +//! \param grid_dim_ CUDA grid dimensions +//! \param block_dim_ CUDA block dimensions +//! \param shared_mem_ Shared memory size in bytes +//! \param arg_cnt Number of kernel arguments +//! \param args Array of pointers to kernel arguments +//! +//! \pre k must be valid kernel handle +//! \pre stf_cuda_kernel_start() must have been called +//! \pre func must be valid __global__ function pointer +//! \pre args must contain arg_cnt valid argument pointers +//! +//! \note Converts function pointer to CUfunction automatically +//! +//! \par Example: +//! \code +//! // Kernel: __global__ void axpy(float alpha, float* x, float* y) +//! stf_cuda_kernel_start(kernel); +//! +//! // Prepare arguments +//! float alpha = 2.0f; +//! float* d_x = (float*)stf_cuda_kernel_get_arg(kernel, 0); +//! float* d_y = (float*)stf_cuda_kernel_get_arg(kernel, 1); +//! const void* args[] = {&alpha, &d_x, &d_y}; +//! +//! // Launch kernel +//! stf_cuda_kernel_add_desc(kernel, (void*)axpy, +//! dim3(16), dim3(128), 0, 3, args); +//! stf_cuda_kernel_end(kernel); +//! \endcode +//! +//! \see stf_cuda_kernel_add_desc_cufunc(), stf_cuda_kernel_get_arg() + static inline void stf_cuda_kernel_add_desc( stf_cuda_kernel_handle k, const void* func, @@ -219,9 +1070,56 @@ static inline void stf_cuda_kernel_add_desc( stf_cuda_kernel_add_desc_cufunc(k, cufunc, grid_dim_, block_dim_, shared_mem_, arg_cnt, args); } +//! +//! \brief Get kernel argument data pointer +//! +//! Returns pointer to logical data for use as kernel argument. +//! Index corresponds to order of stf_cuda_kernel_add_dep() calls. +//! +//! \param k Kernel handle +//! \param index Dependency index (0-based) +//! \return Pointer to data for kernel argument +//! +//! \pre k must be valid kernel handle +//! \pre stf_cuda_kernel_start() must have been called +//! \pre index must be valid dependency index +//! +//! \see stf_cuda_kernel_add_desc(), stf_task_get() + void* stf_cuda_kernel_get_arg(stf_cuda_kernel_handle k, int index); + +//! +//! \brief End kernel execution +//! +//! Ends kernel execution phase. 
Call after all kernel descriptions +//! are added with stf_cuda_kernel_add_desc(). +//! +//! \param k Kernel handle +//! +//! \pre k must be valid kernel handle +//! \pre stf_cuda_kernel_start() must have been called +//! +//! \see stf_cuda_kernel_start() + void stf_cuda_kernel_end(stf_cuda_kernel_handle k); -void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t); + +//! +//! \brief Destroy kernel handle +//! +//! Destroys kernel handle and releases associated resources. +//! +//! \param k Kernel handle to destroy +//! +//! \pre k must be valid kernel handle +//! \post k becomes invalid, resources released +//! +//! \note Must be called for every created kernel +//! +//! \see stf_cuda_kernel_create() + +void stf_cuda_kernel_destroy(stf_cuda_kernel_handle k); + +//! \} #ifdef __cplusplus } From 60266ff4f299a915eb565ec11287ab4a06fbb99d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 3 Sep 2025 16:59:22 +0200 Subject: [PATCH 156/221] Better implementation of the help to convert C places to the C++ API, and define invalid values --- c/experimental/stf/src/stf.cu | 44 +++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index a2303a1f5a0..14a03de7f5a 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -94,13 +94,20 @@ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) /* Convert the C-API stf_exec_place to a C++ exec_place object */ exec_place to_exec_place(stf_exec_place* exec_p) { - if (exec_p->kind == STF_EXEC_PLACE_HOST) + assert(exec_p); + + switch (exec_p->kind) { - return exec_place::host(); - } + case STF_EXEC_PLACE_HOST: + return exec_place::host(); - assert(exec_p->kind == STF_EXEC_PLACE_DEVICE); - return exec_place::device(exec_p->u.device.dev_id); + case STF_EXEC_PLACE_DEVICE: + return exec_place::device(exec_p->u.device.dev_id); + + default: + assert(false && "Invalid execution place kind"); + return exec_place{}; // invalid exec_place + } } /* Convert the C-API stf_data_place to a C++ data_place object */ @@ -108,23 +115,24 @@ data_place to_data_place(stf_data_place* data_p) { assert(data_p); - if (data_p->kind == STF_DATA_PLACE_HOST) + switch (data_p->kind) { - return data_place::host(); - } + case STF_DATA_PLACE_HOST: + return data_place::host(); - if (data_p->kind == STF_DATA_PLACE_MANAGED) - { - return data_place::managed(); - } + case STF_DATA_PLACE_MANAGED: + return data_place::managed(); - if (data_p->kind == STF_DATA_PLACE_AFFINE) - { - return data_place::affine(); - } + case STF_DATA_PLACE_AFFINE: + return data_place::affine(); + + case STF_DATA_PLACE_DEVICE: + return data_place::device(data_p->u.device.dev_id); - assert(data_p->kind == STF_DATA_PLACE_DEVICE); - return data_place::device(data_p->u.device.dev_id); + default: + assert(false && "Invalid data place kind"); + return data_place::invalid(); // invalid data_place + } } void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) From 59f198304c5125765b9b0e097b54bb13aa027180 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 3 Sep 2025 17:04:15 +0200 Subject: [PATCH 157/221] Tell where to find cudax, and remove unnecessary libs --- c/experimental/stf/CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 85f9bdb4c34..b44b0cbbcca 100644 --- a/c/experimental/stf/CMakeLists.txt +++ 
b/c/experimental/stf/CMakeLists.txt @@ -27,13 +27,16 @@ if (CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY) ) endif() +find_package(cudax REQUIRED CONFIG + NO_DEFAULT_PATH # Only check the explicit path in HINTS: + HINTS "${CCCL_SOURCE_DIR}/lib/cmake/cudax/" +) + find_package(CUDAToolkit REQUIRED) set_target_properties(cccl.c.experimental.stf PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) target_compile_definitions(cccl.c.experimental.stf PUBLIC CCCL_C_EXPERIMENTAL=1) target_link_libraries(cccl.c.experimental.stf PRIVATE CUDA::cudart_static - CUDA::nvrtc - CUDA::nvJitLink CUDA::cuda_driver CCCL::cudax ) From 97dd6f7057597d22cccf3fd92d2107b218268e61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 3 Sep 2025 21:40:26 +0200 Subject: [PATCH 158/221] CCCL_ENABLE_C enables c/parallel, CCCL_ENABLE_C_EXPERIMENTAL_STF enables c/experimental/stf/ --- CMakeLists.txt | 7 +++++++ CMakePresets.json | 6 +++++- c/CMakeLists.txt | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0830d733b32..034191f0d5c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,6 +40,7 @@ option(CCCL_ENABLE_THRUST "Enable the Thrust developer build." OFF) option(CCCL_ENABLE_TESTING "Enable CUDA C++ Core Library tests." OFF) option(CCCL_ENABLE_EXAMPLES "Enable CUDA C++ Core Library examples." OFF) option(CCCL_ENABLE_C "Enable CUDA C Core Library." OFF) +option(CCCL_ENABLE_C_EXPERIMENTAL_STF "Enable CUDA C CUDASTF Library." OFF) if ("NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") set(CCCL_ENABLE_BENCHMARKS OFF) @@ -86,6 +87,12 @@ if (CCCL_ENABLE_C) add_subdirectory(c) endif() +if (CCCL_ENABLE_C_EXPERIMENTAL_STF) + add_subdirectory(c/experimental/stf) +endif() + + + if (CCCL_ENABLE_TESTING) add_subdirectory(test) endif() diff --git a/CMakePresets.json b/CMakePresets.json index 3b86d9c813d..376f90d4f12 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -22,6 +22,7 @@ "CCCL_ENABLE_TESTING": false, "CCCL_ENABLE_EXAMPLES": false, "CCCL_ENABLE_C": false, + "CCCL_ENABLE_C_EXPERIMENTAL_STF": false, "libcudacxx_ENABLE_INSTALL_RULES": true, "CUB_ENABLE_INSTALL_RULES": true, "Thrust_ENABLE_INSTALL_RULES": true, @@ -64,6 +65,7 @@ "CCCL_ENABLE_EXAMPLES": true, "CCCL_ENABLE_BENCHMARKS": true, "CCCL_ENABLE_C": true, + "CCCL_ENABLE_C_EXPERIMENTAL_STF": true, "CCCL_IGNORE_DEPRECATED_CPP_DIALECT": true, "LIBCUDACXX_ENABLE_LIBCUDACXX_TESTS": true, "CUB_ENABLE_TESTING": true, @@ -272,6 +274,7 @@ "inherits": "base", "cacheVariables": { "CCCL_ENABLE_C": true, + "CCCL_ENABLE_C_EXPERIMENTAL_STF": false, "CCCL_C_Parallel_ENABLE_TESTING": true, "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true, "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": false @@ -282,7 +285,8 @@ "displayName": "CCCL C CUDASTF Library", "inherits": "base", "cacheVariables": { - "CCCL_ENABLE_C": true, + "CCCL_ENABLE_C": false, + "CCCL_ENABLE_C_EXPERIMENTAL_STF": true, "CCCL_C_Parallel_ENABLE_TESTING": false, "CCCL_C_Parallel_ENABLE_HEADER_TESTING": false, "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index 364494da7a0..af1cc1a4234 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -1,2 +1,2 @@ add_subdirectory(parallel) -add_subdirectory(experimental/stf/) +# add_subdirectory(experimental/stf/) From 1610f0b20af2f11512af4dc8b8de9512f5b528a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 3 Sep 2025 22:03:56 +0200 Subject: [PATCH 159/221] Remove unnecessary definitions --- CMakePresets.json | 5 +---- 
c/experimental/stf/test/CMakeLists.txt | 7 ------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index 376f90d4f12..a675873c21d 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -276,8 +276,7 @@ "CCCL_ENABLE_C": true, "CCCL_ENABLE_C_EXPERIMENTAL_STF": false, "CCCL_C_Parallel_ENABLE_TESTING": true, - "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true, - "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": false + "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true } }, { @@ -287,8 +286,6 @@ "cacheVariables": { "CCCL_ENABLE_C": false, "CCCL_ENABLE_C_EXPERIMENTAL_STF": true, - "CCCL_C_Parallel_ENABLE_TESTING": false, - "CCCL_C_Parallel_ENABLE_HEADER_TESTING": false, "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true } }, diff --git a/c/experimental/stf/test/CMakeLists.txt b/c/experimental/stf/test/CMakeLists.txt index f5613253a81..5776747de79 100644 --- a/c/experimental/stf/test/CMakeLists.txt +++ b/c/experimental/stf/test/CMakeLists.txt @@ -18,13 +18,6 @@ function(cccl_c_experimental_stf_add_test target_name_var source) CCCL::cudax ) - target_compile_definitions(${target_name} PRIVATE - TEST_CUB_PATH="-I${CCCL_SOURCE_DIR}/cub" - TEST_THRUST_PATH="-I${CCCL_SOURCE_DIR}/thrust" - TEST_LIBCUDACXX_PATH="-I${CCCL_SOURCE_DIR}/libcudacxx/include" - TEST_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}" - ) - add_test(NAME ${target_name} COMMAND ${target_name}) endfunction() From 90a8d20661d3b2ba0acc2855a33b1c1ef054a791 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Fri, 5 Sep 2025 21:37:55 +0200 Subject: [PATCH 160/221] use more consistent option names --- CMakeLists.txt | 8 ++------ CMakePresets.json | 10 +++++----- c/CMakeLists.txt | 9 +++++++-- python/cuda_cccl/CMakeLists.txt | 2 +- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 034191f0d5c..5467357fabb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,7 @@ option(CCCL_ENABLE_CUB "Enable the CUB developer build." OFF) option(CCCL_ENABLE_THRUST "Enable the Thrust developer build." OFF) option(CCCL_ENABLE_TESTING "Enable CUDA C++ Core Library tests." OFF) option(CCCL_ENABLE_EXAMPLES "Enable CUDA C++ Core Library examples." OFF) -option(CCCL_ENABLE_C "Enable CUDA C Core Library." OFF) +option(CCCL_ENABLE_C_PARALLEL "Enable CUDA C Parallel Library." OFF) option(CCCL_ENABLE_C_EXPERIMENTAL_STF "Enable CUDA C CUDASTF Library." 
OFF) if ("NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") @@ -83,14 +83,10 @@ if (CCCL_ENABLE_UNSTABLE) add_subdirectory(cudax) endif() -if (CCCL_ENABLE_C) +if (CCCL_ENABLE_C_PARALLEL OR CCCL_ENABLE_C_EXPERIMENTAL_STF) add_subdirectory(c) endif() -if (CCCL_ENABLE_C_EXPERIMENTAL_STF) - add_subdirectory(c/experimental/stf) -endif() - if (CCCL_ENABLE_TESTING) diff --git a/CMakePresets.json b/CMakePresets.json index a675873c21d..e4e012c60d2 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -21,7 +21,7 @@ "CCCL_ENABLE_CUDAX": false, "CCCL_ENABLE_TESTING": false, "CCCL_ENABLE_EXAMPLES": false, - "CCCL_ENABLE_C": false, + "CCCL_ENABLE_C_PARALLEL": false, "CCCL_ENABLE_C_EXPERIMENTAL_STF": false, "libcudacxx_ENABLE_INSTALL_RULES": true, "CUB_ENABLE_INSTALL_RULES": true, @@ -64,7 +64,7 @@ "CCCL_ENABLE_TESTING": true, "CCCL_ENABLE_EXAMPLES": true, "CCCL_ENABLE_BENCHMARKS": true, - "CCCL_ENABLE_C": true, + "CCCL_ENABLE_C_PARALLEL": true, "CCCL_ENABLE_C_EXPERIMENTAL_STF": true, "CCCL_IGNORE_DEPRECATED_CPP_DIALECT": true, "LIBCUDACXX_ENABLE_LIBCUDACXX_TESTS": true, @@ -273,7 +273,7 @@ "displayName": "CCCL C Parallel Library", "inherits": "base", "cacheVariables": { - "CCCL_ENABLE_C": true, + "CCCL_ENABLE_C_PARALLEL": true, "CCCL_ENABLE_C_EXPERIMENTAL_STF": false, "CCCL_C_Parallel_ENABLE_TESTING": true, "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true @@ -284,7 +284,7 @@ "displayName": "CCCL C CUDASTF Library", "inherits": "base", "cacheVariables": { - "CCCL_ENABLE_C": false, + "CCCL_ENABLE_C_PARALLEL": false, "CCCL_ENABLE_C_EXPERIMENTAL_STF": true, "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true } @@ -310,7 +310,7 @@ "CCCL_ENABLE_THRUST": false, "CCCL_ENABLE_LIBCUDACXX": false, "CCCL_ENABLE_CUDAX": false, - "CCCL_ENABLE_C": false, + "CCCL_ENABLE_C_PARALLEL": false, "CCCL_ENABLE_TESTING": false, "CCCL_ENABLE_EXAMPLES": false, "CUB_ENABLE_EXAMPLES": false, diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index af1cc1a4234..f0a1826d519 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -1,2 +1,7 @@ -add_subdirectory(parallel) -# add_subdirectory(experimental/stf/) +if (CCCL_ENABLE_C_PARALLEL) + add_subdirectory(parallel) +endif() + +if (CCCL_ENABLE_C_EXPERIMENTAL_STF) + add_subdirectory(experimental/stf) +endif() diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 5ed8aaa9e46..f241d948d22 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -25,7 +25,7 @@ include(${_cccl_root}/cmake/CCCLGetDependencies.cmake) cccl_build_compiler_targets() # Build and install C++ library first -set(CCCL_ENABLE_C ON) +set(CCCL_ENABLE_C_PARALLEL ON) set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) add_subdirectory(${_cccl_root} _parent_cccl) From ac667ca25962f001d2735d2117f9993ca5dfe388 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 09:46:39 +0200 Subject: [PATCH 161/221] Do not use [[maybe_unused]] for the C lib header because this is only available from C23 --- c/experimental/stf/include/cccl/c/experimental/stf/stf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 7e012830408..d4f44cdbf96 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -1064,8 +1064,9 @@ static inline void stf_cuda_kernel_add_desc( const void** args) { CUfunction cufunc; - [[maybe_unused]] cudaError_t 
res = cudaGetFuncBySymbol(&cufunc, func); + cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); assert(res == cudaSuccess); + (void) res; /* suppress unused variable warning in release builds */ stf_cuda_kernel_add_desc_cufunc(k, cufunc, grid_dim_, block_dim_, shared_mem_, arg_cnt, args); } From 5bf62b365d92506b0ffc41f254615578b87d07b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 10:01:45 +0200 Subject: [PATCH 162/221] Return an error code in stf_cuda_kernel_add_desc rather than use assertions --- .../stf/include/cccl/c/experimental/stf/stf.h | 17 ++++++++++------- c/experimental/stf/test/test_cuda_kernel.cu | 3 ++- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index d4f44cdbf96..2c5515aaa0f 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -69,7 +69,6 @@ # error "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this notice." #endif // !CCCL_C_EXPERIMENTAL -#include #include #include @@ -1028,6 +1027,8 @@ void stf_cuda_kernel_add_desc_cufunc( //! \param arg_cnt Number of kernel arguments //! \param args Array of pointers to kernel arguments //! +//! \return cudaSuccess on success, or appropriate cudaError_t on failure +//! //! \pre k must be valid kernel handle //! \pre stf_cuda_kernel_start() must have been called //! \pre func must be valid __global__ function pointer @@ -1046,15 +1047,15 @@ void stf_cuda_kernel_add_desc_cufunc( //! float* d_y = (float*)stf_cuda_kernel_get_arg(kernel, 1); //! const void* args[] = {&alpha, &d_x, &d_y}; //! -//! // Launch kernel -//! stf_cuda_kernel_add_desc(kernel, (void*)axpy, -//! dim3(16), dim3(128), 0, 3, args); +//! // Launch kernel (caller must handle return values != cudaSuccess) +//! cudaError_t err = stf_cuda_kernel_add_desc(kernel, (void*)axpy, +//! dim3(16), dim3(128), 0, 3, args); //! stf_cuda_kernel_end(kernel); //! \endcode //! //! \see stf_cuda_kernel_add_desc_cufunc(), stf_cuda_kernel_get_arg() -static inline void stf_cuda_kernel_add_desc( +static inline cudaError_t stf_cuda_kernel_add_desc( stf_cuda_kernel_handle k, const void* func, dim3 grid_dim_, @@ -1065,10 +1066,12 @@ static inline void stf_cuda_kernel_add_desc( { CUfunction cufunc; cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); - assert(res == cudaSuccess); - (void) res; /* suppress unused variable warning in release builds */ + if (res != cudaSuccess) { + return res; + } stf_cuda_kernel_add_desc_cufunc(k, cufunc, grid_dim_, block_dim_, shared_mem_, arg_cnt, args); + return cudaSuccess; } //! 
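With this change, stf_cuda_kernel_add_desc() reports a failure of cudaGetFuncBySymbol() to the caller instead of asserting, so callers should check the returned cudaError_t. A minimal caller-side sketch (illustrative only, not part of the patch): it assumes a kernel handle `k` whose two dependencies were already declared, that stf_cuda_kernel_start(k) has been called, that <stdio.h> is available for fprintf, and that `axpy(float, float*, float*)` is the __global__ kernel from the header's example.

  float alpha = 2.0f;
  float* d_x = (float*) stf_cuda_kernel_get_arg(k, 0);
  float* d_y = (float*) stf_cuda_kernel_get_arg(k, 1);
  const void* args[] = {&alpha, &d_x, &d_y};

  /* Grid/block sizes follow the header's example; adjust to the real problem size. */
  cudaError_t err = stf_cuda_kernel_add_desc(k, (void*) axpy, dim3(16), dim3(128), 0, 3, args);
  if (err != cudaSuccess)
  {
    fprintf(stderr, "stf_cuda_kernel_add_desc failed: %s\n", cudaGetErrorString(err));
    return err; /* propagate the error rather than asserting */
  }
  stf_cuda_kernel_end(k);

The test below exercises the same path and simply requires err == cudaSuccess.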
diff --git a/c/experimental/stf/test/test_cuda_kernel.cu b/c/experimental/stf/test/test_cuda_kernel.cu index b5ba66b0f3a..12dc178d061 100644 --- a/c/experimental/stf/test/test_cuda_kernel.cu +++ b/c/experimental/stf/test/test_cuda_kernel.cu @@ -70,7 +70,8 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") double* dX = (double*) stf_cuda_kernel_get_arg(k, 0); double* dY = (double*) stf_cuda_kernel_get_arg(k, 1); const void* args[4] = {&N, &alpha, &dX, &dY}; - stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); + cudaError_t err = stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); + REQUIRE(err == cudaSuccess); stf_cuda_kernel_end(k); stf_cuda_kernel_destroy(k); From c0a54f1ab5d19eda56db4bbcc9d82decd586f3a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 10:02:16 +0200 Subject: [PATCH 163/221] clang-format --- c/experimental/stf/include/cccl/c/experimental/stf/stf.h | 3 ++- c/experimental/stf/test/test_cuda_kernel.cu | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 2c5515aaa0f..903b71cd878 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -1066,7 +1066,8 @@ static inline cudaError_t stf_cuda_kernel_add_desc( { CUfunction cufunc; cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); - if (res != cudaSuccess) { + if (res != cudaSuccess) + { return res; } diff --git a/c/experimental/stf/test/test_cuda_kernel.cu b/c/experimental/stf/test/test_cuda_kernel.cu index 12dc178d061..05c0e7e8620 100644 --- a/c/experimental/stf/test/test_cuda_kernel.cu +++ b/c/experimental/stf/test/test_cuda_kernel.cu @@ -70,7 +70,7 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") double* dX = (double*) stf_cuda_kernel_get_arg(k, 0); double* dY = (double*) stf_cuda_kernel_get_arg(k, 1); const void* args[4] = {&N, &alpha, &dX, &dY}; - cudaError_t err = stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); + cudaError_t err = stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); REQUIRE(err == cudaSuccess); stf_cuda_kernel_end(k); stf_cuda_kernel_destroy(k); From af43da5de5187f5373f20172f2fb19c787c2cead Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:03:03 +0200 Subject: [PATCH 164/221] Merge stf_c_lib: Update c/ directory with complete C library implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace c/ directory contents with complete stf_c_lib version - Update stf.cu (297→350 lines) with proper struct definitions and cleaner implementation - Fix Python linting issues (E711, E402, F841) in STF test files - Ensure pre-commit hooks pass Now stf_c_api has both complete C library and Python bindings. 
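As an illustrative usage sketch (not part of this commit), the merged header keeps the typedef'd place structs, so C callers can name stf_exec_place and stf_data_place directly and combine an execution place with an explicit data place on one task; `task` and `ld` below are assumed to come from stf_task_create() and stf_logical_data() as in the header's usage pattern.

  stf_exec_place where = make_device_place(0);  /* run the task on device 0 */
  stf_data_place keep = make_host_data_place(); /* but keep this operand in host memory */

  stf_task_set_exec_place(task, &where);
  stf_task_add_dep_with_dplace(task, ld, STF_READ, &keep);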
--- c/CMakeLists.txt | 9 +- c/experimental/stf/CMakeLists.txt | 14 +- .../stf/include/cccl/c/experimental/stf/stf.h | 1083 +++++++++++++++-- c/experimental/stf/src/stf.cu | 229 ++-- c/experimental/stf/test/CMakeLists.txt | 7 - c/experimental/stf/test/test_cuda_kernel.cu | 3 +- .../cuda/cccl/experimental/stf/decorator.py | 2 +- python/cuda_cccl/tests/stf/test_decorator.py | 2 +- python/cuda_cccl/tests/stf/test_fhe.py | 3 +- .../cuda_cccl/tests/stf/test_fhe_decorator.py | 3 +- python/cuda_cccl/tests/stf/test_numba.py | 2 +- python/cuda_cccl/tests/stf/test_pytorch.py | 6 +- .../tests/stf/test_stencil_decorator.py | 2 +- 13 files changed, 1160 insertions(+), 205 deletions(-) diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index 364494da7a0..f0a1826d519 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -1,2 +1,7 @@ -add_subdirectory(parallel) -add_subdirectory(experimental/stf/) +if (CCCL_ENABLE_C_PARALLEL) + add_subdirectory(parallel) +endif() + +if (CCCL_ENABLE_C_EXPERIMENTAL_STF) + add_subdirectory(experimental/stf) +endif() diff --git a/c/experimental/stf/CMakeLists.txt b/c/experimental/stf/CMakeLists.txt index 11599edaec7..b44b0cbbcca 100644 --- a/c/experimental/stf/CMakeLists.txt +++ b/c/experimental/stf/CMakeLists.txt @@ -27,12 +27,16 @@ if (CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY) ) endif() +find_package(cudax REQUIRED CONFIG + NO_DEFAULT_PATH # Only check the explicit path in HINTS: + HINTS "${CCCL_SOURCE_DIR}/lib/cmake/cudax/" +) + find_package(CUDAToolkit REQUIRED) set_target_properties(cccl.c.experimental.stf PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) +target_compile_definitions(cccl.c.experimental.stf PUBLIC CCCL_C_EXPERIMENTAL=1) target_link_libraries(cccl.c.experimental.stf PRIVATE CUDA::cudart_static - CUDA::nvrtc - CUDA::nvJitLink CUDA::cuda_driver CCCL::cudax ) @@ -43,10 +47,6 @@ target_compile_options(cccl.c.experimental.stf PRIVATE $<$ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +//! \file stf.h +//! \brief CUDA STF (Sequential Task Flow) C Interface +//! +//! \details +//! This header provides a C interface to the CUDA STF C++ library, enabling +//! task-based parallel programming with automatic data movement and dependency management. +//! +//! The Sequential Task Flow programming model involves defining logical data and +//! submitting tasks that operate on this data. STF automatically deduces dependencies +//! between tasks and orchestrates both computation and data movement to ensure +//! efficient execution with maximum concurrency. +//! +//! \par Key Concepts: +//! - **Logical Data**: Abstract handles for data that may exist in multiple locations +//! - **Tasks**: Operations that consume and produce logical data with specified access modes +//! - **Dependencies**: Automatically inferred from data access patterns (RAW, WAR, WAW) +//! - **Execution Places**: Specify where tasks run (CPU, specific GPU devices) +//! - **Data Places**: Specify where data should be located in memory hierarchy +//! +//! \par Basic Usage Pattern: +//! \code +//! // 1. Create STF context +//! stf_ctx_handle ctx; +//! stf_ctx_create(&ctx); +//! 
+//! // 2. Create logical data from arrays +//! float X[1024], Y[1024]; +//! stf_logical_data_handle lX, lY; +//! stf_logical_data(ctx, &lX, X, sizeof(X)); +//! stf_logical_data(ctx, &lY, Y, sizeof(Y)); +//! +//! // 3. Create and configure task +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! stf_task_add_dep(task, lX, STF_READ); // X is read-only +//! stf_task_add_dep(task, lY, STF_RW); // Y is read-write +//! +//! // 4. Execute task +//! stf_task_start(task); +//! CUstream stream = stf_task_get_custream(task); +//! float* x_ptr = (float*)stf_task_get(task, 0); +//! float* y_ptr = (float*)stf_task_get(task, 1); +//! // ... launch CUDA operations using stream ... +//! stf_task_end(task); +//! +//! // 5. Cleanup +//! stf_ctx_finalize(ctx); +//! stf_task_destroy(task); +//! stf_logical_data_destroy(lX); +//! stf_logical_data_destroy(lY); +//! \endcode +//! +//! \warning This API is experimental and subject to change. +//! Define CCCL_C_EXPERIMENTAL to acknowledge this. + +#pragma once + +#ifndef CCCL_C_EXPERIMENTAL +# error "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this notice." +#endif // !CCCL_C_EXPERIMENTAL + #include #include -// TODO use CCCL_C_EXTERN_C_BEGIN/CCCL_C_EXTERN_C_END #ifdef __cplusplus extern "C" { #endif +//! \defgroup AccessMode Data Access Modes +//! \brief Specifies how tasks access logical data +//! \{ + +//! \brief Data access mode for task dependencies +//! +//! Specifies how a task will access logical data, which determines +//! synchronization requirements and concurrency opportunities. typedef enum stf_access_mode { - STF_NONE = 0, - STF_READ = 1 << 0, - STF_WRITE = 1 << 1, - STF_RW = STF_READ | STF_WRITE + STF_NONE = 0, //!< No access (invalid) + STF_READ = 1 << 0, //!< Read-only access - allows concurrent readers + STF_WRITE = 1 << 1, //!< Write-only access - requires exclusive access + STF_RW = STF_READ | STF_WRITE //!< Read-write access - requires exclusive access } stf_access_mode; -struct stf_exec_place_device +//! \} + +//! \defgroup ExecPlace Execution Places +//! \brief Specify where tasks should execute +//! \{ + +//! \brief Device execution place configuration +typedef struct stf_exec_place_device { - int dev_id; -}; + int dev_id; //!< CUDA device ID (0-based) +} stf_exec_place_device; -struct stf_exec_place_host +//! \brief Host execution place configuration +typedef struct stf_exec_place_host { - char dummy; /* dummy to keep it standard C which does not allow empty structs */ -}; + char dummy; //!< Dummy field for standard C compatibility +} stf_exec_place_host; +//! \brief Execution place type discriminator typedef enum stf_exec_place_kind { - STF_EXEC_PLACE_DEVICE, - STF_EXEC_PLACE_HOST + STF_EXEC_PLACE_DEVICE, //!< Task executes on CUDA device + STF_EXEC_PLACE_HOST //!< Task executes on host (CPU) } stf_exec_place_kind; -struct stf_exec_place +//! \brief Execution place specification +//! +//! Tagged union specifying where a task should execute. +//! Use helper functions make_device_place() and make_host_place() to create. 
+typedef struct stf_exec_place { - enum stf_exec_place_kind kind; + enum stf_exec_place_kind kind; //!< Type of execution place union { - struct stf_exec_place_device device; - struct stf_exec_place_host host; - } u; -}; + stf_exec_place_device device; //!< Device configuration (when kind == STF_EXEC_PLACE_DEVICE) + stf_exec_place_host host; //!< Host configuration (when kind == STF_EXEC_PLACE_HOST) + } u; //!< Configuration union +} stf_exec_place; -static inline struct stf_exec_place make_device_place(int dev_id) +//! \brief Create execution place for CUDA device +//! +//! \param dev_id CUDA device index (0-based) +//! \return Execution place configured for specified device +//! +//! \par Example: +//! \code +//! // Execute task on device 1 +//! stf_exec_place place = make_device_place(1); +//! stf_task_set_exec_place(task, &place); +//! \endcode +static inline stf_exec_place make_device_place(int dev_id) { - struct stf_exec_place p; + stf_exec_place p; p.kind = STF_EXEC_PLACE_DEVICE; p.u.device.dev_id = dev_id; return p; } -static inline struct stf_exec_place make_host_place() +//! \brief Create execution place for host (CPU) +//! +//! \return Execution place configured for host execution +//! +//! \par Example: +//! \code +//! // Execute task on host +//! stf_exec_place place = make_host_place(); +//! stf_task_set_exec_place(task, &place); +//! \endcode +static inline stf_exec_place make_host_place() { - struct stf_exec_place p; + stf_exec_place p; p.kind = STF_EXEC_PLACE_HOST; p.u.host.dummy = 0; /* to avoid uninitialized memory warnings */ return p; } -typedef struct stf_exec_place_device stf_exec_place_device; -typedef struct stf_exec_place_host stf_exec_place_host; -typedef union stf_exec_place_u stf_exec_place_u; -typedef struct stf_exec_place stf_exec_place; +//! \} + +//! \defgroup DataPlace Data Places +//! \brief Specify where logical data should be located +//! \{ -struct stf_data_place_device +//! \brief Device data place configuration +typedef struct stf_data_place_device { - int dev_id; -}; + int dev_id; //!< CUDA device ID for data placement +} stf_data_place_device; -struct stf_data_place_host +//! \brief Host data place configuration +typedef struct stf_data_place_host { - char dummy; /* dummy to keep it standard C which does not allow empty structs */ -}; + char dummy; //!< Dummy field for standard C compatibility +} stf_data_place_host; -struct stf_data_place_managed +//! \brief Managed memory data place configuration +typedef struct stf_data_place_managed { - char dummy; /* dummy to keep it standard C which does not allow empty structs */ -}; + char dummy; //!< Dummy field for standard C compatibility +} stf_data_place_managed; -struct stf_data_place_affine +//! \brief Affine data place configuration +//! +//! Affine placement means data follows the execution location automatically. +typedef struct stf_data_place_affine { - char dummy; /* dummy to keep it standard C which does not allow empty structs */ -}; + char dummy; //!< Dummy field for standard C compatibility +} stf_data_place_affine; +//! \brief Data place type discriminator typedef enum stf_data_place_kind { - STF_DATA_PLACE_DEVICE, - STF_DATA_PLACE_HOST, - STF_DATA_PLACE_MANAGED, - STF_DATA_PLACE_AFFINE + STF_DATA_PLACE_DEVICE, //!< Data on specific device memory + STF_DATA_PLACE_HOST, //!< Data on host (CPU) memory + STF_DATA_PLACE_MANAGED, //!< Data in CUDA managed (unified) memory + STF_DATA_PLACE_AFFINE //!< Data follows execution place (default) } stf_data_place_kind; -struct stf_data_place +//! 
\brief Data placement specification +//! +//! Tagged union specifying where logical data should be located. +//! Use helper functions to create (make_device_data_place(), etc.). +typedef struct stf_data_place { - enum stf_data_place_kind kind; + enum stf_data_place_kind kind; //!< Type of data placement union { - struct stf_data_place_device device; - struct stf_data_place_host host; - struct stf_data_place_managed managed; - struct stf_data_place_affine affine; - } u; -}; + stf_data_place_device device; //!< Device placement configuration + stf_data_place_host host; //!< Host placement configuration + stf_data_place_managed managed; //!< Managed memory configuration + stf_data_place_affine affine; //!< Affine placement configuration + } u; //!< Configuration union +} stf_data_place; -static inline struct stf_data_place make_device_data_place(int dev_id) +//! \brief Create data place for specific CUDA device +//! +//! \param dev_id CUDA device index (0-based) +//! \return Data place configured for device memory +//! +//! \par Example: +//! \code +//! // Force data to device 1 even if task runs elsewhere +//! stf_data_place dplace = make_device_data_place(1); +//! stf_task_add_dep_with_dplace(task, data, STF_READ, &dplace); +//! \endcode +static inline stf_data_place make_device_data_place(int dev_id) { - struct stf_data_place p; + stf_data_place p; p.kind = STF_DATA_PLACE_DEVICE; p.u.device.dev_id = dev_id; return p; } +//! \brief Create data place for host memory +//! +//! \return Data place configured for host (CPU) memory +//! +//! \par Example: +//! \code +//! // Keep data on host even for device tasks (sparse access) +//! stf_data_place dplace = make_host_data_place(); +//! stf_task_add_dep_with_dplace(task, data, STF_READ, &dplace); +//! \endcode static inline struct stf_data_place make_host_data_place() { - struct stf_data_place p; + stf_data_place p; p.kind = STF_DATA_PLACE_HOST; p.u.host.dummy = 0; /* to avoid uninitialized memory warnings */ return p; } +//! +//! \brief Create data place for CUDA managed memory +//! +//! \return Data place configured for managed (unified) memory +//! +//! \par Example: +//! \code +//! // Use managed memory for flexible access patterns +//! stf_data_place dplace = make_managed_data_place(); +//! stf_task_add_dep_with_dplace(task, data, STF_RW, &dplace); +//! \endcode + static inline struct stf_data_place make_managed_data_place() { - struct stf_data_place p; + stf_data_place p; p.kind = STF_DATA_PLACE_MANAGED; p.u.managed.dummy = 0; /* to avoid uninitialized memory warnings */ return p; } +//! +//! \brief Create affine data place (follows execution location) +//! +//! \return Data place configured for affine placement (default behavior) +//! +//! \par Example: +//! \code +//! // Explicitly specify default behavior +//! stf_data_place dplace = make_affine_data_place(); +//! stf_task_add_dep_with_dplace(task, data, STF_RW, &dplace); +//! \endcode + static inline struct stf_data_place make_affine_data_place() { - struct stf_data_place p; + stf_data_place p; p.kind = STF_DATA_PLACE_AFFINE; p.u.affine.dummy = 0; /* to avoid uninitialized memory warnings */ return p; } -typedef struct stf_data_place_device stf_data_place_device; -typedef struct stf_data_place_host stf_data_place_host; -typedef struct stf_data_place_managed stf_data_place_managed; -typedef struct stf_data_place_affine stf_data_place_affine; -typedef union stf_data_place_u stf_data_place_u; -typedef struct stf_data_place stf_data_place; +//! \} + +//! \defgroup Handles Opaque Handles +//! 
\brief Opaque handle types for STF objects +//! \{ + +//! +//! \brief Opaque handle for STF context +//! +//! Context stores the state of the STF library and serves as entry point for all API calls. +//! Must be created with stf_ctx_create() or stf_ctx_create_graph() and destroyed with stf_ctx_finalize(). + +typedef void* stf_ctx_handle; + +//! +//! \brief Opaque handle for logical data +//! +//! Represents abstract data that may exist in multiple memory locations. +//! Created with stf_logical_data() or stf_logical_data_empty() and destroyed with stf_logical_data_destroy(). + +typedef void* stf_logical_data_handle; + +//! +//! \brief Opaque handle for task +//! +//! Represents a computational task that operates on logical data. +//! Created with stf_task_create() and destroyed with stf_task_destroy(). + +typedef void* stf_task_handle; + +//! +//! \brief Opaque handle for CUDA kernel task +//! +//! Specialized task optimized for CUDA kernel execution. +//! Created with stf_cuda_kernel_create() and destroyed with stf_cuda_kernel_destroy(). + +typedef void* stf_cuda_kernel_handle; -typedef struct stf_ctx_handle_t* stf_ctx_handle; +//! \} + +//! \defgroup Context Context Management +//! \brief Create, configure, and finalize STF contexts +//! \{ + +//! +//! \brief Create STF context with stream backend +//! +//! Creates a new STF context using the default stream-based backend. +//! Tasks are executed eagerly using CUDA streams and events. +//! +//! \param[out] ctx Pointer to receive context handle +//! +//! \pre ctx must not be NULL +//! \post *ctx contains valid context handle that must be finalized with stf_ctx_finalize() +//! +//! \par Example: +//! \code +//! stf_ctx_handle ctx; +//! stf_ctx_create(&ctx); +//! // ... use context ... +//! stf_ctx_finalize(ctx); +//! \endcode +//! +//! \see stf_ctx_create_graph(), stf_ctx_finalize() void stf_ctx_create(stf_ctx_handle* ctx); -// TODO stf_ctx_create_with_flags and an enum instead ? + +//! +//! \brief Create STF context with graph backend +//! +//! Creates a new STF context using the CUDA graph backend. +//! Tasks are captured into CUDA graphs and launched when needed, +//! potentially providing better performance for repeated patterns. +//! +//! \param[out] ctx Pointer to receive context handle +//! +//! \pre ctx must not be NULL +//! \post *ctx contains valid context handle that must be finalized with stf_ctx_finalize() +//! +//! \note Graph backend has restrictions on stream synchronization within tasks +//! +//! \par Example: +//! \code +//! stf_ctx_handle ctx; +//! stf_ctx_create_graph(&ctx); +//! // ... use context ... +//! stf_ctx_finalize(ctx); +//! \endcode +//! +//! \see stf_ctx_create(), stf_ctx_finalize() + void stf_ctx_create_graph(stf_ctx_handle* ctx); + +//! +//! \brief Finalize STF context +//! +//! Waits for all pending operations to complete, performs write-back +//! of modified data to host, and releases all associated resources. +//! +//! \param ctx Context handle to finalize +//! +//! \pre ctx must be valid context handle +//! \post All pending operations completed, resources released, ctx becomes invalid +//! +//! \note This function blocks until all asynchronous operations complete +//! +//! \par Example: +//! \code +//! stf_ctx_handle ctx; +//! stf_ctx_create(&ctx); +//! // ... submit tasks ... +//! stf_ctx_finalize(ctx); // Blocks until completion +//! \endcode +//! +//! 
\see stf_ctx_create(), stf_ctx_create_graph(), stf_fence() + void stf_ctx_finalize(stf_ctx_handle ctx); -// TODO stf_ctx_set_mode() + define enum with GRAPH, STREAM, ... -// TODO stf_ctx_is_graph() +//! +//! \brief Get synchronization fence for context +//! +//! Returns a CUDA stream that will be signaled when all pending +//! operations in the context complete. Provides non-blocking +//! alternative to stf_ctx_finalize() for synchronization queries. +//! +//! \param ctx Context handle +//! \return CUDA stream for synchronization +//! +//! \pre ctx must be valid context handle +//! +//! \par Example: +//! \code +//! stf_ctx_handle ctx; +//! stf_ctx_create(&ctx); +//! // ... submit tasks ... +//! +//! cudaStream_t fence = stf_fence(ctx); +//! cudaStreamSynchronize(fence); // Wait for completion +//! stf_ctx_finalize(ctx); +//! \endcode +//! +//! \see stf_ctx_finalize() cudaStream_t stf_fence(stf_ctx_handle ctx); -typedef struct stf_logical_data_handle_t* stf_logical_data_handle; +//! \} + +//! \defgroup LogicalData Logical Data Management +//! \brief Create and manage abstract data handles +//! \{ + +//! +//! \brief Create logical data from existing memory buffer +//! +//! Creates logical data handle from an existing host memory buffer. +//! STF takes ownership of data management during task execution. +//! +//! \param ctx Context handle +//! \param[out] ld Pointer to receive logical data handle +//! \param addr Pointer to existing data buffer +//! \param sz Size of data in bytes +//! +//! \pre ctx must be valid context handle +//! \pre ld must not be NULL +//! \pre addr must not be NULL +//! \pre sz must be greater than 0 +//! \post *ld contains valid logical data handle +//! +//! \note Original data pointer should not be accessed during task execution +//! \note Data will be written back when logical data is destroyed or context finalized +//! +//! \par Example: +//! \code +//! float data[1024]; +//! stf_logical_data_handle ld; +//! stf_logical_data(ctx, &ld, data, sizeof(data)); +//! // ... use in tasks ... +//! stf_logical_data_destroy(ld); +//! \endcode +//! +//! \see stf_logical_data_empty(), stf_logical_data_destroy() void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); + +//! +//! \brief Set symbolic name for logical data +//! +//! Associates a human-readable name with logical data for debugging +//! and task graph visualization. +//! +//! \param ld Logical data handle +//! \param symbol Null-terminated string name +//! +//! \pre ld must be valid logical data handle +//! \pre symbol must not be NULL +//! +//! \note Symbol appears in DOT graph output when CUDASTF_DOT_FILE is set +//! +//! \par Example: +//! \code +//! stf_logical_data_handle ld; +//! stf_logical_data(ctx, &ld, data, size); +//! stf_logical_data_set_symbol(ld, "input_matrix"); +//! \endcode +//! +//! \see stf_task_set_symbol() + void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol); + +//! +//! \brief Destroy logical data handle +//! +//! Destroys logical data handle and releases associated resources. +//! Triggers write-back to host if data was modified. +//! +//! \param ld Logical data handle to destroy +//! +//! \pre ld must be valid logical data handle +//! \post ld becomes invalid, resources released +//! +//! \note Must be called for every created logical data handle +//! +//! \par Example: +//! \code +//! stf_logical_data_handle ld; +//! stf_logical_data(ctx, &ld, data, size); +//! // ... use in tasks ... +//! 
stf_logical_data_destroy(ld); // Cleanup +//! \endcode +//! +//! \see stf_logical_data(), stf_logical_data_empty() + void stf_logical_data_destroy(stf_logical_data_handle ld); + +//! +//! \brief Create empty logical data (temporary) +//! +//! Creates logical data of specified size without backing host memory. +//! Useful for temporary buffers in multi-stage computations. +//! +//! \param ctx Context handle +//! \param length Size in bytes +//! \param[out] to Pointer to receive logical data handle +//! +//! \pre ctx must be valid context handle +//! \pre length must be greater than 0 +//! \pre to must not be NULL +//! \post *to contains valid logical data handle +//! +//! \note First access must be write-only (STF_WRITE) +//! \note No write-back occurs since there's no host backing +//! +//! \par Example: +//! \code +//! stf_logical_data_handle temp; +//! stf_logical_data_empty(ctx, 1024 * sizeof(float), &temp); +//! +//! // First access must be write-only +//! stf_task_add_dep(task, temp, STF_WRITE); +//! \endcode +//! +//! \see stf_logical_data(), stf_logical_data_destroy() + void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle* to); -// TODO -// void stf_logical_data_wait(stf_logical_data_handle ld); +//! +//! \brief Create synchronization token +//! +//! Creates a logical data handle for synchronization purposes only. +//! Contains no actual data but can be used to enforce execution order. +//! +//! \param ctx Context handle +//! \param[out] ld Pointer to receive token handle +//! +//! \pre ctx must be valid context handle +//! \pre ld must not be NULL +//! \post *ld contains valid token handle +//! +//! \note More efficient than using dummy data for synchronization +//! \note Can be accessed with any access mode +//! +//! \par Example: +//! \code +//! stf_logical_data_handle sync_token; +//! stf_token(ctx, &sync_token); +//! +//! // Task 1 signals completion +//! stf_task_add_dep(task1, sync_token, STF_WRITE); +//! +//! // Task 2 waits for task1 +//! stf_task_add_dep(task2, sync_token, STF_READ); +//! \endcode +//! +//! \see stf_logical_data(), stf_logical_data_destroy() void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld); -typedef struct stf_task_handle_t* stf_task_handle; +//! \} + +//! \defgroup TaskManagement Task Management +//! \brief Create, configure, and execute computational tasks +//! \{ + +//! +//! \brief Create new task +//! +//! Creates a new task within the specified context. Task is created +//! but not configured or executed. Use other stf_task_* functions +//! to configure execution place, add dependencies, and execute. +//! +//! \param ctx Context handle +//! \param[out] t Pointer to receive task handle +//! +//! \pre ctx must be valid context handle +//! \pre t must not be NULL +//! \post *t contains valid task handle +//! +//! \par Example: +//! \code +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! // ... configure task ... +//! stf_task_destroy(task); +//! \endcode +//! +//! \see stf_task_destroy(), stf_task_set_exec_place(), stf_task_add_dep() void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t); -void stf_task_set_exec_place(stf_task_handle t, struct stf_exec_place* exec_p); + +//! +//! \brief Set task execution place +//! +//! Specifies where the task should execute (device or host). +//! If not called, defaults to current device. +//! +//! \param t Task handle +//! \param exec_p Pointer to execution place specification +//! +//! \pre t must be valid task handle +//! 
\pre exec_p must not be NULL +//! \pre Must be called before stf_task_start() +//! +//! \par Example: +//! \code +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! +//! // Execute on device 1 +//! stf_exec_place place = make_device_place(1); +//! stf_task_set_exec_place(task, &place); +//! \endcode +//! +//! \see make_device_place(), make_host_place() + +void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p); + +//! +//! \brief Set symbolic name for task +//! +//! Associates a human-readable name with task for debugging +//! and task graph visualization. +//! +//! \param t Task handle +//! \param symbol Null-terminated string name +//! +//! \pre t must be valid task handle +//! \pre symbol must not be NULL +//! +//! \note Symbol appears in DOT graph output when CUDASTF_DOT_FILE is set +//! +//! \par Example: +//! \code +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! stf_task_set_symbol(task, "matrix_multiply"); +//! \endcode +//! +//! \see stf_logical_data_set_symbol() + void stf_task_set_symbol(stf_task_handle t, const char* symbol); + +//! +//! \brief Add data dependency to task +//! +//! Adds a data dependency with specified access mode. Order of calls +//! determines index for stf_task_get(). Dependencies determine +//! automatic task synchronization. +//! +//! \param t Task handle +//! \param ld Logical data handle +//! \param m Access mode (STF_READ, STF_WRITE, STF_RW) +//! +//! \pre t must be valid task handle +//! \pre ld must be valid logical data handle +//! \pre m must be valid access mode +//! +//! \par Example: +//! \code +//! stf_task_add_dep(task, input_data, STF_READ); // Index 0 +//! stf_task_add_dep(task, output_data, STF_WRITE); // Index 1 +//! stf_task_add_dep(task, temp_data, STF_RW); // Index 2 +//! \endcode +//! +//! \see stf_task_add_dep_with_dplace(), stf_task_get() + void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m); + +//! +//! \brief Add data dependency with explicit data placement +//! +//! Adds data dependency with specified access mode and explicit +//! data placement. Overrides default affine placement. +//! +//! \param t Task handle +//! \param ld Logical data handle +//! \param m Access mode (STF_READ, STF_WRITE, STF_RW) +//! \param data_p Pointer to data place specification +//! +//! \pre t must be valid task handle +//! \pre ld must be valid logical data handle +//! \pre m must be valid access mode +//! \pre data_p must not be NULL +//! +//! \par Example: +//! \code +//! // Force data to device 0 even if task runs elsewhere +//! stf_data_place dplace = make_device_data_place(0); +//! stf_task_add_dep_with_dplace(task, ld, STF_READ, &dplace); +//! \endcode +//! +//! \see stf_task_add_dep(), make_device_data_place(), make_host_data_place() + void stf_task_add_dep_with_dplace( - stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, struct stf_data_place* data_p); + stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, stf_data_place* data_p); + +//! +//! \brief Begin task execution +//! +//! Starts task execution. After this call, use stf_task_get_custream() +//! and stf_task_get() to access CUDA stream and data pointers. +//! +//! \param t Task handle +//! +//! \pre t must be valid task handle +//! \pre Task dependencies must already be configured +//! \post Task is executing, stream and data available +//! +//! \par Example: +//! \code +//! // Configure task first +//! stf_task_add_dep(task, data, STF_RW); +//! +//! // Start execution +//! 
stf_task_start(task); +//! +//! // Now can access stream and data +//! CUstream stream = stf_task_get_custream(task); +//! float* ptr = (float*)stf_task_get(task, 0); +//! \endcode +//! +//! \see stf_task_end(), stf_task_get_custream(), stf_task_get() + void stf_task_start(stf_task_handle t); + +//! +//! \brief End task execution +//! +//! Ends task execution. Call after all CUDA operations are +//! submitted to the task stream. +//! +//! \param t Task handle +//! +//! \pre t must be valid task handle +//! \pre stf_task_start() must have been called +//! \post Task execution ended, may continue asynchronously +//! +//! \par Example: +//! \code +//! stf_task_start(task); +//! CUstream stream = stf_task_get_custream(task); +//! +//! // Launch operations +//! my_kernel<<>>(args...); +//! +//! stf_task_end(task); // Operations may still be running +//! \endcode +//! +//! \see stf_task_start() + void stf_task_end(stf_task_handle t); + +//! +//! \brief Get CUDA stream for task +//! +//! Returns CUDA stream associated with the task. All CUDA operations +//! within task must use this stream for proper synchronization. +//! +//! \param t Task handle +//! \return CUDA stream for launching operations +//! +//! \pre t must be valid task handle +//! \pre stf_task_start() must have been called +//! +//! \par Example: +//! \code +//! stf_task_start(task); +//! CUstream stream = stf_task_get_custream(task); +//! +//! // Launch kernel using this stream +//! my_kernel<<>>(args...); +//! \endcode +//! +//! \see stf_task_start(), stf_task_get() + CUstream stf_task_get_custream(stf_task_handle t); + +//! +//! \brief Get data pointer for task dependency +//! +//! Returns pointer to logical data instance for specified dependency. +//! Index corresponds to order of stf_task_add_dep() calls. +//! +//! \param t Task handle +//! \param submitted_index Dependency index (0-based) +//! \return Pointer to data (cast to appropriate type) +//! +//! \pre t must be valid task handle +//! \pre stf_task_start() must have been called +//! \pre submitted_index must be valid dependency index +//! \post Pointer valid until stf_task_end() +//! +//! \par Example: +//! \code +//! // Dependencies added in this order: +//! stf_task_add_dep(task, input, STF_READ); // Index 0 +//! stf_task_add_dep(task, output, STF_WRITE); // Index 1 +//! +//! stf_task_start(task); +//! +//! // Get data pointers +//! const float* in = (const float*)stf_task_get(task, 0); +//! float* out = (float*)stf_task_get(task, 1); +//! \endcode +//! +//! \see stf_task_add_dep(), stf_task_start() + void* stf_task_get(stf_task_handle t, int submitted_index); + +//! +//! \brief Destroy task handle +//! +//! Destroys task handle and releases associated resources. +//! Task should be completed before destruction. +//! +//! \param t Task handle to destroy +//! +//! \pre t must be valid task handle +//! \post t becomes invalid, resources released +//! +//! \note Must be called for every created task +//! +//! \par Example: +//! \code +//! stf_task_handle task; +//! stf_task_create(ctx, &task); +//! // ... configure and execute task ... +//! stf_task_destroy(task); +//! \endcode +//! +//! \see stf_task_create() + void stf_task_destroy(stf_task_handle t); + +//! +//! \brief Enable graph capture for task (advanced) +//! +//! Enables graph capture optimization for the task. +//! Advanced feature typically not needed for basic usage. +//! +//! \param t Task handle +//! +//! \pre t must be valid task handle +//! +//! 
\note Used internally for CUDA graph backend optimization + void stf_task_enable_capture(stf_task_handle t); -typedef struct stf_cuda_kernel_handle_t* stf_cuda_kernel_handle; +//! \} + +//! \defgroup CUDAKernel CUDA Kernel Interface +//! \brief Optimized interface for CUDA kernel execution +//! \{ + +//! +//! \brief Create CUDA kernel task +//! +//! Creates a specialized task optimized for CUDA kernel execution. +//! More efficient than generic tasks for repeated kernel launches, +//! especially with CUDA graph backend. +//! +//! \param ctx Context handle +//! \param[out] k Pointer to receive kernel handle +//! +//! \pre ctx must be valid context handle +//! \pre k must not be NULL +//! \post *k contains valid kernel handle +//! +//! \par Example: +//! \code +//! stf_cuda_kernel_handle kernel; +//! stf_cuda_kernel_create(ctx, &kernel); +//! // ... configure kernel ... +//! stf_cuda_kernel_destroy(kernel); +//! \endcode +//! +//! \see stf_cuda_kernel_destroy(), stf_task_create() void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k); -void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, struct stf_exec_place* exec_p); + +//! +//! \brief Set kernel execution place +//! +//! Specifies where the CUDA kernel should execute. +//! +//! \param k Kernel handle +//! \param exec_p Pointer to execution place specification +//! +//! \pre k must be valid kernel handle +//! \pre exec_p must not be NULL +//! +//! \see make_device_place(), stf_task_set_exec_place() + +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p); + +//! +//! \brief Set symbolic name for kernel +//! +//! Associates human-readable name with kernel for debugging. +//! +//! \param k Kernel handle +//! \param symbol Null-terminated string name +//! +//! \pre k must be valid kernel handle +//! \pre symbol must not be NULL +//! +//! \see stf_task_set_symbol(), stf_logical_data_set_symbol() + void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol); + +//! +//! \brief Add data dependency to kernel +//! +//! Adds data dependency with specified access mode for kernel execution. +//! +//! \param k Kernel handle +//! \param ld Logical data handle +//! \param m Access mode (STF_READ, STF_WRITE, STF_RW) +//! +//! \pre k must be valid kernel handle +//! \pre ld must be valid logical data handle +//! \pre m must be valid access mode +//! +//! \see stf_task_add_dep() + void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m); + +//! +//! \brief Start kernel execution +//! +//! Begins kernel execution phase. After this, add kernel descriptions +//! with stf_cuda_kernel_add_desc(). +//! +//! \param k Kernel handle +//! +//! \pre k must be valid kernel handle +//! \pre Dependencies must already be configured +//! +//! \see stf_cuda_kernel_add_desc(), stf_cuda_kernel_end() + void stf_cuda_kernel_start(stf_cuda_kernel_handle k); +//! +//! \brief Add CUDA kernel launch description (driver API) +//! +//! Adds kernel launch specification using CUDA driver API function handle. +//! This is the low-level interface used internally. +//! +//! \param k Kernel handle +//! \param cufunc CUDA driver API function handle +//! \param grid_dim_ CUDA grid dimensions +//! \param block_dim_ CUDA block dimensions +//! \param shared_mem_ Shared memory size in bytes +//! \param arg_cnt Number of kernel arguments +//! \param args Array of pointers to kernel arguments +//! +//! \pre k must be valid kernel handle +//! 
\pre stf_cuda_kernel_start() must have been called +//! \pre cufunc must be valid CUfunction +//! \pre args must contain arg_cnt valid argument pointers +//! +//! \see stf_cuda_kernel_add_desc() + void stf_cuda_kernel_add_desc_cufunc( stf_cuda_kernel_handle k, CUfunction cufunc, - dim3 gridDim_, - dim3 blockDim_, - size_t sharedMem_, + dim3 grid_dim_, + dim3 block_dim_, + size_t shared_mem_, int arg_cnt, const void** args); -/* Convert CUDA kernel address to CUfunction because we may use them from a - * shared library where this would be invalid in the runtime API. */ -static inline void stf_cuda_kernel_add_desc( +//! +//! \brief Add CUDA kernel launch description +//! +//! Adds kernel launch specification using runtime API function pointer. +//! Automatically converts to driver API internally. +//! +//! \param k Kernel handle +//! \param func Pointer to __global__ function +//! \param grid_dim_ CUDA grid dimensions +//! \param block_dim_ CUDA block dimensions +//! \param shared_mem_ Shared memory size in bytes +//! \param arg_cnt Number of kernel arguments +//! \param args Array of pointers to kernel arguments +//! +//! \return cudaSuccess on success, or appropriate cudaError_t on failure +//! +//! \pre k must be valid kernel handle +//! \pre stf_cuda_kernel_start() must have been called +//! \pre func must be valid __global__ function pointer +//! \pre args must contain arg_cnt valid argument pointers +//! +//! \note Converts function pointer to CUfunction automatically +//! +//! \par Example: +//! \code +//! // Kernel: __global__ void axpy(float alpha, float* x, float* y) +//! stf_cuda_kernel_start(kernel); +//! +//! // Prepare arguments +//! float alpha = 2.0f; +//! float* d_x = (float*)stf_cuda_kernel_get_arg(kernel, 0); +//! float* d_y = (float*)stf_cuda_kernel_get_arg(kernel, 1); +//! const void* args[] = {&alpha, &d_x, &d_y}; +//! +//! // Launch kernel (caller must handle return values != cudaSuccess) +//! cudaError_t err = stf_cuda_kernel_add_desc(kernel, (void*)axpy, +//! dim3(16), dim3(128), 0, 3, args); +//! stf_cuda_kernel_end(kernel); +//! \endcode +//! +//! \see stf_cuda_kernel_add_desc_cufunc(), stf_cuda_kernel_get_arg() + +static inline cudaError_t stf_cuda_kernel_add_desc( stf_cuda_kernel_handle k, const void* func, - dim3 gridDim_, - dim3 blockDim_, - size_t sharedMem_, + dim3 grid_dim_, + dim3 block_dim_, + size_t shared_mem_, int arg_cnt, const void** args) { CUfunction cufunc; - [[maybe_unused]] cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); - assert(res == cudaSuccess); + cudaError_t res = cudaGetFuncBySymbol(&cufunc, func); + if (res != cudaSuccess) + { + return res; + } - stf_cuda_kernel_add_desc_cufunc(k, cufunc, gridDim_, blockDim_, sharedMem_, arg_cnt, args); + stf_cuda_kernel_add_desc_cufunc(k, cufunc, grid_dim_, block_dim_, shared_mem_, arg_cnt, args); + return cudaSuccess; } +//! +//! \brief Get kernel argument data pointer +//! +//! Returns pointer to logical data for use as kernel argument. +//! Index corresponds to order of stf_cuda_kernel_add_dep() calls. +//! +//! \param k Kernel handle +//! \param index Dependency index (0-based) +//! \return Pointer to data for kernel argument +//! +//! \pre k must be valid kernel handle +//! \pre stf_cuda_kernel_start() must have been called +//! \pre index must be valid dependency index +//! +//! \see stf_cuda_kernel_add_desc(), stf_task_get() + void* stf_cuda_kernel_get_arg(stf_cuda_kernel_handle k, int index); + +//! +//! \brief End kernel execution +//! +//! Ends kernel execution phase. 
Call after all kernel descriptions +//! are added with stf_cuda_kernel_add_desc(). +//! +//! \param k Kernel handle +//! +//! \pre k must be valid kernel handle +//! \pre stf_cuda_kernel_start() must have been called +//! +//! \see stf_cuda_kernel_start() + void stf_cuda_kernel_end(stf_cuda_kernel_handle k); -void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t); + +//! +//! \brief Destroy kernel handle +//! +//! Destroys kernel handle and releases associated resources. +//! +//! \param k Kernel handle to destroy +//! +//! \pre k must be valid kernel handle +//! \post k becomes invalid, resources released +//! +//! \note Must be called for every created kernel +//! +//! \see stf_cuda_kernel_create() + +void stf_cuda_kernel_destroy(stf_cuda_kernel_handle k); + +//! \} #ifdef __cplusplus } diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 0a92d86b677..14a03de7f5a 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + #include // #include #include @@ -6,69 +16,60 @@ using namespace cuda::experimental::stf; extern "C" { -struct stf_ctx_handle_t -{ - context ctx; -}; - -struct stf_logical_data_handle_t -{ - // XXX should we always store a logical_data> instead ? - logical_data_untyped ld; -}; - -struct stf_task_handle_t -{ - context::unified_task<> t; -}; - void stf_ctx_create(stf_ctx_handle* ctx) { assert(ctx); - *ctx = new stf_ctx_handle_t{context{}}; + *ctx = new context{}; } void stf_ctx_create_graph(stf_ctx_handle* ctx) { assert(ctx); - *ctx = new stf_ctx_handle_t{context{graph_ctx()}}; + *ctx = new context{graph_ctx()}; } void stf_ctx_finalize(stf_ctx_handle ctx) { - ctx->ctx.finalize(); assert(ctx); - delete ctx; + auto* context_ptr = static_cast(ctx); + context_ptr->finalize(); + delete context_ptr; } cudaStream_t stf_fence(stf_ctx_handle ctx) { assert(ctx); - return ctx->ctx.fence(); + auto* context_ptr = static_cast(ctx); + return context_ptr->fence(); } void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) { - assert(ld); assert(ctx); + assert(ld); - // Create a slice logical data - auto ld_typed = ctx->ctx.logical_data(make_slice((char*) addr, sz)); + auto* context_ptr = static_cast(ctx); + auto ld_typed = context_ptr->logical_data(make_slice((char*) addr, sz)); - // Stored in its untyped version - *ld = new stf_logical_data_handle_t{ld_typed}; + // Store the logical_data_untyped directly as opaque pointer + *ld = new logical_data_untyped{ld_typed}; } void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) { assert(ld); - ld->ld.set_symbol(symbol); + assert(symbol); + + auto* ld_ptr = static_cast(ld); + ld_ptr->set_symbol(symbol); } void stf_logical_data_destroy(stf_logical_data_handle ld) { assert(ld); - delete ld; + + auto* ld_ptr = static_cast(ld); + delete ld_ptr; } void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle* to) @@ -76,8 +77,9 @@ void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_ 
assert(ctx); assert(to); - auto ld_typed = ctx->ctx.logical_data(shape_of>(length)); - *to = new stf_logical_data_handle_t{ld_typed}; + auto* context_ptr = static_cast(ctx); + auto ld_typed = context_ptr->logical_data(shape_of>(length)); + *to = new logical_data_untyped{ld_typed}; } void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) @@ -85,57 +87,70 @@ void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld) assert(ctx); assert(ld); - *ld = new stf_logical_data_handle_t{ctx->ctx.token()}; + auto* context_ptr = static_cast(ctx); + *ld = new logical_data_untyped{context_ptr->token()}; } /* Convert the C-API stf_exec_place to a C++ exec_place object */ -exec_place to_exec_place(struct stf_exec_place* exec_p) +exec_place to_exec_place(stf_exec_place* exec_p) { - if (exec_p->kind == STF_EXEC_PLACE_HOST) + assert(exec_p); + + switch (exec_p->kind) { - return exec_place::host(); - } + case STF_EXEC_PLACE_HOST: + return exec_place::host(); - assert(exec_p->kind == STF_EXEC_PLACE_DEVICE); - return exec_place::device(exec_p->u.device.dev_id); + case STF_EXEC_PLACE_DEVICE: + return exec_place::device(exec_p->u.device.dev_id); + + default: + assert(false && "Invalid execution place kind"); + return exec_place{}; // invalid exec_place + } } /* Convert the C-API stf_data_place to a C++ data_place object */ -data_place to_data_place(struct stf_data_place* data_p) +data_place to_data_place(stf_data_place* data_p) { assert(data_p); - if (data_p->kind == STF_DATA_PLACE_HOST) + switch (data_p->kind) { - return data_place::host(); - } + case STF_DATA_PLACE_HOST: + return data_place::host(); - if (data_p->kind == STF_DATA_PLACE_MANAGED) - { - return data_place::managed(); - } + case STF_DATA_PLACE_MANAGED: + return data_place::managed(); - if (data_p->kind == STF_DATA_PLACE_AFFINE) - { - return data_place::affine(); - } + case STF_DATA_PLACE_AFFINE: + return data_place::affine(); - assert(data_p->kind == STF_DATA_PLACE_DEVICE); - return data_place::device(data_p->u.device.dev_id); + case STF_DATA_PLACE_DEVICE: + return data_place::device(data_p->u.device.dev_id); + + default: + assert(false && "Invalid data place kind"); + return data_place::invalid(); // invalid data_place + } } void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) { - assert(t); assert(ctx); + assert(t); - *t = new stf_task_handle_t{ctx->ctx.task()}; + auto* context_ptr = static_cast(ctx); + *t = new context::unified_task<>{context_ptr->task()}; } -void stf_task_set_exec_place(stf_task_handle t, struct stf_exec_place* exec_p) +void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p) { assert(t); - t->t.set_exec_place(to_exec_place(exec_p)); + assert(exec_p); + + auto* task_ptr = static_cast*>(t); + task_ptr->set_exec_place(to_exec_place(exec_p)); } void stf_task_set_symbol(stf_task_handle t, const char* symbol) @@ -143,7 +158,8 @@ void stf_task_set_symbol(stf_task_handle t, const char* symbol) assert(t); assert(symbol); - t->t.set_symbol(symbol); + auto* task_ptr = static_cast*>(t); + task_ptr->set_symbol(symbol); } void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m) @@ -151,54 +167,70 @@ void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_ assert(t); assert(ld); - t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m))); + auto* task_ptr = static_cast*>(t); + auto* ld_ptr = static_cast(ld); + task_ptr->add_deps(task_dep_untyped(*ld_ptr, access_mode(m))); } void stf_task_add_dep_with_dplace( - stf_task_handle t, stf_logical_data_handle 
ld, stf_access_mode m, struct stf_data_place* data_p) + stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, stf_data_place* data_p) { assert(t); assert(ld); assert(data_p); - t->t.add_deps(task_dep_untyped(ld->ld, access_mode(m), to_data_place(data_p))); + auto* task_ptr = static_cast*>(t); + auto* ld_ptr = static_cast(ld); + task_ptr->add_deps(task_dep_untyped(*ld_ptr, access_mode(m), to_data_place(data_p))); } void* stf_task_get(stf_task_handle t, int index) { assert(t); - auto s = t->t.template get>(index); + + auto* task_ptr = static_cast*>(t); + auto s = task_ptr->template get>(index); return (void*) s.data_handle(); } void stf_task_start(stf_task_handle t) { assert(t); - t->t.start(); + + auto* task_ptr = static_cast*>(t); + task_ptr->start(); } void stf_task_end(stf_task_handle t) { assert(t); - t->t.end(); + + auto* task_ptr = static_cast*>(t); + task_ptr->end(); } void stf_task_enable_capture(stf_task_handle t) { assert(t); - t->t.enable_capture(); + + auto* task_ptr = static_cast*>(t); + task_ptr->enable_capture(); } CUstream stf_task_get_custream(stf_task_handle t) { assert(t); - return (CUstream) t->t.get_stream(); + + auto* task_ptr = static_cast*>(t); + return (CUstream) task_ptr->get_stream(); } void stf_task_destroy(stf_task_handle t) { assert(t); - delete t; + + auto* task_ptr = static_cast*>(t); + delete task_ptr; } /** @@ -218,25 +250,24 @@ void stf_task_destroy(stf_task_handle t) * */ -struct stf_cuda_kernel_handle_t -{ - // return type of ctx.cuda_kernel() - using kernel_type = decltype(::std::declval().cuda_kernel()); - kernel_type k; -}; - void stf_cuda_kernel_create(stf_ctx_handle ctx, stf_cuda_kernel_handle* k) { - assert(k); assert(ctx); + assert(k); - *k = new stf_cuda_kernel_handle_t{ctx->ctx.cuda_kernel()}; + auto* context_ptr = static_cast(ctx); + using kernel_type = decltype(context_ptr->cuda_kernel()); + *k = new kernel_type{context_ptr->cuda_kernel()}; } -void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, struct stf_exec_place* exec_p) +void stf_cuda_kernel_set_exec_place(stf_cuda_kernel_handle k, stf_exec_place* exec_p) { assert(k); - k->k.set_exec_place(to_exec_place(exec_p)); + assert(exec_p); + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + kernel_ptr->set_exec_place(to_exec_place(exec_p)); } void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol) @@ -244,7 +275,9 @@ void stf_cuda_kernel_set_symbol(stf_cuda_kernel_handle k, const char* symbol) assert(k); assert(symbol); - k->k.set_symbol(symbol); + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + kernel_ptr->set_symbol(symbol); } void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle ld, stf_access_mode m) @@ -252,46 +285,66 @@ void stf_cuda_kernel_add_dep(stf_cuda_kernel_handle k, stf_logical_data_handle l assert(k); assert(ld); - k->k.add_deps(task_dep_untyped(ld->ld, access_mode(m))); + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + auto* ld_ptr = static_cast(ld); + kernel_ptr->add_deps(task_dep_untyped(*ld_ptr, access_mode(m))); } void stf_cuda_kernel_start(stf_cuda_kernel_handle k) { assert(k); - k->k.start(); + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + kernel_ptr->start(); } void stf_cuda_kernel_add_desc_cufunc( stf_cuda_kernel_handle k, CUfunction cufunc, - dim3 gridDim_, - dim3 blockDim_, - size_t 
sharedMem_, + dim3 grid_dim_, + dim3 block_dim_, + size_t shared_mem_, int arg_cnt, const void** args) { - cuda_kernel_desc desc; - desc.configure_raw(cufunc, gridDim_, blockDim_, sharedMem_, arg_cnt, args); + assert(k); - k->k.add_kernel_desc(mv(desc)); + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + + cuda_kernel_desc desc; + desc.configure_raw(cufunc, grid_dim_, block_dim_, shared_mem_, arg_cnt, args); + kernel_ptr->add_kernel_desc(mv(desc)); } void* stf_cuda_kernel_get_arg(stf_cuda_kernel_handle k, int index) { - auto s = k->k.template get>(index); + assert(k); + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + auto s = kernel_ptr->template get>(index); return (void*) s.data_handle(); } void stf_cuda_kernel_end(stf_cuda_kernel_handle k) { assert(k); - k->k.end(); + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(k); + kernel_ptr->end(); } void stf_cuda_kernel_destroy(stf_cuda_kernel_handle t) { assert(t); - delete t; + + using kernel_type = decltype(::std::declval().cuda_kernel()); + auto* kernel_ptr = static_cast(t); + delete kernel_ptr; } } // extern "C" diff --git a/c/experimental/stf/test/CMakeLists.txt b/c/experimental/stf/test/CMakeLists.txt index f5613253a81..5776747de79 100644 --- a/c/experimental/stf/test/CMakeLists.txt +++ b/c/experimental/stf/test/CMakeLists.txt @@ -18,13 +18,6 @@ function(cccl_c_experimental_stf_add_test target_name_var source) CCCL::cudax ) - target_compile_definitions(${target_name} PRIVATE - TEST_CUB_PATH="-I${CCCL_SOURCE_DIR}/cub" - TEST_THRUST_PATH="-I${CCCL_SOURCE_DIR}/thrust" - TEST_LIBCUDACXX_PATH="-I${CCCL_SOURCE_DIR}/libcudacxx/include" - TEST_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}" - ) - add_test(NAME ${target_name} COMMAND ${target_name}) endfunction() diff --git a/c/experimental/stf/test/test_cuda_kernel.cu b/c/experimental/stf/test/test_cuda_kernel.cu index b5ba66b0f3a..05c0e7e8620 100644 --- a/c/experimental/stf/test/test_cuda_kernel.cu +++ b/c/experimental/stf/test/test_cuda_kernel.cu @@ -70,7 +70,8 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]") double* dX = (double*) stf_cuda_kernel_get_arg(k, 0); double* dY = (double*) stf_cuda_kernel_get_arg(k, 1); const void* args[4] = {&N, &alpha, &dX, &dY}; - stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); + cudaError_t err = stf_cuda_kernel_add_desc(k, (void*) axpy, 2, 4, 0, 4, args); + REQUIRE(err == cudaSuccess); stf_cuda_kernel_end(k); stf_cuda_kernel_destroy(k); diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index c7179d2a6fc..9f3e3925afb 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -59,7 +59,7 @@ def __call__(self, *args, **kwargs): for i, a in enumerate(args): # print(f"got one arg {a} is dep ? 
{isinstance(a, dep)}") if isinstance(a, dep): - if ctx == None: + if ctx is None: ld = a.get_ld() # This context will be used in the __call__ method itself # so we can create a temporary object from the handle diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 49605ced878..58fbb5e0f78 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -6,7 +6,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf +import cuda.cccl.experimental.stf as cudastf # noqa: E402 @cudastf.jit diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index e2d38308341..acb2e2ec3cb 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -10,7 +10,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf +import cuda.cccl.experimental.stf as cudastf # noqa: E402 class Plaintext: @@ -36,7 +36,6 @@ def print_values(self): with ctx.task( cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) ) as t: - nb_stream = cuda.external_stream(t.stream_ptr()) hvalues = t.numba_arguments() print([v for v in hvalues]) diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index 8adbf5454ed..ad23b57a4c4 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -10,7 +10,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf +import cuda.cccl.experimental.stf as cudastf # noqa: E402 class Plaintext: @@ -36,7 +36,6 @@ def print_values(self): with ctx.task( cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) ) as t: - nb_stream = cuda.external_stream(t.stream_ptr()) hvalues = t.numba_arguments() print([v for v in hvalues]) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index a992d26d7d2..7b9050c9694 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -11,7 +11,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -from cuda.cccl.experimental.stf._stf_bindings import ( +from cuda.cccl.experimental.stf._stf_bindings import ( # noqa: E402 context, data_place, exec_place, diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index 8c1349b89e5..b604558695e 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -12,7 +12,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -from cuda.cccl.experimental.stf._stf_bindings import ( +from cuda.cccl.experimental.stf._stf_bindings import ( # noqa: E402 context, rw, ) @@ -47,14 +47,12 @@ def test_pytorch(): torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), ): tX, tY = t.tensor_arguments() - tZ = tX * 4 + 1 with ( ctx.task(lY.read(), lZ.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), ): - tX, tZ = t.tensor_arguments() - tZ = tY * 2 - 3 + tX, _ = t.tensor_arguments() ctx.finalize() diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index 
8e52a72f00a..b1e4de75213 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -5,7 +5,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf +import cuda.cccl.experimental.stf as cudastf # noqa: E402 @cudastf.jit From c00c915f85c894e9216693c322afa8ca032f6d5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:05:29 +0200 Subject: [PATCH 165/221] Revert Python linting changes Keep only the C library updates from stf_c_lib merge. Revert all Python file modifications to maintain original code. --- python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py | 2 +- python/cuda_cccl/tests/stf/test_decorator.py | 2 +- python/cuda_cccl/tests/stf/test_fhe.py | 3 ++- python/cuda_cccl/tests/stf/test_fhe_decorator.py | 3 ++- python/cuda_cccl/tests/stf/test_numba.py | 2 +- python/cuda_cccl/tests/stf/test_pytorch.py | 6 ++++-- python/cuda_cccl/tests/stf/test_stencil_decorator.py | 2 +- 7 files changed, 12 insertions(+), 8 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py index 9f3e3925afb..c7179d2a6fc 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py @@ -59,7 +59,7 @@ def __call__(self, *args, **kwargs): for i, a in enumerate(args): # print(f"got one arg {a} is dep ? {isinstance(a, dep)}") if isinstance(a, dep): - if ctx is None: + if ctx == None: ld = a.get_ld() # This context will be used in the __call__ method itself # so we can create a temporary object from the handle diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 58fbb5e0f78..49605ced878 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -6,7 +6,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf # noqa: E402 +import cuda.cccl.experimental.stf as cudastf @cudastf.jit diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index acb2e2ec3cb..e2d38308341 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -10,7 +10,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf # noqa: E402 +import cuda.cccl.experimental.stf as cudastf class Plaintext: @@ -36,6 +36,7 @@ def print_values(self): with ctx.task( cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) ) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) hvalues = t.numba_arguments() print([v for v in hvalues]) diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index ad23b57a4c4..8adbf5454ed 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -10,7 +10,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf # noqa: E402 +import cuda.cccl.experimental.stf as cudastf class Plaintext: @@ -36,6 +36,7 @@ def print_values(self): with ctx.task( cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) ) as t: + nb_stream = 
cuda.external_stream(t.stream_ptr()) hvalues = t.numba_arguments() print([v for v in hvalues]) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 7b9050c9694..a992d26d7d2 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -11,7 +11,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -from cuda.cccl.experimental.stf._stf_bindings import ( # noqa: E402 +from cuda.cccl.experimental.stf._stf_bindings import ( context, data_place, exec_place, diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index b604558695e..8c1349b89e5 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -12,7 +12,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -from cuda.cccl.experimental.stf._stf_bindings import ( # noqa: E402 +from cuda.cccl.experimental.stf._stf_bindings import ( context, rw, ) @@ -47,12 +47,14 @@ def test_pytorch(): torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), ): tX, tY = t.tensor_arguments() + tZ = tX * 4 + 1 with ( ctx.task(lY.read(), lZ.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), ): - tX, _ = t.tensor_arguments() + tX, tZ = t.tensor_arguments() + tZ = tY * 2 - 3 ctx.finalize() diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index b1e4de75213..8e52a72f00a 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -5,7 +5,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf # noqa: E402 +import cuda.cccl.experimental.stf as cudastf @cudastf.jit From cdd0d85d489222e2cc90bd4239bbbf532a3f2f27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:07:52 +0200 Subject: [PATCH 166/221] Fix Python CMakeLists.txt: Update C library feature flags - Replace CCCL_ENABLE_C with CCCL_ENABLE_C_PARALLEL and CCCL_ENABLE_C_EXPERIMENTAL_STF - Align with updated C library configuration from stf_c_lib - Fixes CMake target resolution for cccl.c.parallel and cccl.c.experimental.stf --- python/cuda_cccl/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 0b3f99edc54..31e32d659c4 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -25,7 +25,8 @@ include(${_cccl_root}/cmake/CCCLGetDependencies.cmake) cccl_build_compiler_targets() # Build and install C++ library first -set(CCCL_ENABLE_C ON) +set(CCCL_ENABLE_C_PARALLEL ON) +set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) set(CCCL_ENABLE_UNSTABLE ON) set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) From afda29fb445faac008e4f7b30ad3b8228e5ec9c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:09:29 +0200 Subject: [PATCH 167/221] Fix Python build: Add missing CCCL_ENABLE_C master flag The c/ directory is only included if CCCL_ENABLE_C is ON. Added master flag alongside specific C library feature flags. 
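The reasoning in this message hinges on the top-level gate: at this point in the series the root CMakeLists.txt only descends into c/ when the master flag is set, roughly as in this simplified sketch (see the later hunks for the exact form):

# Simplified sketch of the root-level gate at this point in the series
if (CCCL_ENABLE_C)
  add_subdirectory(c)   # c/ then checks CCCL_ENABLE_C_PARALLEL / CCCL_ENABLE_C_EXPERIMENTAL_STF
endif()

The later patches in this range drop the master flag again once the root gate is reworked to test the two specific flags directly.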
--- python/cuda_cccl/CMakeLists.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 31e32d659c4..1daf9197976 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -25,9 +25,10 @@ include(${_cccl_root}/cmake/CCCLGetDependencies.cmake) cccl_build_compiler_targets() # Build and install C++ library first -set(CCCL_ENABLE_C_PARALLEL ON) -set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) -set(CCCL_ENABLE_UNSTABLE ON) +set(CCCL_ENABLE_C ON) # Master flag to enable c/ directory +set(CCCL_ENABLE_C_PARALLEL ON) # Enable C parallel library +set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) # Enable C experimental STF library +set(CCCL_ENABLE_UNSTABLE ON) # Enable unstable features set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) add_subdirectory(${_cccl_root} _parent_cccl) From 4f1f079a3830472c568ee335d1fa3e970c7f1ff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:10:07 +0200 Subject: [PATCH 168/221] Complete STF C library configuration: Enable all C library features and tests - Added CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING for STF tests - Added CCCL_C_PARALLEL_ENABLE_TESTING for parallel tests - Full C library support now enabled for Python builds --- python/cuda_cccl/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 1daf9197976..14f643eee74 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -27,7 +27,9 @@ cccl_build_compiler_targets() # Build and install C++ library first set(CCCL_ENABLE_C ON) # Master flag to enable c/ directory set(CCCL_ENABLE_C_PARALLEL ON) # Enable C parallel library -set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) # Enable C experimental STF library +set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) # Enable C experimental STF library +set(CCCL_C_PARALLEL_ENABLE_TESTING ON) # Enable parallel C tests +set(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING ON) # Enable STF C tests set(CCCL_ENABLE_UNSTABLE ON) # Enable unstable features set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) From ccfc41d35c079e1a2ee20ffa9557ed03e546510f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:12:30 +0200 Subject: [PATCH 169/221] Remove obsolete CCCL_ENABLE_C flag CCCL_ENABLE_C was replaced by specific flags in stf_c_lib: - c/ directory is now included if CCCL_ENABLE_C_PARALLEL OR CCCL_ENABLE_C_EXPERIMENTAL_STF - No master CCCL_ENABLE_C flag needed anymore --- python/cuda_cccl/CMakeLists.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 14f643eee74..1481430c112 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -25,9 +25,8 @@ include(${_cccl_root}/cmake/CCCLGetDependencies.cmake) cccl_build_compiler_targets() # Build and install C++ library first -set(CCCL_ENABLE_C ON) # Master flag to enable c/ directory -set(CCCL_ENABLE_C_PARALLEL ON) # Enable C parallel library -set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) # Enable C experimental STF library +set(CCCL_ENABLE_C_PARALLEL ON) # Enable C parallel library (triggers c/ directory) +set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) # Enable C 
experimental STF library (triggers c/ directory) set(CCCL_C_PARALLEL_ENABLE_TESTING ON) # Enable parallel C tests set(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING ON) # Enable STF C tests set(CCCL_ENABLE_UNSTABLE ON) # Enable unstable features From e4b8277af23d5c2df700f41faa455cda01f5dd12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:13:52 +0200 Subject: [PATCH 170/221] Update CMake configuration to match stf_c_lib structure - Replace CCCL_ENABLE_C with specific CCCL_ENABLE_C_PARALLEL and CCCL_ENABLE_C_EXPERIMENTAL_STF flags - Update c/ directory inclusion condition to use OR logic with specific flags - Update CMakePresets.json to match new flag structure - Remove obsolete cccl-c-stf preset (merged functionality) --- CMakeLists.txt | 5 +++-- CMakePresets.json | 33 ++++++++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0830d733b32..fe53b0daead 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,8 @@ option(CCCL_ENABLE_CUB "Enable the CUB developer build." OFF) option(CCCL_ENABLE_THRUST "Enable the Thrust developer build." OFF) option(CCCL_ENABLE_TESTING "Enable CUDA C++ Core Library tests." OFF) option(CCCL_ENABLE_EXAMPLES "Enable CUDA C++ Core Library examples." OFF) -option(CCCL_ENABLE_C "Enable CUDA C Core Library." OFF) +option(CCCL_ENABLE_C_PARALLEL "Enable CUDA C Parallel Library." OFF) +option(CCCL_ENABLE_C_EXPERIMENTAL_STF "Enable CUDA C CUDASTF Library." OFF) if ("NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") set(CCCL_ENABLE_BENCHMARKS OFF) @@ -82,7 +83,7 @@ if (CCCL_ENABLE_UNSTABLE) add_subdirectory(cudax) endif() -if (CCCL_ENABLE_C) +if (CCCL_ENABLE_C_PARALLEL OR CCCL_ENABLE_C_EXPERIMENTAL_STF) add_subdirectory(c) endif() diff --git a/CMakePresets.json b/CMakePresets.json index 537e7ebc88a..e4e012c60d2 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -21,7 +21,8 @@ "CCCL_ENABLE_CUDAX": false, "CCCL_ENABLE_TESTING": false, "CCCL_ENABLE_EXAMPLES": false, - "CCCL_ENABLE_C": false, + "CCCL_ENABLE_C_PARALLEL": false, + "CCCL_ENABLE_C_EXPERIMENTAL_STF": false, "libcudacxx_ENABLE_INSTALL_RULES": true, "CUB_ENABLE_INSTALL_RULES": true, "Thrust_ENABLE_INSTALL_RULES": true, @@ -63,7 +64,8 @@ "CCCL_ENABLE_TESTING": true, "CCCL_ENABLE_EXAMPLES": true, "CCCL_ENABLE_BENCHMARKS": true, - "CCCL_ENABLE_C": true, + "CCCL_ENABLE_C_PARALLEL": true, + "CCCL_ENABLE_C_EXPERIMENTAL_STF": true, "CCCL_IGNORE_DEPRECATED_CPP_DIALECT": true, "LIBCUDACXX_ENABLE_LIBCUDACXX_TESTS": true, "CUB_ENABLE_TESTING": true, @@ -88,7 +90,8 @@ "cudax_ENABLE_DIALECT_CPP17": true, "cudax_ENABLE_DIALECT_CPP20": true, "CCCL_C_Parallel_ENABLE_TESTING": true, - "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true + "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true, + "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true } }, { @@ -270,11 +273,22 @@ "displayName": "CCCL C Parallel Library", "inherits": "base", "cacheVariables": { - "CCCL_ENABLE_C": true, + "CCCL_ENABLE_C_PARALLEL": true, + "CCCL_ENABLE_C_EXPERIMENTAL_STF": false, "CCCL_C_Parallel_ENABLE_TESTING": true, "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true } }, + { + "name": "cccl-c-stf", + "displayName": "CCCL C CUDASTF Library", + "inherits": "base", + "cacheVariables": { + "CCCL_ENABLE_C_PARALLEL": false, + "CCCL_ENABLE_C_EXPERIMENTAL_STF": true, + "CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING": true + } + }, { "name": "packaging", "displayName": "CCCL Packaging Tests/Examples", @@ -296,7 +310,7 @@ "CCCL_ENABLE_THRUST": false, "CCCL_ENABLE_LIBCUDACXX": 
false, "CCCL_ENABLE_CUDAX": false, - "CCCL_ENABLE_C": false, + "CCCL_ENABLE_C_PARALLEL": false, "CCCL_ENABLE_TESTING": false, "CCCL_ENABLE_EXAMPLES": false, "CUB_ENABLE_EXAMPLES": false, @@ -428,6 +442,10 @@ "name": "cccl-c-parallel", "configurePreset": "cccl-c-parallel" }, + { + "name": "cccl-c-stf", + "configurePreset": "cccl-c-stf" + }, { "name": "packaging", "configurePreset": "packaging" @@ -721,6 +739,11 @@ "configurePreset": "cccl-c-parallel", "inherits": "base" }, + { + "name": "cccl-c-stf", + "configurePreset": "cccl-c-stf", + "inherits": "base" + }, { "name": "packaging", "configurePreset": "packaging", From 6931fa8ba61773630920e8ea7f872a88b15ce7bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:16:37 +0200 Subject: [PATCH 171/221] Optimize Python build: Remove unnecessary C parallel library - Set CCCL_ENABLE_C_PARALLEL OFF (not needed for STF Python bindings) - Set testing flags OFF (testing belongs in CI, not Python build) - Remove parallel library installation and binding sections - Keep only STF C library and STF Python bindings - Cleaner, faster Python build focused on STF functionality --- python/cuda_cccl/CMakeLists.txt | 46 ++++----------------------------- 1 file changed, 5 insertions(+), 41 deletions(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 1481430c112..5db487e7eb2 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -25,12 +25,11 @@ include(${_cccl_root}/cmake/CCCLGetDependencies.cmake) cccl_build_compiler_targets() # Build and install C++ library first -set(CCCL_ENABLE_C_PARALLEL ON) # Enable C parallel library (triggers c/ directory) +set(CCCL_ENABLE_C_PARALLEL OFF) # Not needed for STF Python bindings set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) # Enable C experimental STF library (triggers c/ directory) -set(CCCL_C_PARALLEL_ENABLE_TESTING ON) # Enable parallel C tests -set(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING ON) # Enable STF C tests +set(CCCL_C_PARALLEL_ENABLE_TESTING OFF) # Testing belongs in CI, not Python build +set(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING OFF) # Testing belongs in CI, not Python build set(CCCL_ENABLE_UNSTABLE ON) # Enable unstable features -set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) add_subdirectory(${_cccl_root} _parent_cccl) @@ -60,16 +59,7 @@ install( DESTINATION ${_dest_incl_dir} ) -# ensure the destination directory exists -file(MAKE_DIRECTORY "cuda/cccl/parallel/experimental/${CUDA_VERSION_DIR}/cccl") - -# Install version-specific binaries -install( - TARGETS cccl.c.parallel - DESTINATION cuda/cccl/parallel/experimental/${CUDA_VERSION_DIR}/cccl -) - - +# ensure the destination directory exists file(MAKE_DIRECTORY "cuda/cccl/experimental/stf/${CUDA_VERSION_DIR}/cccl") install( @@ -112,33 +102,7 @@ endif() set(CYTHON_FLAGS "-3 -M -t -w \"${cuda_cccl_SOURCE_DIR}\"") string(REGEX REPLACE " " ";" CYTHON_FLAGS_LIST "${CYTHON_FLAGS}") -message(STATUS "Using Cython ${CYTHON_VERSION}") -set(pyx_source_file "${cuda_cccl_SOURCE_DIR}/cuda/cccl/parallel/experimental/_bindings_impl.pyx") - -set(_generated_extension_src "${cuda_cccl_BINARY_DIR}/_bindings_impl.c") -set(_depfile "${cuda_cccl_BINARY_DIR}/_bindings_impl.c.dep") - -# Custom Cython compilation command for version-specific target -add_custom_command( - OUTPUT "${_generated_extension_src}" - COMMAND "${Python3_EXECUTABLE}" -m cython - ARGS ${CYTHON_FLAGS_LIST} 
"${pyx_source_file}" --output-file "${_generated_extension_src}" - DEPENDS "${pyx_source_file}" - DEPFILE "${_depfile}" - COMMENT "Cythonizing ${pyx_source_file} for CUDA ${CUDA_VERSION_MAJOR}" -) - -set_source_files_properties("${_generated_extension_src}" PROPERTIES GENERATED TRUE) -add_custom_target(cythonize_bindings_impl ALL - DEPENDS "${_generated_extension_src}" -) - -Python3_add_library(_bindings_impl MODULE WITH_SOABI "${_generated_extension_src}") -add_dependencies(_bindings_impl cythonize_bindings_impl) -target_link_libraries(_bindings_impl PRIVATE cccl.c.parallel CUDA::cuda_driver) -set_target_properties(_bindings_impl PROPERTIES INSTALL_RPATH "$ORIGIN/cccl") - -install(TARGETS _bindings_impl DESTINATION cuda/cccl/parallel/experimental/${CUDA_VERSION_DIR}) +# Only building STF bindings - parallel bindings not needed message(STATUS "STF Using Cython ${CYTHON_VERSION}") set(stf_pyx_source_file "${cuda_cccl_SOURCE_DIR}/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx") From a1a113959ea7ccc47bf35a6f9a7c70a072555cc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:20:06 +0200 Subject: [PATCH 172/221] clang-format --- python/cuda_cccl/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 5db487e7eb2..0b34563303e 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -59,7 +59,7 @@ install( DESTINATION ${_dest_incl_dir} ) -# ensure the destination directory exists +# ensure the destination directory exists file(MAKE_DIRECTORY "cuda/cccl/experimental/stf/${CUDA_VERSION_DIR}/cccl") install( From ecd9f4e586b94ee6327bb5ea3318f96a7d13bc1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:52:20 +0200 Subject: [PATCH 173/221] fix pytorch example --- python/cuda_cccl/tests/stf/test_pytorch.py | 50 +++++++++++++++++++--- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index 8c1349b89e5..3e2d03ac650 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -12,7 +12,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -from cuda.cccl.experimental.stf._stf_bindings import ( +from cuda.cccl.experimental.stf._stf_bindings import ( # noqa: E402 context, rw, ) @@ -33,31 +33,67 @@ def test_pytorch(): torch_stream = torch.cuda.ExternalStream(t.stream_ptr()) with torch.cuda.stream(torch_stream): tX = t.tensor_arguments() - tX = tX * 2 + tX[:] = tX * 2 # In-place multiplication with ctx.task(lX.read(), lY.write()) as t: torch_stream = torch.cuda.ExternalStream(t.stream_ptr()) with torch.cuda.stream(torch_stream): tX = t.get_arg_as_tensor(0) tY = t.get_arg_as_tensor(1) - tY = tX * 2 + tY[:] = tX * 2 # Copy result into tY tensor with ( ctx.task(lX.read(), lZ.write()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), ): - tX, tY = t.tensor_arguments() - tZ = tX * 4 + 1 + tX, tZ = t.tensor_arguments() # Get tX and tZ tensors + tZ[:] = tX * 4 + 1 # Copy result into tZ tensor with ( ctx.task(lY.read(), lZ.rw()) as t, torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), ): - tX, tZ = t.tensor_arguments() - tZ = tY * 2 - 3 + tY, tZ = t.tensor_arguments() # Get tY and tZ tensors + tZ[:] = tY * 2 - 3 # Copy result into tZ tensor ctx.finalize() + # Verify results on host after finalize + 
print("Verifying results...") + + # Expected values: + # X: 1.0 -> 2.0 (multiplied by 2) + # Y: 1.0 -> 4.0 (X * 2 = 2.0 * 2 = 4.0) + # Z: 1.0 -> 9.0 (X * 4 + 1 = 2.0 * 4 + 1 = 9.0) -> 5.0 (Y * 2 - 3 = 4.0 * 2 - 3 = 5.0) + + expected_X = 2.0 + expected_Y = 4.0 + expected_Z = 5.0 + + # Check a few values to verify correctness + assert np.allclose(X[:10], expected_X), ( + f"X mismatch: got {X[:10]}, expected {expected_X}" + ) + assert np.allclose(Y[:10], expected_Y), ( + f"Y mismatch: got {Y[:10]}, expected {expected_Y}" + ) + assert np.allclose(Z[:10], expected_Z), ( + f"Z mismatch: got {Z[:10]}, expected {expected_Z}" + ) + + # Check entire arrays + assert np.all(X == expected_X), ( + f"X array not uniform: min={X.min()}, max={X.max()}, expected={expected_X}" + ) + assert np.all(Y == expected_Y), ( + f"Y array not uniform: min={Y.min()}, max={Y.max()}, expected={expected_Y}" + ) + assert np.all(Z == expected_Z), ( + f"Z array not uniform: min={Z.min()}, max={Z.max()}, expected={expected_Z}" + ) + + print(f"✅ All checks passed! X={X[0]}, Y={Y[0]}, Z={Z[0]}") + if __name__ == "__main__": print("Running CUDASTF examples...") From 4b2ae75316789ab8e4af8f6f4ece9061d588fc7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 9 Sep 2025 13:56:48 +0200 Subject: [PATCH 174/221] use ascii symbols --- python/cuda_cccl/tests/stf/test_numba.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index a992d26d7d2..5e9c83153a0 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -88,19 +88,19 @@ def test_numba(): @cuda.jit def laplacian_5pt_kernel(u_in, u_out, dx, dy): """ - Compute a 5‑point Laplacian on u_in and write the result to u_out. + Compute a 5-point Laplacian on u_in and write the result to u_out. - Grid‑stride 2‑D kernel. Assumes C‑contiguous (row‑major) inputs. + Grid-stride 2-D kernel. Assumes C-contiguous (row-major) inputs. Boundary cells are copied unchanged. 
""" coef_x = 1.0 / (dx * dx) coef_y = 1.0 / (dy * dy) - i, j = cuda.grid(2) # i ↔ row (x‑index), j ↔ col (y‑index) + i, j = cuda.grid(2) # i <-> row (x-index), j <-> col (y-index) nx, ny = u_in.shape if i >= nx or j >= ny: - return # out‑of‑bounds threads do nothing + return # out-of-bounds threads do nothing if 0 < i < nx - 1 and 0 < j < ny - 1: u_out[i, j] = (u_in[i - 1, j] - 2.0 * u_in[i, j] + u_in[i + 1, j]) * coef_x + ( From dcb3d39735749c9c0aeab142754e27c04e803f27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 10 Sep 2025 07:13:29 +0200 Subject: [PATCH 175/221] Cleanup some changes in the infra from a previous merge --- CMakeLists.txt | 2 -- CMakePresets.json | 1 + ci/matrix.yaml | 3 --- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5467357fabb..fe53b0daead 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,8 +87,6 @@ if (CCCL_ENABLE_C_PARALLEL OR CCCL_ENABLE_C_EXPERIMENTAL_STF) add_subdirectory(c) endif() - - if (CCCL_ENABLE_TESTING) add_subdirectory(test) endif() diff --git a/CMakePresets.json b/CMakePresets.json index 7872b32d57e..475d6852bb3 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -309,6 +309,7 @@ "CCCL_ENABLE_LIBCUDACXX": false, "CCCL_ENABLE_CUDAX": false, "CCCL_ENABLE_C_PARALLEL": false, + "CCCL_ENABLE_C_EXPERIMENTAL_STF": false, "CCCL_ENABLE_TESTING": false, "CCCL_ENABLE_EXAMPLES": false, "CUB_ENABLE_EXAMPLES": false, diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 5a2d9715998..dc9f6fb0115 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -8,9 +8,6 @@ workflows: # - {jobs: ['test'], project: 'thrust', std: 17, ctk: '12.X', cxx: ['gcc12', 'clang16']} # override: - # Python and c/parallel jobs: - - {jobs: ['test'], project: ['cccl_c_parallel'], gpu: ['l4']} - - {jobs: ['test'], project: ['cccl_c_stf'], gpu: ['l4']} pull_request: # Old CTK: Oldest/newest supported host compilers: From 1284eb2d16a1550cab8c32104475806f0b26af8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 10 Sep 2025 08:43:28 +0200 Subject: [PATCH 176/221] Implement logical_data_empty logical_data_zeros, and logical_data_full --- .../experimental/stf/_adapters/numba_utils.py | 84 +++++++++ .../experimental/stf/_stf_bindings_impl.pyx | 163 +++++++++++++++++- .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 54 ++---- 3 files changed, 261 insertions(+), 40 deletions(-) create mode 100644 python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py new file mode 100644 index 00000000000..cfa8b27a041 --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py @@ -0,0 +1,84 @@ +""" +Utilities for NUMBA-based STF operations. +""" +import numba +from numba import cuda +import numpy as np + + +def init_logical_data(ctx, ld, value, data_place=None, exec_place=None): + """ + Initialize a logical data with a constant value using CuPy's optimized fill. 
+ + Parameters + ---------- + ctx : context + STF context + ld : logical_data + Logical data to initialize + value : scalar + Value to fill the array with + data_place : data_place, optional + Data place for the initialization task + exec_place : exec_place, optional + Execution place for the fill operation + """ + # Create write dependency with optional data place + dep_arg = ld.write(data_place) if data_place else ld.write() + + # Create task arguments - include exec_place if provided + task_args = [] + if exec_place is not None: + task_args.append(exec_place) + task_args.append(dep_arg) + + with ctx.task(*task_args) as t: + # Get the array as a numba device array + nb_stream = cuda.external_stream(t.stream_ptr()) + array = t.numba_arguments() + + try: + # Use CuPy's optimized operations (much faster than custom kernels) + import cupy as cp + with cp.cuda.Stream(nb_stream): + cp_view = cp.asarray(array) + if value == 0 or value == 0.0: + # Use CuPy's potentially optimized zero operation + cp_view.fill(0) # CuPy may have special optimizations for zero + else: + # Use generic fill for non-zero values + cp_view.fill(value) + except ImportError: + # Fallback to simple kernel if CuPy not available + _fill_with_simple_kernel(array, value, nb_stream) + + +@cuda.jit +def _fill_kernel_fallback(array, value): + """Fallback 1D kernel when CuPy is not available.""" + idx = cuda.grid(1) + if idx < array.size: + array.flat[idx] = value + + +@cuda.jit +def _zero_kernel_fallback(array): + """Optimized fallback kernel for zero-filling when CuPy is not available.""" + idx = cuda.grid(1) + if idx < array.size: + array.flat[idx] = 0 + + +def _fill_with_simple_kernel(array, value, stream): + """Fallback method using simple NUMBA kernel when CuPy unavailable.""" + total_size = array.size + threads_per_block = 256 + blocks_per_grid = (total_size + threads_per_block - 1) // threads_per_block + + if value == 0 or value == 0.0: + # Use the specialized zero kernel for potentially better performance + _zero_kernel_fallback[blocks_per_grid, threads_per_block, stream](array) + else: + # Use generic fill kernel for non-zero values + typed_value = array.dtype.type(value) + _fill_kernel_fallback[blocks_per_grid, threads_per_block, stream](array, typed_value) \ No newline at end of file diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index ceca943e155..084ea6476bd 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -523,9 +523,170 @@ cdef class context: """ return logical_data(self, buf) - def logical_data_by_shape(self, shape, dtype): + + def logical_data_empty(self, shape, dtype=None): + """ + Create logical data with uninitialized values. + + Equivalent to numpy.empty() but for STF logical data. + + Parameters + ---------- + shape : tuple + Shape of the array + dtype : numpy.dtype, optional + Data type. Defaults to np.float64. 
+ + Returns + ------- + logical_data + New logical data with uninitialized values + + Examples + -------- + >>> # Create uninitialized array (fast but contains garbage) + >>> ld = ctx.logical_data_empty((100, 100), dtype=np.float32) + + >>> # Fast allocation without initialization + >>> ld = ctx.logical_data_empty((50, 50, 50)) + """ + if dtype is None: + dtype = np.float64 return logical_data.init_by_shape(self, shape, dtype) + def logical_data_full(self, shape, fill_value, dtype=None, where=None, exec_place=None): + """ + Create logical data initialized with a constant value. + + Similar to numpy.full(), this creates a new logical data with the given + shape and fills it with fill_value. + + Parameters + ---------- + shape : tuple + Shape of the array + fill_value : scalar + Value to fill the array with + dtype : numpy.dtype, optional + Data type. If None, infer from fill_value. + where : data_place, optional + Data placement for initialization. Defaults to current device. + exec_place : exec_place, optional + Execution place for the fill operation. Defaults to current device. + Note: exec_place.host() is not yet supported. + + Returns + ------- + logical_data + New logical data initialized with fill_value + + Examples + -------- + >>> # Create array filled with epsilon0 on current device + >>> ld = ctx.logical_data_full((100, 100), 8.85e-12, dtype=np.float64) + + >>> # Create array on host memory + >>> ld = ctx.logical_data_full((50, 50), 1.0, where=data_place.host()) + + >>> # Create array on specific device, execute on device 1 + >>> ld = ctx.logical_data_full((200, 200), 0.0, where=data_place.device(0), + ... exec_place=exec_place.device(1)) + """ + # Infer dtype from fill_value if not provided + if dtype is None: + dtype = np.array(fill_value).dtype + else: + dtype = np.dtype(dtype) + + # Validate exec_place - host execution not yet supported + if exec_place is not None: + if hasattr(exec_place, 'kind') and exec_place.kind == "host": + raise NotImplementedError( + "exec_place.host() is not yet supported for logical_data_full. " + "Use exec_place.device() or omit exec_place parameter." + ) + + # Create empty logical data + ld = self.logical_data_empty(shape, dtype) + + # Initialize with the specified value using NUMBA + # The numba code already handles None properly by calling ld.write() without data place + try: + from cuda.cccl.experimental.stf._adapters.numba_utils import init_logical_data + init_logical_data(self, ld, fill_value, where, exec_place) + except ImportError as e: + raise RuntimeError("NUMBA support is not available for logical_data_full") from e + + return ld + + def logical_data_zeros(self, shape, dtype=None, where=None, exec_place=None): + """ + Create logical data filled with zeros. + + Equivalent to numpy.zeros() but for STF logical data. + + Parameters + ---------- + shape : tuple + Shape of the array + dtype : numpy.dtype, optional + Data type. Defaults to np.float64. + where : data_place, optional + Data placement. Defaults to current device. + exec_place : exec_place, optional + Execution place for the fill operation. Defaults to current device. 
+ + Returns + ------- + logical_data + New logical data filled with zeros + + Examples + -------- + >>> # Create zero-filled array + >>> ld = ctx.logical_data_zeros((100, 100), dtype=np.float32) + + >>> # Create on host memory + >>> ld = ctx.logical_data_zeros((50, 50), where=data_place.host()) + """ + if dtype is None: + dtype = np.float64 + return self.logical_data_full(shape, 0.0, dtype, where, exec_place) + + def logical_data_ones(self, shape, dtype=None, where=None, exec_place=None): + """ + Create logical data filled with ones. + + Equivalent to numpy.ones() but for STF logical data. + + Parameters + ---------- + shape : tuple + Shape of the array + dtype : numpy.dtype, optional + Data type. Defaults to np.float64. + where : data_place, optional + Data placement. Defaults to current device. + exec_place : exec_place, optional + Execution place for the fill operation. Defaults to current device. + + Returns + ------- + logical_data + New logical data filled with ones + + Examples + -------- + >>> # Create ones-filled array + >>> ld = ctx.logical_data_ones((100, 100), dtype=np.float32) + + >>> # Create on specific device + >>> ld = ctx.logical_data_ones((50, 50), exec_place=exec_place.device(1)) + """ + if dtype is None: + dtype = np.float64 + return self.logical_data_full(shape, 1.0, dtype, where, exec_place) + def task(self, *args): """ Create a `task` diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 3d1d6b0d2df..7c3ca2a8215 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -44,18 +44,6 @@ def show_slice(t3d, plane="xy", index=None): plt.pause(0.01) -def init_field(ctx, ld, value): - with ( - ctx.task(ld.write()) as t, - torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), - ): - field = t.tensor_arguments() - if value == 0: - field.zero_() - else: - field.fill_(value) - - def fdtd_3d_pytorch( size_x: int = 150, size_y: int = 150, @@ -74,34 +62,22 @@ def fdtd_3d_pytorch( ]: ctx = context() - # allocate fields + # allocate and initialize fields shape = (size_x, size_y, size_z) - # ex_ = torch.zeros(shape, dtype=dtype, device=device) - lex = ctx.logical_data_by_shape(shape, np.float64) - ley = ctx.logical_data_by_shape(shape, np.float64) - lez = ctx.logical_data_by_shape(shape, np.float64) - - # epsilon_ = torch.full(shape, float(epsilon0), dtype=np.float64, device=device) - # mu_ = torch.full(shape, float(mu0), dtype=np.float64, device=device) - - lhx = ctx.logical_data_by_shape(shape, np.float64) - lhy = ctx.logical_data_by_shape(shape, np.float64) - lhz = ctx.logical_data_by_shape(shape, np.float64) - - # lepsilon = ctx.logical_data() - # lmu = ctx.logical_data(mu_) - lepsilon = ctx.logical_data_by_shape(shape, np.float64) - lmu = ctx.logical_data_by_shape(shape, np.float64) - - # TODO ctx.full(...) 
- init_field(ctx, lex, float(0.0)) - init_field(ctx, ley, float(0.0)) - init_field(ctx, lez, float(0.0)) - init_field(ctx, lhx, float(0.0)) - init_field(ctx, lhy, float(0.0)) - init_field(ctx, lhz, float(0.0)) - init_field(ctx, lepsilon, float(epsilon0)) - init_field(ctx, lmu, float(mu0)) + + # Electric field components (initialized to zero) + lex = ctx.logical_data_zeros(shape, dtype=np.float64) + ley = ctx.logical_data_zeros(shape, dtype=np.float64) + lez = ctx.logical_data_zeros(shape, dtype=np.float64) + + # Magnetic field components (initialized to zero) + lhx = ctx.logical_data_zeros(shape, dtype=np.float64) + lhy = ctx.logical_data_zeros(shape, dtype=np.float64) + lhz = ctx.logical_data_zeros(shape, dtype=np.float64) + + # Material properties + lepsilon = ctx.logical_data_full(shape, float(epsilon0), dtype=np.float64) + lmu = ctx.logical_data_full(shape, float(mu0), dtype=np.float64) # CFL (same formula as example) dt = 0.25 * min(dx, dy, dz) * math.sqrt(epsilon0 * mu0) From 0514f29c7c00fe1b8e08f0273584d16a3f8f455a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 10 Sep 2025 09:44:19 +0200 Subject: [PATCH 177/221] short names for torch.cuda --- .../experimental/stf/_adapters/numba_utils.py | 18 +++---- .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 17 +++---- python/cuda_cccl/tests/stf/test_numba.py | 47 +++++++++++++++++++ 3 files changed, 66 insertions(+), 16 deletions(-) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py index cfa8b27a041..280d8f3a55d 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py @@ -1,15 +1,14 @@ """ Utilities for NUMBA-based STF operations. """ -import numba + from numba import cuda -import numpy as np def init_logical_data(ctx, ld, value, data_place=None, exec_place=None): """ Initialize a logical data with a constant value using CuPy's optimized fill. 
- + Parameters ---------- ctx : context @@ -25,21 +24,22 @@ def init_logical_data(ctx, ld, value, data_place=None, exec_place=None): """ # Create write dependency with optional data place dep_arg = ld.write(data_place) if data_place else ld.write() - + # Create task arguments - include exec_place if provided task_args = [] if exec_place is not None: task_args.append(exec_place) task_args.append(dep_arg) - + with ctx.task(*task_args) as t: # Get the array as a numba device array nb_stream = cuda.external_stream(t.stream_ptr()) array = t.numba_arguments() - + try: # Use CuPy's optimized operations (much faster than custom kernels) import cupy as cp + with cp.cuda.Stream(nb_stream): cp_view = cp.asarray(array) if value == 0 or value == 0.0: @@ -74,11 +74,13 @@ def _fill_with_simple_kernel(array, value, stream): total_size = array.size threads_per_block = 256 blocks_per_grid = (total_size + threads_per_block - 1) // threads_per_block - + if value == 0 or value == 0.0: # Use the specialized zero kernel for potentially better performance _zero_kernel_fallback[blocks_per_grid, threads_per_block, stream](array) else: # Use generic fill kernel for non-zero values typed_value = array.dtype.type(value) - _fill_kernel_fallback[blocks_per_grid, threads_per_block, stream](array, typed_value) \ No newline at end of file + _fill_kernel_fallback[blocks_per_grid, threads_per_block, stream]( + array, typed_value + ) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 7c3ca2a8215..2c233eefd76 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -4,6 +4,7 @@ import matplotlib.pyplot as plt import numpy as np import torch +import torch.cuda as tc from cuda.cccl.experimental.stf._stf_bindings import ( context, @@ -108,7 +109,7 @@ def source(t: float, x: float, y: float, z: float) -> float: # Ex(i,j,k) += (dt/(ε*dx)) * [(Hz(i,j,k)-Hz(i,j-1,k)) - (Hy(i,j,k)-Hy(i,j,k-1))] with ( ctx.task(lex.rw(), lhy.read(), lhz.read(), lepsilon.read()) as t, - torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + tc.stream(tc.ExternalStream(t.stream_ptr())), ): ex, hy, hz, epsilon = t.tensor_arguments() ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + ( @@ -121,7 +122,7 @@ def source(t: float, x: float, y: float, z: float) -> float: # Ey(i,j,k) += (dt/(ε*dy)) * [(Hx(i,j,k)-Hx(i,j,k-1)) - (Hz(i,j,k)-Hz(i-1,j,k))] with ( ctx.task(ley.rw(), lhx.read(), lhz.read(), lepsilon.read()) as t, - torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + tc.stream(tc.ExternalStream(t.stream_ptr())), ): ey, hx, hz, epsilon = t.tensor_arguments() ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + ( @@ -134,7 +135,7 @@ def source(t: float, x: float, y: float, z: float) -> float: # Ez(i,j,k) += (dt/(ε*dz)) * [(Hy(i,j,k)-Hy(i-1,j,k)) - (Hx(i,j,k)-Hx(i,j-1,k))] with ( ctx.task(lez.rw(), lhx.read(), lhy.read(), lepsilon.read()) as t, - torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + tc.stream(tc.ExternalStream(t.stream_ptr())), ): ez, hx, hy, epsilon = t.tensor_arguments() ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + ( @@ -147,7 +148,7 @@ def source(t: float, x: float, y: float, z: float) -> float: # source at center cell with ( ctx.task(lez.rw()) as t, - torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + tc.stream(tc.ExternalStream(t.stream_ptr())), ): ez = t.tensor_arguments() ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) @@ -157,7 +158,7 @@ def 
source(t: float, x: float, y: float, z: float) -> float: # Hx(i,j,k) -= (dt/(μ*dy)) * [(Ez(i,j+1,k)-Ez(i,j,k)) - (Ey(i,j,k+1)-Ey(i,j,k))] with ( ctx.task(lhx.rw(), ley.read(), lez.read(), lmu.read()) as t, - torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + tc.stream(tc.ExternalStream(t.stream_ptr())), ): hx, ey, ez, mu = t.tensor_arguments() hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - ( @@ -170,7 +171,7 @@ def source(t: float, x: float, y: float, z: float) -> float: # Hy(i,j,k) -= (dt/(μ*dz)) * [(Ex(i,j,k+1)-Ex(i,j,k)) - (Ez(i+1,j,k)-Ez(i,j,k))] with ( ctx.task(lhy.rw(), lex.read(), lez.read(), lmu.read()) as t, - torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + tc.stream(tc.ExternalStream(t.stream_ptr())), ): hy, ex, ez, mu = t.tensor_arguments() hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - ( @@ -183,7 +184,7 @@ def source(t: float, x: float, y: float, z: float) -> float: # Hz(i,j,k) -= (dt/(μ*dx)) * [(Ey(i+1,j,k)-Ey(i,j,k)) - (Ex(i,j+1,k)-Ex(i,j,k))] with ( ctx.task(lhz.rw(), lex.read(), ley.read(), lmu.read()) as t, - torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + tc.stream(tc.ExternalStream(t.stream_ptr())), ): hz, ex, ey, mu = t.tensor_arguments() hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - ( @@ -196,7 +197,7 @@ def source(t: float, x: float, y: float, z: float) -> float: if output_freq > 0 and (n % output_freq) == 0: with ( ctx.task(lez.read()) as t, - torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + tc.stream(tc.ExternalStream(t.stream_ptr())), ): ez = t.tensor_arguments() print(f"{n}\t{ez[cx, cy, cz].item():.6e}") diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 5e9c83153a0..72b9609276f 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -47,6 +47,13 @@ def test_numba_graph(): ctx.finalize() + # Verify results after finalize (data written back to host) + # Expected: scale(2.0, 1.0) = 2.0 + if np.allclose(X, 2.0): + print("✅ Graph test: X values correct: all 2.0") + else: + print(f"❌ Graph test: X values incorrect: expected 2.0, got {X[:5]}...") + def test_numba(): n = 1024 * 1024 @@ -84,6 +91,46 @@ def test_numba(): ctx.finalize() + # Verify results after finalize (data written back to host) + print("Verifying results after finalize:") + + # Expected values: + # X: scale(2.0, 1.0) = 2.0 + # Y: axpy(2.0, X=2.0, Y=1.0) = 2.0*2.0 + 1.0 = 5.0 + # Z: axpy(2.0, X=2.0, Z=1.0) = 5.0, then axpy(2.0, Y=5.0, Z=5.0) = 15.0 + expected_X = 2.0 + expected_Y = 5.0 + expected_Z = 15.0 + + # Check X values + if np.allclose(X, expected_X, rtol=1e-6, atol=1e-6): + print(f"✅ X values correct: all {expected_X}") + else: + actual_x = X[0] if len(X) > 0 else "N/A" + print( + f"❌ X values incorrect: expected {expected_X}, got {actual_x} (diff: {abs(actual_x - expected_X):.2e})" + ) + + # Check Y values + if np.allclose(Y, expected_Y, rtol=1e-6, atol=1e-6): + print(f"✅ Y values correct: all {expected_Y}") + else: + actual_y = Y[0] if len(Y) > 0 else "N/A" + print( + f"❌ Y values incorrect: expected {expected_Y}, got {actual_y} (diff: {abs(actual_y - expected_Y):.2e})" + ) + + # Check Z values + if np.allclose(Z, expected_Z, rtol=1e-6, atol=1e-6): + print(f"✅ Z values correct: all {expected_Z}") + else: + actual_z = Z[0] if len(Z) > 0 else "N/A" + print( + f"❌ Z values incorrect: expected {expected_Z}, got {actual_z} (diff: {abs(actual_z - expected_Z):.2e})" + ) + + print(f"Sample values: X[0]={X[0]}, Y[0]={Y[0]}, Z[0]={Z[0]}") + 
@cuda.jit def laplacian_5pt_kernel(u_in, u_out, dx, dy): From 5e9b4d597d0504b7a97a158ec9ab72d7a72126ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 10 Sep 2025 10:41:11 +0200 Subject: [PATCH 178/221] Introduce pytorch_task --- .../experimental/stf/_stf_bindings_impl.pyx | 80 ++++++++ .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 4 +- .../tests/stf/test_fdtd_pytorch_simplified.py | 186 ++++++++++++++++++ python/cuda_cccl/tests/stf/test_pytorch.py | 69 +++++++ 4 files changed, 337 insertions(+), 2 deletions(-) create mode 100644 python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 084ea6476bd..2d7451f0985 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -473,6 +473,53 @@ cdef class task: self.end() return False +cdef class pytorch_task_context: + """ + Context manager for PyTorch-integrated STF tasks. + + This class automatically handles: + - Task start/end + - PyTorch stream context + - Tensor argument conversion and unpacking + """ + cdef task _task + cdef object _torch_stream_context + + def __cinit__(self, task t): + self._task = t + self._torch_stream_context = None + + def __enter__(self): + # Import torch here since we know it's available (checked in pytorch_task) + import torch.cuda as tc + + # Start the underlying task + self._task.start() + + # Create torch stream context from task stream + torch_stream = tc.ExternalStream(self._task.stream_ptr()) + self._torch_stream_context = tc.stream(torch_stream) + self._torch_stream_context.__enter__() + + # Get tensor arguments and return them + tensors = self._task.tensor_arguments() + + # If only one tensor, return it directly; otherwise return tuple + if isinstance(tensors, tuple): + return tensors + else: + return (tensors,) + + def __exit__(self, exc_type, exc_val, exc_tb): + try: + # Exit torch stream context first + if self._torch_stream_context is not None: + self._torch_stream_context.__exit__(exc_type, exc_val, exc_tb) + finally: + # Always end the task + self._task.end() + return False + cdef class context: cdef stf_ctx_handle _ctx # Is this a context that we have borrowed ? @@ -712,3 +759,36 @@ cdef class context: "Arguments must be dependency objects or an exec_place" ) return t + + def pytorch_task(self, *args): + """ + Create a PyTorch-integrated task that returns tensors directly. + Only available if PyTorch is installed. + + This is a convenience method that combines task creation with automatic + PyTorch stream management and tensor conversion. + + Example + ------- + >>> with ctx.pytorch_task(read(lX), rw(lY)) as (x_tensor, y_tensor): + >>> # Automatic PyTorch stream context and tensor unpacking + >>> y_tensor[:] = x_tensor * 2 + + Returns + ------- + pytorch_task_context : Context manager that yields tensor arguments + """ + # Check if PyTorch is available + try: + import torch + except ImportError: + raise RuntimeError( + "pytorch_task requires PyTorch to be installed. " + "Install PyTorch or use the regular task() method." 
+ ) + + # Create the underlying task + t = self.task(*args) + + # Return a PyTorch-specific context manager + return pytorch_task_context(t) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 2c233eefd76..5712dd74b0a 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -210,6 +210,6 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": - # quick check + # Run FDTD simulation ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=1000, output_freq=5) -# print("done; Ez(center) =", ez[50, 50, 50].item()) + print(f"Simulation completed; Ez(center) = {ez[75, 15, 75].item():.6e}") diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py new file mode 100644 index 00000000000..a26268f38d2 --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py @@ -0,0 +1,186 @@ +import math +from typing import Literal, Optional, Tuple + +import matplotlib.pyplot as plt +import numpy as np +import torch + +from cuda.cccl.experimental.stf._stf_bindings import ( + context, +) + +Plane = Literal["xy", "xz", "yz"] + + +def show_slice(t3d, plane="xy", index=None): + # grab a 2D view + if plane == "xy": + idx = t3d.shape[2] // 2 if index is None else index + slice2d = t3d[:, :, idx] + elif plane == "xz": + idx = t3d.shape[1] // 2 if index is None else index + slice2d = t3d[:, idx, :] + elif plane == "yz": + idx = t3d.shape[0] // 2 if index is None else index + slice2d = t3d[idx, :, :] + else: + raise ValueError("plane must be 'xy', 'xz' or 'yz'") + + # move to cpu numpy array + arr = slice2d.detach().cpu().numpy() + + # imshow = "imshow" not "imread" + plt.imshow( + arr, + origin="lower", + cmap="seismic", + vmin=-1e-2, + vmax=1e-2, + # norm=SymLogNorm(linthresh=1e-8, vmin=-1e-0, vmax=1e-0) + # norm=LogNorm(vmin=1e-12, vmax=1e-6) + ) + # plt.colorbar() + plt.show(block=False) + plt.pause(0.01) + + +def fdtd_3d_pytorch_simplified( + size_x: int = 150, + size_y: int = 150, + size_z: int = 150, + timesteps: int = 10, + output_freq: int = 0, + dx: float = 0.01, + dy: float = 0.01, + dz: float = 0.01, + epsilon0: float = 8.85e-12, + mu0: float = 1.256e-6, + device: Optional[torch.device] = None, + dtype: torch.dtype = torch.float64, +) -> Tuple[ + torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor +]: + """ + FDTD 3D implementation using pytorch_task for simplified syntax. + Demonstrates automatic stream and tensor management. 
+ """ + ctx = context() + + # allocate and initialize fields + shape = (size_x, size_y, size_z) + + # Electric field components (initialized to zero) + lex = ctx.logical_data_zeros(shape, dtype=np.float64) + ley = ctx.logical_data_zeros(shape, dtype=np.float64) + lez = ctx.logical_data_zeros(shape, dtype=np.float64) + + # Magnetic field components (initialized to zero) + lhx = ctx.logical_data_zeros(shape, dtype=np.float64) + lhy = ctx.logical_data_zeros(shape, dtype=np.float64) + lhz = ctx.logical_data_zeros(shape, dtype=np.float64) + + # Material properties + lepsilon = ctx.logical_data_full(shape, float(epsilon0), dtype=np.float64) + lmu = ctx.logical_data_full(shape, float(mu0), dtype=np.float64) + + # CFL (same formula as example) + dt = 0.25 * min(dx, dy, dz) * math.sqrt(epsilon0 * mu0) + + # Es (interior) = [1..N-2] along all dims -> enables i-1, j-1, k-1 + i_es, j_es, k_es = slice(1, -1), slice(1, -1), slice(1, -1) + i_es_m, j_es_m, k_es_m = slice(0, -2), slice(0, -2), slice(0, -2) + + # Hs (base) = [0..N-2] along all dims -> enables i+1, j+1, k+1 + i_hs, j_hs, k_hs = slice(0, -1), slice(0, -1), slice(0, -1) + i_hs_p, j_hs_p, k_hs_p = slice(1, None), slice(1, None), slice(1, None) + + # source location (single cell at center) + cx, cy, cz = size_x // 2, size_y // 10, size_z // 2 + + def source(t: float, x: float, y: float, z: float) -> float: + # sin(k*x - omega*t) with f = 1e9 Hz + pi = math.pi + freq = 1.0e9 + omega = 2.0 * pi * freq + wavelength = 3.0e8 / freq + k = 2.0 * pi / wavelength + return math.sin(k * x - omega * t) + + for n in range(int(timesteps)): + # ------------------------- + # update electric fields (Es) + # Ex(i,j,k) += (dt/(ε*dx)) * [(Hz(i,j,k)-Hz(i,j-1,k)) - (Hy(i,j,k)-Hy(i,j,k-1))] + with ctx.pytorch_task(lex.rw(), lhy.read(), lhz.read(), lepsilon.read()) as (ex, hy, hz, epsilon): + ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + ( + dt / (epsilon[i_es, j_es, k_es] * dx) + ) * ( + (hz[i_es, j_es, k_es] - hz[i_es, j_es_m, k_es]) + - (hy[i_es, j_es, k_es] - hy[i_es, j_es, k_es_m]) + ) + + # Ey(i,j,k) += (dt/(ε*dy)) * [(Hx(i,j,k)-Hx(i,j,k-1)) - (Hz(i,j,k)-Hz(i-1,j,k))] + with ctx.pytorch_task(ley.rw(), lhx.read(), lhz.read(), lepsilon.read()) as (ey, hx, hz, epsilon): + ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + ( + dt / (epsilon[i_es, j_es, k_es] * dy) + ) * ( + (hx[i_es, j_es, k_es] - hx[i_es, j_es, k_es_m]) + - (hz[i_es, j_es, k_es] - hz[i_es_m, j_es, k_es]) + ) + + # Ez(i,j,k) += (dt/(ε*dz)) * [(Hy(i,j,k)-Hy(i-1,j,k)) - (Hx(i,j,k)-Hx(i,j-1,k))] + with ctx.pytorch_task(lez.rw(), lhx.read(), lhy.read(), lepsilon.read()) as (ez, hx, hy, epsilon): + ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + ( + dt / (epsilon[i_es, j_es, k_es] * dz) + ) * ( + (hy[i_es, j_es, k_es] - hy[i_es_m, j_es, k_es]) + - (hx[i_es, j_es, k_es] - hx[i_es, j_es_m, k_es]) + ) + + # source at center cell + with ctx.pytorch_task(lez.rw()) as (ez,): + ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) + + # ------------------------- + # update magnetic fields (Hs) + # Hx(i,j,k) -= (dt/(μ*dy)) * [(Ez(i,j+1,k)-Ez(i,j,k)) - (Ey(i,j,k+1)-Ey(i,j,k))] + with ctx.pytorch_task(lhx.rw(), ley.read(), lez.read(), lmu.read()) as (hx, ey, ez, mu): + hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - ( + dt / (mu[i_hs, j_hs, k_hs] * dy) + ) * ( + (ez[i_hs, j_hs_p, k_hs] - ez[i_hs, j_hs, k_hs]) + - (ey[i_hs, j_hs, k_hs_p] - ey[i_hs, j_hs, k_hs]) + ) + + # Hy(i,j,k) -= (dt/(μ*dz)) * [(Ex(i,j,k+1)-Ex(i,j,k)) - (Ez(i+1,j,k)-Ez(i,j,k))] + with ctx.pytorch_task(lhy.rw(), lex.read(), 
lez.read(), lmu.read()) as (hy, ex, ez, mu): + hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - ( + dt / (mu[i_hs, j_hs, k_hs] * dz) + ) * ( + (ex[i_hs, j_hs, k_hs_p] - ex[i_hs, j_hs, k_hs]) + - (ez[i_hs_p, j_hs, k_hs] - ez[i_hs, j_hs, k_hs]) + ) + + # Hz(i,j,k) -= (dt/(μ*dx)) * [(Ey(i+1,j,k)-Ey(i,j,k)) - (Ex(i,j+1,k)-Ex(i,j,k))] + with ctx.pytorch_task(lhz.rw(), lex.read(), ley.read(), lmu.read()) as (hz, ex, ey, mu): + hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - ( + dt / (mu[i_hs, j_hs, k_hs] * dx) + ) * ( + (ey[i_hs_p, j_hs, k_hs] - ey[i_hs, j_hs, k_hs]) + - (ex[i_hs, j_hs_p, k_hs] - ex[i_hs, j_hs, k_hs]) + ) + + if output_freq > 0 and (n % output_freq) == 0: + with ctx.pytorch_task(lez.read()) as (ez,): + print(f"{n}\t{ez[cx, cy, cz].item():.6e}") + show_slice(ez, plane="xy") + + ctx.finalize() + + return ex, ey, ez, hx, hy, hz + + +if __name__ == "__main__": + # Run simplified FDTD simulation using pytorch_task + print("Running FDTD simulation with pytorch_task syntax...") + ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch_simplified(timesteps=1000, output_freq=5) + print(f"Simulation completed; Ez(center) = {ez[75, 15, 75].item():.6e}") diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index 3e2d03ac650..d5fab32d58d 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -95,6 +95,75 @@ def test_pytorch(): print(f"✅ All checks passed! X={X[0]}, Y={Y[0]}, Z={Z[0]}") +def test_pytorch_task(): + """Test the pytorch_task functionality with simplified syntax""" + n = 1024 * 1024 + X = np.ones(n, dtype=np.float32) + Y = np.ones(n, dtype=np.float32) + Z = np.ones(n, dtype=np.float32) + + ctx = context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) + + # Equivalent operations to test_pytorch() but using pytorch_task syntax + + # In-place multiplication using pytorch_task (single tensor) + with ctx.pytorch_task(rw(lX)) as (tX,): + tX[:] = tX * 2 + + # Copy and multiply using pytorch_task (multiple tensors) + with ctx.pytorch_task(lX.read(), lY.write()) as (tX, tY): + tY[:] = tX * 2 + + # Another operation combining tensors + with ctx.pytorch_task(lX.read(), lZ.write()) as (tX, tZ): + tZ[:] = tX * 4 + 1 + + # Final operation with read-write access + with ctx.pytorch_task(lY.read(), lZ.rw()) as (tY, tZ): + tZ[:] = tY * 2 - 3 + + ctx.finalize() + + # Verify results on host after finalize (same as original test) + print("Verifying pytorch_task results...") + + # Expected values: + # X: 1.0 -> 2.0 (multiplied by 2) + # Y: 1.0 -> 4.0 (X * 2 = 2.0 * 2 = 4.0) + # Z: 1.0 -> 9.0 (X * 4 + 1 = 2.0 * 4 + 1 = 9.0) -> 5.0 (Y * 2 - 3 = 4.0 * 2 - 3 = 5.0) + + expected_X = 2.0 + expected_Y = 4.0 + expected_Z = 5.0 + + # Check a few values to verify correctness + assert np.allclose(X[:10], expected_X), ( + f"X mismatch: got {X[:10]}, expected {expected_X}" + ) + assert np.allclose(Y[:10], expected_Y), ( + f"Y mismatch: got {Y[:10]}, expected {expected_Y}" + ) + assert np.allclose(Z[:10], expected_Z), ( + f"Z mismatch: got {Z[:10]}, expected {expected_Z}" + ) + + # Check entire arrays + assert np.all(X == expected_X), ( + f"X array not uniform: min={X.min()}, max={X.max()}, expected={expected_X}" + ) + assert np.all(Y == expected_Y), ( + f"Y array not uniform: min={Y.min()}, max={Y.max()}, expected={expected_Y}" + ) + assert np.all(Z == expected_Z), ( + f"Z array not uniform: min={Z.min()}, max={Z.max()}, expected={expected_Z}" + ) + + print(f"✅ All pytorch_task checks passed! 
X={X[0]}, Y={Y[0]}, Z={Z[0]}") + + if __name__ == "__main__": print("Running CUDASTF examples...") test_pytorch() From 53a454283972babb894f127298f91c712345caa9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 10 Sep 2025 11:37:05 +0200 Subject: [PATCH 179/221] clang-format and some minor comment --- .../tests/stf/test_fdtd_pytorch_simplified.py | 42 ++++++++++++++++--- python/cuda_cccl/tests/stf/test_pytorch.py | 5 +++ 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py index a26268f38d2..fa351714b2b 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py @@ -110,7 +110,12 @@ def source(t: float, x: float, y: float, z: float) -> float: # ------------------------- # update electric fields (Es) # Ex(i,j,k) += (dt/(ε*dx)) * [(Hz(i,j,k)-Hz(i,j-1,k)) - (Hy(i,j,k)-Hy(i,j,k-1))] - with ctx.pytorch_task(lex.rw(), lhy.read(), lhz.read(), lepsilon.read()) as (ex, hy, hz, epsilon): + with ctx.pytorch_task(lex.rw(), lhy.read(), lhz.read(), lepsilon.read()) as ( + ex, + hy, + hz, + epsilon, + ): ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + ( dt / (epsilon[i_es, j_es, k_es] * dx) ) * ( @@ -119,7 +124,12 @@ def source(t: float, x: float, y: float, z: float) -> float: ) # Ey(i,j,k) += (dt/(ε*dy)) * [(Hx(i,j,k)-Hx(i,j,k-1)) - (Hz(i,j,k)-Hz(i-1,j,k))] - with ctx.pytorch_task(ley.rw(), lhx.read(), lhz.read(), lepsilon.read()) as (ey, hx, hz, epsilon): + with ctx.pytorch_task(ley.rw(), lhx.read(), lhz.read(), lepsilon.read()) as ( + ey, + hx, + hz, + epsilon, + ): ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + ( dt / (epsilon[i_es, j_es, k_es] * dy) ) * ( @@ -128,7 +138,12 @@ def source(t: float, x: float, y: float, z: float) -> float: ) # Ez(i,j,k) += (dt/(ε*dz)) * [(Hy(i,j,k)-Hy(i-1,j,k)) - (Hx(i,j,k)-Hx(i,j-1,k))] - with ctx.pytorch_task(lez.rw(), lhx.read(), lhy.read(), lepsilon.read()) as (ez, hx, hy, epsilon): + with ctx.pytorch_task(lez.rw(), lhx.read(), lhy.read(), lepsilon.read()) as ( + ez, + hx, + hy, + epsilon, + ): ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + ( dt / (epsilon[i_es, j_es, k_es] * dz) ) * ( @@ -143,7 +158,12 @@ def source(t: float, x: float, y: float, z: float) -> float: # ------------------------- # update magnetic fields (Hs) # Hx(i,j,k) -= (dt/(μ*dy)) * [(Ez(i,j+1,k)-Ez(i,j,k)) - (Ey(i,j,k+1)-Ey(i,j,k))] - with ctx.pytorch_task(lhx.rw(), ley.read(), lez.read(), lmu.read()) as (hx, ey, ez, mu): + with ctx.pytorch_task(lhx.rw(), ley.read(), lez.read(), lmu.read()) as ( + hx, + ey, + ez, + mu, + ): hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - ( dt / (mu[i_hs, j_hs, k_hs] * dy) ) * ( @@ -152,7 +172,12 @@ def source(t: float, x: float, y: float, z: float) -> float: ) # Hy(i,j,k) -= (dt/(μ*dz)) * [(Ex(i,j,k+1)-Ex(i,j,k)) - (Ez(i+1,j,k)-Ez(i,j,k))] - with ctx.pytorch_task(lhy.rw(), lex.read(), lez.read(), lmu.read()) as (hy, ex, ez, mu): + with ctx.pytorch_task(lhy.rw(), lex.read(), lez.read(), lmu.read()) as ( + hy, + ex, + ez, + mu, + ): hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - ( dt / (mu[i_hs, j_hs, k_hs] * dz) ) * ( @@ -161,7 +186,12 @@ def source(t: float, x: float, y: float, z: float) -> float: ) # Hz(i,j,k) -= (dt/(μ*dx)) * [(Ey(i+1,j,k)-Ey(i,j,k)) - (Ex(i,j+1,k)-Ex(i,j,k))] - with ctx.pytorch_task(lhz.rw(), lex.read(), ley.read(), lmu.read()) as (hz, ex, ey, mu): + with ctx.pytorch_task(lhz.rw(), lex.read(), ley.read(), lmu.read()) as ( 
+ hz, + ex, + ey, + mu, + ): hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - ( dt / (mu[i_hs, j_hs, k_hs] * dx) ) * ( diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index d5fab32d58d..4c0d180b407 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -103,6 +103,11 @@ def test_pytorch_task(): Z = np.ones(n, dtype=np.float32) ctx = context() + + # Note: We could use ctx.logical_data_full instead of creating NumPy arrays first + # For example: lX = ctx.logical_data_full((n,), 1.0, dtype=np.float32) + # However, this would create logical data without underlying NumPy arrays, + # so we wouldn't be able to check results after ctx.finalize() in this test lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) From 218fda29665c284d9751de23fd78be55768d2975 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 25 Sep 2025 09:13:18 +0200 Subject: [PATCH 180/221] make sure stf python tests are wrapped into functions so that pytest calls them --- .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 5 +-- .../tests/stf/test_fdtd_pytorch_simplified.py | 6 ++-- python/cuda_cccl/tests/stf/test_fhe.py | 32 ++++++++++++------- .../cuda_cccl/tests/stf/test_fhe_decorator.py | 32 ++++++++++++------- .../tests/stf/test_stencil_decorator.py | 5 +++ 5 files changed, 52 insertions(+), 28 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index 5712dd74b0a..db4c9671c16 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -45,7 +45,7 @@ def show_slice(t3d, plane="xy", index=None): plt.pause(0.01) -def fdtd_3d_pytorch( +def test_fdtd_3d_pytorch( size_x: int = 150, size_y: int = 150, size_z: int = 150, @@ -211,5 +211,6 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # Run FDTD simulation - ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch(timesteps=1000, output_freq=5) + print("Running FDTD 3D PyTorch example...") + ex, ey, ez, hx, hy, hz = test_fdtd_3d_pytorch(timesteps=1000, output_freq=5) print(f"Simulation completed; Ez(center) = {ez[75, 15, 75].item():.6e}") diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py index fa351714b2b..5241911a43f 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py @@ -44,7 +44,7 @@ def show_slice(t3d, plane="xy", index=None): plt.pause(0.01) -def fdtd_3d_pytorch_simplified( +def test_fdtd_3d_pytorch_simplified( size_x: int = 150, size_y: int = 150, size_z: int = 150, @@ -212,5 +212,7 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # Run simplified FDTD simulation using pytorch_task print("Running FDTD simulation with pytorch_task syntax...") - ex, ey, ez, hx, hy, hz = fdtd_3d_pytorch_simplified(timesteps=1000, output_freq=5) + ex, ey, ez, hx, hy, hz = test_fdtd_3d_pytorch_simplified( + timesteps=1000, output_freq=5 + ) print(f"Simulation completed; Ez(center) = {ez[75, 15, 75].item():.6e}") diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index e2d38308341..7331eeba658 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -142,19 +142,27 @@ def circuit(eA: Ciphertext, eB: Ciphertext) -> 
Ciphertext: return ~((eA | ~eB) & (~eA | eB)) -ctx = cudastf.context(use_graph=False) +def test_fhe(): + """Test Fully Homomorphic Encryption (FHE) example with logical operations.""" + global ctx # Make ctx accessible to the classes + ctx = cudastf.context(use_graph=False) -vA = [3, 3, 2, 2, 17] -pA = Plaintext(ctx, vA) -pA.set_symbol("A") + vA = [3, 3, 2, 2, 17] + pA = Plaintext(ctx, vA) + pA.set_symbol("A") -vB = [1, 7, 7, 7, 49] -pB = Plaintext(ctx, vB) -pB.set_symbol("B") + vB = [1, 7, 7, 7, 49] + pB = Plaintext(ctx, vB) + pB.set_symbol("B") -eA = pA.encrypt() -eB = pB.encrypt() -out = circuit(eA, eB) + eA = pA.encrypt() + eB = pB.encrypt() + out = circuit(eA, eB) -out.decrypt().print_values() -ctx.finalize() + out.decrypt().print_values() + ctx.finalize() + + +if __name__ == "__main__": + print("Running CUDASTF FHE example...") + test_fhe() diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index 8adbf5454ed..0bde583bafe 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -124,19 +124,27 @@ def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: return ~((eA | ~eB) & (~eA | eB)) -ctx = cudastf.context(use_graph=False) +def test_fhe_decorator(): + """Test Fully Homomorphic Encryption (FHE) example using @cudastf.jit decorators.""" + global ctx # Make ctx accessible to the classes + ctx = cudastf.context(use_graph=False) -vA = [3, 3, 2, 2, 17] -pA = Plaintext(ctx, vA) -pA.set_symbol("A") + vA = [3, 3, 2, 2, 17] + pA = Plaintext(ctx, vA) + pA.set_symbol("A") -vB = [1, 7, 7, 7, 49] -pB = Plaintext(ctx, vB) -pB.set_symbol("B") + vB = [1, 7, 7, 7, 49] + pB = Plaintext(ctx, vB) + pB.set_symbol("B") -eA = pA.encrypt() -eB = pB.encrypt() -out = circuit(eA, eB) + eA = pA.encrypt() + eB = pB.encrypt() + out = circuit(eA, eB) -out.decrypt().print_values() -ctx.finalize() + out.decrypt().print_values() + ctx.finalize() + + +if __name__ == "__main__": + print("Running CUDASTF FHE decorator example...") + test_fhe_decorator() diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index 8e52a72f00a..ebfd71de46e 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -79,3 +79,8 @@ def test_numba2d(): # compare with the GPU result max_abs_diff = np.abs(u_out - u_out_ref).max() print(f"max(|gpu - ref|) = {max_abs_diff:.3e}") + + +if __name__ == "__main__": + print("Running CUDASTF stencil decorator example...") + test_numba2d() From 1f974825fe93b1611041d29e3e44e2c449eb4c2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 25 Sep 2025 10:07:30 +0200 Subject: [PATCH 181/221] fix the return values of pytests --- python/cuda_cccl/tests/stf/test_fdtd_pytorch.py | 5 +---- python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py | 7 +------ 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index db4c9671c16..b5209c9d04c 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -206,11 +206,8 @@ def source(t: float, x: float, y: float, z: float) -> float: ctx.finalize() - return ex, ey, ez, hx, hy, hz - if __name__ == "__main__": # Run FDTD simulation print("Running FDTD 3D PyTorch example...") - ex, ey, ez, hx, hy, hz = 
test_fdtd_3d_pytorch(timesteps=1000, output_freq=5) - print(f"Simulation completed; Ez(center) = {ez[75, 15, 75].item():.6e}") + test_fdtd_3d_pytorch(timesteps=1000, output_freq=5) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py index 5241911a43f..85f7b856bd4 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py @@ -206,13 +206,8 @@ def source(t: float, x: float, y: float, z: float) -> float: ctx.finalize() - return ex, ey, ez, hx, hy, hz - if __name__ == "__main__": # Run simplified FDTD simulation using pytorch_task print("Running FDTD simulation with pytorch_task syntax...") - ex, ey, ez, hx, hy, hz = test_fdtd_3d_pytorch_simplified( - timesteps=1000, output_freq=5 - ) - print(f"Simulation completed; Ez(center) = {ez[75, 15, 75].item():.6e}") + test_fdtd_3d_pytorch_simplified(timesteps=1000, output_freq=5) \ No newline at end of file From 7a58d68648bf58c4b12dd28e8cf9095d4573fb0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 25 Sep 2025 13:47:08 +0200 Subject: [PATCH 182/221] Start to experiment with Warp --- .../cuda_cccl/tests/stf/example_fluid_warp.py | 358 ++++++++++++++++++ 1 file changed, 358 insertions(+) create mode 100644 python/cuda_cccl/tests/stf/example_fluid_warp.py diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py new file mode 100644 index 00000000000..abd93d9e6e8 --- /dev/null +++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py @@ -0,0 +1,358 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +########################################################################### +# Example Fluid +# +# Shows how to implement a simple 2D Stable Fluids solver using +# multidimensional arrays and launches. 
+# +########################################################################### + +import math + +import warp as wp +import warp.render + +import cuda.cccl.experimental.stf as cudastf + +def stf_kernel(pyfunc): + # let warp decorate normally + kernel = wp.kernel(pyfunc) + + # attach an STF-aware call operator + def _stf_call(*args, dim=None, stream=None, **kwargs): + print(f"[STF TRACE] {pyfunc.__name__}") + print(f" dim={dim}, stream={stream}, args={args}, kwargs={kwargs}") + return wp.stf.launch(kernel, dim=dim, inputs=args, stream=stream, **kwargs) + + # monkey-patch a method onto the kernel object + kernel.stf = _stf_call + + return kernel + +def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): + print(f"[STF TRACE] launching kernel: {getattr(kernel, '__name__', kernel)}") + print(f" dim = {dim}") + print(f" stream = {stream}") + print(f" inputs = {inputs}") + print(f" kwargs = {kwargs}") + + # just forward to warp for now + return wp.launch( + kernel, + dim=dim, + inputs=inputs, + stream=stream, + **kwargs, + ) + + +# put it under wp.stf +if not hasattr(wp, "stf"): + class _stf: + pass + wp.stf = _stf() + + +wp.stf.kernel = stf_kernel +wp.stf.launch = stf_launch + +grid_width = wp.constant(256) +grid_height = wp.constant(128) + + +@wp.func +def lookup_float(f: wp.array2d(dtype=float), x: int, y: int): + x = wp.clamp(x, 0, grid_width - 1) + y = wp.clamp(y, 0, grid_height - 1) + + return f[x, y] + + +@wp.func +def sample_float(f: wp.array2d(dtype=float), x: float, y: float): + lx = int(wp.floor(x)) + ly = int(wp.floor(y)) + + tx = x - float(lx) + ty = y - float(ly) + + s0 = wp.lerp(lookup_float(f, lx, ly), lookup_float(f, lx + 1, ly), tx) + s1 = wp.lerp(lookup_float(f, lx, ly + 1), lookup_float(f, lx + 1, ly + 1), tx) + + s = wp.lerp(s0, s1, ty) + return s + + +@wp.func +def lookup_vel(f: wp.array2d(dtype=wp.vec2), x: int, y: int): + if x < 0 or x >= grid_width: + return wp.vec2() + if y < 0 or y >= grid_height: + return wp.vec2() + + return f[x, y] + + +@wp.func +def sample_vel(f: wp.array2d(dtype=wp.vec2), x: float, y: float): + lx = int(wp.floor(x)) + ly = int(wp.floor(y)) + + tx = x - float(lx) + ty = y - float(ly) + + s0 = wp.lerp(lookup_vel(f, lx, ly), lookup_vel(f, lx + 1, ly), tx) + s1 = wp.lerp(lookup_vel(f, lx, ly + 1), lookup_vel(f, lx + 1, ly + 1), tx) + + s = wp.lerp(s0, s1, ty) + return s + + +@wp.stf.kernel +def advect( + u0: wp.array2d(dtype=wp.vec2), + u1: wp.array2d(dtype=wp.vec2), + rho0: wp.array2d(dtype=float), + rho1: wp.array2d(dtype=float), + dt: float, +): + i, j = wp.tid() + + u = u0[i, j] + + # trace backward + p = wp.vec2(float(i), float(j)) + p = p - u * dt + + # advect + u1[i, j] = sample_vel(u0, p[0], p[1]) + rho1[i, j] = sample_float(rho0, p[0], p[1]) + + +@wp.stf.kernel +def divergence(u: wp.array2d(dtype=wp.vec2), div: wp.array2d(dtype=float)): + i, j = wp.tid() + + if i == grid_width - 1: + return + if j == grid_height - 1: + return + + dx = (u[i + 1, j][0] - u[i, j][0]) * 0.5 + dy = (u[i, j + 1][1] - u[i, j][1]) * 0.5 + + div[i, j] = dx + dy + + +@wp.stf.kernel +def pressure_solve(p0: wp.array2d(dtype=float), p1: wp.array2d(dtype=float), div: wp.array2d(dtype=float)): + i, j = wp.tid() + + s1 = lookup_float(p0, i - 1, j) + s2 = lookup_float(p0, i + 1, j) + s3 = lookup_float(p0, i, j - 1) + s4 = lookup_float(p0, i, j + 1) + + # Jacobi update + err = s1 + s2 + s3 + s4 - div[i, j] + + p1[i, j] = err * 0.25 + + +@wp.stf.kernel +def pressure_apply(p: wp.array2d(dtype=float), u: wp.array2d(dtype=wp.vec2)): + i, j = wp.tid() + + if i == 0 or i == 
grid_width - 1: + return + if j == 0 or j == grid_height - 1: + return + + # pressure gradient + f_p = wp.vec2(p[i + 1, j] - p[i - 1, j], p[i, j + 1] - p[i, j - 1]) * 0.5 + + u[i, j] = u[i, j] - f_p + + +@wp.stf.kernel +def integrate(u: wp.array2d(dtype=wp.vec2), rho: wp.array2d(dtype=float), dt: float): + i, j = wp.tid() + + # gravity + f_g = wp.vec2(-90.8, 0.0) * rho[i, j] + + # integrate + u[i, j] = u[i, j] + dt * f_g + + # fade + rho[i, j] = rho[i, j] * (1.0 - 0.1 * dt) + + +@wp.stf.kernel +def init(rho: wp.array2d(dtype=float), u: wp.array2d(dtype=wp.vec2), radius: int, dir: wp.vec2): + i, j = wp.tid() + + d = wp.length(wp.vec2(float(i - grid_width / 2), float(j - grid_height / 2))) + + if d < radius: + rho[i, j] = 1.0 + u[i, j] = dir + + +class Example: + def __init__(self): + fps = 60 + self.frame_dt = 1.0 / fps + self.sim_substeps = 2 + self.iterations = 100 # Number of pressure iterations + self.sim_dt = self.frame_dt / self.sim_substeps + self.sim_time = 0.0 + + self._stf_ctx = cudastf.context() + + shape = (grid_width, grid_height) + + self.u0 = wp.zeros(shape, dtype=wp.vec2) + self.u1 = wp.zeros(shape, dtype=wp.vec2) + + self.rho0 = wp.zeros(shape, dtype=float) + self.rho1 = wp.zeros(shape, dtype=float) + + self.p0 = wp.zeros(shape, dtype=float) + self.p1 = wp.zeros(shape, dtype=float) + self.div = wp.zeros(shape, dtype=float) + + self.u0._stf_ld = self._stf_ctx.logical_data(self.u0) + self.u0._name = "u0" + + self.u1._name = "u1" + + self.rho0._name = "rho0" + self.rho1._name = "rho1" + + self.p0._name = "p0" + self.p1._name = "p1" + self.div._name = "div" + + # capture pressure solve as a CUDA graph + self.use_cuda_graph = wp.get_device().is_cuda + if self.use_cuda_graph: + with wp.ScopedCapture() as capture: + self.pressure_iterations() + self.graph = capture.graph + + def step(self): + with wp.ScopedTimer("step"): + for _ in range(self.sim_substeps): + shape = (grid_width, grid_height) + dt = self.sim_dt + + speed = 400.0 + angle = math.sin(self.sim_time * 4.0) * 1.5 + vel = wp.vec2(math.cos(angle) * speed, math.sin(angle) * speed) + + # update emitters + wp.stf.launch(init, dim=shape, inputs=[self.rho0, self.u0, 5, vel]) + + # force integrate + wp.stf.launch(integrate, dim=shape, inputs=[self.u0, self.rho0, dt]) + wp.stf.launch(divergence, dim=shape, inputs=[self.u0, self.div]) + + # pressure solve + self.p0.zero_() + self.p1.zero_() + + # if self.use_cuda_graph: + #  wp.capture_launch(self.graph) + # else: + #  self.pressure_iterations() + self.pressure_iterations() + + # velocity update + wp.stf.launch(pressure_apply, dim=shape, inputs=[self.p0, self.u0]) + + # semi-Lagrangian advection + wp.stf.launch(advect, dim=shape, inputs=[self.u0, self.u1, self.rho0, self.rho1, dt]) + + # swap buffers + (self.u0, self.u1) = (self.u1, self.u0) + (self.rho0, self.rho1) = (self.rho1, self.rho0) + + self.sim_time += dt + + def pressure_iterations(self): + for _ in range(self.iterations): + wp.stf.launch(pressure_solve, dim=self.p0.shape, inputs=[self.p0, self.p1, self.div]) + + # swap pressure fields + (self.p0, self.p1) = (self.p1, self.p0) + + def step_and_render_frame(self, frame_num=None, img=None): + self.step() + + with wp.ScopedTimer("render"): + if img: + img.set_array(self.rho0.numpy()) + + return (img,) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--device", type=str, default=None, help="Override the default Warp device.") + 
parser.add_argument("--num_frames", type=int, default=100000, help="Total number of frames.") + parser.add_argument( + "--headless", + action="store_true", + help="Run in headless mode, suppressing the opening of any graphical windows.", + ) + + args = parser.parse_known_args()[0] + + with wp.ScopedDevice(args.device): + example = Example() + + if args.headless: + for _ in range(args.num_frames): + example.step() + else: + import matplotlib + import matplotlib.animation as anim + import matplotlib.pyplot as plt + + fig = plt.figure() + + img = plt.imshow( + example.rho0.numpy(), + origin="lower", + animated=True, + interpolation="antialiased", + ) + img.set_norm(matplotlib.colors.Normalize(0.0, 1.0)) + seq = anim.FuncAnimation( + fig, + example.step_and_render_frame, + fargs=(img,), + frames=args.num_frames, + blit=True, + interval=8, + repeat=False, + ) + + plt.show() From 9fb1c26265e2c8c88cbbdd5b5cd76907fc87a400 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 25 Sep 2025 15:24:23 +0200 Subject: [PATCH 183/221] logical_data in python are now initialized with a data place, and they can be initialized from a buffer with a CAI interface to support Warp for example --- .../stf/include/cccl/c/experimental/stf/stf.h | 67 ++++++- c/experimental/stf/src/stf.cu | 34 +++- .../experimental/stf/_stf_bindings_impl.pyx | 116 +++++++++++-- python/cuda_cccl/pyproject.toml | 2 +- .../cuda_cccl/tests/stf/example_fluid_warp.py | 164 ++++++++++++++++-- .../tests/stf/test_fdtd_pytorch_simplified.py | 2 +- 6 files changed, 347 insertions(+), 38 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 903b71cd878..49ae71098af 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -456,36 +456,87 @@ cudaStream_t stf_fence(stf_ctx_handle ctx); //! //! \brief Create logical data from existing memory buffer //! -//! Creates logical data handle from an existing host memory buffer. -//! STF takes ownership of data management during task execution. +//! Creates logical data handle from existing memory buffer, assuming host data place. +//! This is a convenience wrapper around stf_logical_data_with_place() with host placement. //! //! \param ctx Context handle //! \param[out] ld Pointer to receive logical data handle -//! \param addr Pointer to existing data buffer +//! \param addr Pointer to existing data buffer (assumed to be host memory) //! \param sz Size of data in bytes //! //! \pre ctx must be valid context handle //! \pre ld must not be NULL -//! \pre addr must not be NULL +//! \pre addr must not be NULL and point to host-accessible memory //! \pre sz must be greater than 0 //! \post *ld contains valid logical data handle //! -//! \note Original data pointer should not be accessed during task execution -//! \note Data will be written back when logical data is destroyed or context finalized +//! \note This function assumes host memory. For device/managed memory, use stf_logical_data_with_place() +//! \note Equivalent to: stf_logical_data_with_place(ctx, ld, addr, sz, make_host_data_place()) //! //! \par Example: //! \code //! float data[1024]; //! stf_logical_data_handle ld; -//! stf_logical_data(ctx, &ld, data, sizeof(data)); +//! stf_logical_data(ctx, &ld, data, sizeof(data)); // Assumes host memory //! // ... use in tasks ... //! stf_logical_data_destroy(ld); //! \endcode //! -//! 
\see stf_logical_data_empty(), stf_logical_data_destroy() +//! \see stf_logical_data_with_place(), stf_logical_data_empty(), stf_logical_data_destroy() void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); +//! +//! \brief Create logical data handle from address with data place specification [PRIMARY API] +//! +//! Creates logical data handle from existing memory buffer, explicitly specifying where +//! the memory is located (host, device, managed, etc.). This is the primary and recommended +//! logical data creation function as it provides STF with essential memory location information +//! for optimal data movement and placement strategies. +//! +//! \param ctx Context handle +//! \param[out] ld Pointer to receive logical data handle +//! \param addr Pointer to existing memory buffer +//! \param sz Size of buffer in bytes +//! \param dplace Data place specifying memory location +//! +//! \pre ctx must be valid context handle +//! \pre ld must be valid pointer to logical data handle pointer +//! \pre addr must point to valid memory of at least sz bytes +//! \pre sz must be greater than 0 +//! \pre dplace must be valid data place (not invalid) +//! +//! \post *ld contains valid logical data handle on success +//! \post Caller owns returned handle (must call stf_logical_data_destroy()) +//! +//! \par Examples: +//! \code +//! // GPU device memory (recommended for CUDA arrays) +//! float* device_ptr; +//! cudaMalloc(&device_ptr, 1000 * sizeof(float)); +//! stf_data_place dplace = make_device_data_place(0); +//! stf_logical_data_handle ld; +//! stf_logical_data_with_place(ctx, &ld, device_ptr, 1000 * sizeof(float), dplace); +//! +//! // Host memory +//! float* host_data = new float[1000]; +//! stf_data_place host_place = make_host_data_place(); +//! stf_logical_data_handle ld_host; +//! stf_logical_data_with_place(ctx, &ld_host, host_data, 1000 * sizeof(float), host_place); +//! +//! // Managed memory +//! float* managed_ptr; +//! cudaMallocManaged(&managed_ptr, 1000 * sizeof(float)); +//! stf_data_place managed_place = make_managed_data_place(); +//! stf_logical_data_handle ld_managed; +//! stf_logical_data_with_place(ctx, &ld_managed, managed_ptr, 1000 * sizeof(float), managed_place); +//! \endcode +//! +//! \see make_device_data_place(), make_host_data_place(), make_managed_data_place() + +void stf_logical_data_with_place( + stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz, stf_data_place dplace); + //! //! \brief Set symbolic name for logical data //! 
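For illustration only (not part of the patch): together with the Python binding changes further below, the data-place-aware creation path added here is meant to be used roughly as follows from Python. The sketch assumes a device-resident Warp array and mirrors the fluid example updated later in this series; note that a later commit in the series renames the package from cuda.cccl.experimental.stf to cuda.stf.

import warp as wp
import cuda.cccl.experimental.stf as cudastf  # package path at this point in the series

ctx = cudastf.context()

# A device-resident Warp array exposing __cuda_array_interface__
rho = wp.zeros((256, 128), dtype=float)

# The buffer lives in GPU memory, so pass an explicit device data place;
# shape and dtype are read from the CUDA Array Interface by logical_data().
rho_ld = ctx.logical_data(rho, cudastf.data_place.device(0))
rho_ld.set_symbol("rho")
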
diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index c08a88b77e1..c601be20e26 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -44,12 +44,44 @@ cudaStream_t stf_fence(stf_ctx_handle ctx) } void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) +{ + // Convenience wrapper: assume host memory + stf_logical_data_with_place(ctx, ld, addr, sz, make_host_data_place()); +} + +void stf_logical_data_with_place( + stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz, stf_data_place dplace) { assert(ctx); assert(ld); auto* context_ptr = static_cast(ctx); - auto ld_typed = context_ptr->logical_data(make_slice((char*) addr, sz)); + + // Convert C data_place to C++ data_place + cuda::experimental::stf::data_place cpp_dplace; + switch (dplace.kind) + { + case STF_DATA_PLACE_HOST: + cpp_dplace = cuda::experimental::stf::data_place::host(); + break; + case STF_DATA_PLACE_DEVICE: + cpp_dplace = cuda::experimental::stf::data_place::device(dplace.u.device.dev_id); + break; + case STF_DATA_PLACE_MANAGED: + cpp_dplace = cuda::experimental::stf::data_place::managed(); + break; + case STF_DATA_PLACE_AFFINE: + cpp_dplace = cuda::experimental::stf::data_place::affine(); + break; + default: + // Invalid data place - this should not happen with valid input + assert(false && "Invalid data_place kind"); + cpp_dplace = cuda::experimental::stf::data_place::host(); // fallback + break; + } + + // Create logical data with the specified data place + auto ld_typed = context_ptr->logical_data(make_slice((char*) addr, sz), cpp_dplace); // Store the logical_data_untyped directly as opaque pointer *ld = new logical_data_untyped{ld_typed}; diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx index 2d7451f0985..f8551b83da8 100644 --- a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx @@ -114,6 +114,7 @@ cdef extern from "cccl/c/experimental/stf/stf.h": ctypedef struct stf_logical_data_handle_t ctypedef stf_logical_data_handle_t* stf_logical_data_handle void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) + void stf_logical_data_with_place(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz, stf_data_place dplace) void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) void stf_logical_data_destroy(stf_logical_data_handle ld) void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle *to) @@ -168,8 +169,9 @@ cdef class logical_data: cdef tuple _shape cdef int _ndim cdef size_t _len + cdef str _symbol # Store symbol for display purposes - def __cinit__(self, context ctx=None, object buf=None, shape=None, dtype=None): + def __cinit__(self, context ctx=None, object buf=None, data_place dplace=None, shape=None, dtype=None): if ctx is None or buf is None: # allow creation via __new__ (eg. 
in like_empty) self._ld = NULL @@ -178,28 +180,90 @@ cdef class logical_data: self._dtype = None self._shape = () self._ndim = 0 + self._symbol = None return + self._ctx = ctx._ctx + self._symbol = None # Initialize symbol + + # Default to host data place if not specified (matches C++ API) + if dplace is None: + dplace = data_place.host() + + # Try CUDA Array Interface first + if hasattr(buf, '__cuda_array_interface__'): + cai = buf.__cuda_array_interface__ + + # Extract CAI information + data_ptr, readonly = cai['data'] + original_shape = cai['shape'] + typestr = cai['typestr'] + + # Handle vector types automatically (e.g., wp.vec2, wp.vec3) + # STF treats these as flat scalar arrays with an additional dimension + if typestr.startswith('|V'): # Vector type (e.g., '|V8' for vec2, '|V12' for vec3) + vector_size = int(typestr[2:]) # Extract size from '|V8' -> 8 bytes + + if vector_size == 8: # vec2 (2 * 4 bytes float32) + self._shape = original_shape + (2,) + self._dtype = np.dtype(' {self._dtype} with shape {self._shape}") + else: + # Regular scalar type + self._shape = original_shape + self._dtype = np.dtype(typestr) + + self._ndim = len(self._shape) + + # Calculate total size in bytes + itemsize = self._dtype.itemsize + total_items = 1 + for dim in self._shape: + total_items *= dim + self._len = total_items * itemsize + + # Create STF logical data using the new C API with data place specification + stf_logical_data_with_place(ctx._ctx, &self._ld, data_ptr, self._len, dplace._c_place) + return + + # Fallback to Python buffer protocol cdef Py_buffer view cdef int flags = PyBUF_FORMAT | PyBUF_ND # request dtype + shape - self._ctx = ctx._ctx - if PyObject_GetBuffer(buf, &view, flags) != 0: - raise ValueError("object doesn’t support the full buffer protocol") + raise ValueError("object doesn't support the full buffer protocol or __cuda_array_interface__") try: self._ndim = view.ndim self._len = view.len self._shape = tuple(view.shape[i] for i in range(view.ndim)) self._dtype = np.dtype(view.format) - stf_logical_data(ctx._ctx, &self._ld, view.buf, view.len) + # For buffer protocol objects, use the specified data place (defaults to host) + stf_logical_data_with_place(ctx._ctx, &self._ld, view.buf, view.len, dplace._c_place) finally: PyBuffer_Release(&view) + def set_symbol(self, str name): stf_logical_data_set_symbol(self._ld, name.encode()) + self._symbol = name # Store locally for retrieval + + @property + def symbol(self): + """Get the symbol name of this logical data, if set.""" + return self._symbol def __dealloc__(self): if self._ld != NULL: @@ -240,6 +304,7 @@ cdef class logical_data: out._shape = self._shape out._ndim = self._ndim out._len = self._len + out._symbol = None # New object has no symbol initially return out @@ -254,6 +319,7 @@ cdef class logical_data: out._shape = shape out._ndim = len(shape) out._len = math.prod(shape) * out._dtype.itemsize + out._symbol = None # New object has no symbol initially stf_logical_data_empty(ctx._ctx, out._len, &out._ld) return out @@ -559,16 +625,46 @@ cdef class context: stf_ctx_finalize(self._ctx) self._ctx = NULL - def logical_data(self, object buf): + def logical_data(self, object buf, data_place dplace=None): """ - Create and return a `logical_data` object bound to this context. + Create and return a `logical_data` object bound to this context [PRIMARY API]. + + This is the primary function for creating logical data from existing buffers. 
+ It supports both Python buffer protocol objects and CUDA Array Interface objects, + with explicit data_place specification for optimal STF data movement strategies. Parameters ---------- - buf : any buffer‑supporting Python object - (NumPy array, bytes, bytearray, memoryview, …) + buf : any buffer‑supporting Python object or __cuda_array_interface__ object + (NumPy array, Warp array, CuPy array, bytes, bytearray, memoryview, …) + dplace : data_place, optional + Specifies where the buffer is located (host, device, managed, affine). + Defaults to data_place.host() for backward compatibility. + Essential for GPU arrays - use data_place.device() for optimal performance. + + Examples + -------- + >>> # Host memory (explicit - recommended) + >>> host_place = data_place.host() + >>> ld = ctx.logical_data(numpy_array, host_place) + >>> + >>> # GPU device memory (recommended for CUDA arrays) + >>> device_place = data_place.device(0) + >>> ld = ctx.logical_data(warp_array, device_place) + >>> + >>> # Managed/unified memory + >>> managed_place = data_place.managed() + >>> ld = ctx.logical_data(unified_array, managed_place) + >>> + >>> # Backward compatibility (defaults to host) + >>> ld = ctx.logical_data(numpy_array) # Same as specifying host + + Note + ---- + For GPU arrays (Warp, CuPy, etc.), always specify data_place.device() + for zero-copy performance and correct memory management. """ - return logical_data(self, buf) + return logical_data(self, buf, dplace) def logical_data_empty(self, shape, dtype=None): diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index 317606e7ad3..4b88822ab41 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "cuda-pathfinder>=1.2.3", "cuda-core", "numba-cuda @ git+https://github.com/caugonnet/numba-cuda.git@cuda_graph_future_memory", - "llvmlite==0.44", # TODO: remove this once numba-cuda 0.19.2 is released + "llvmlite==0.44", # TODO: remove this once numba-cuda 0.19.2 is released ] dynamic = ["version"] diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py index abd93d9e6e8..7797d30a2d5 100644 --- a/python/cuda_cccl/tests/stf/example_fluid_warp.py +++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py @@ -28,6 +28,7 @@ import cuda.cccl.experimental.stf as cudastf + def stf_kernel(pyfunc): # let warp decorate normally kernel = wp.kernel(pyfunc) @@ -35,7 +36,46 @@ def stf_kernel(pyfunc): # attach an STF-aware call operator def _stf_call(*args, dim=None, stream=None, **kwargs): print(f"[STF TRACE] {pyfunc.__name__}") - print(f" dim={dim}, stream={stream}, args={args}, kwargs={kwargs}") + print(f" dim={dim}, stream={stream}") + + # Enhanced arg display with logical data detection + if args: + print(" args=[") + for i, arg in enumerate(args): + # Detect if argument is or contains STF logical data + is_logical_data = False + symbol = None + + # Check if arg is directly STF logical data + if hasattr(arg, "__class__") and "logical_data" in str(type(arg)): + is_logical_data = True + if hasattr(arg, "symbol") and arg.symbol: + symbol = arg.symbol + # Check if arg has attached STF logical data (Warp array) + elif hasattr(arg, "_stf_ld"): + is_logical_data = True + if hasattr(arg._stf_ld, "symbol") and arg._stf_ld.symbol: + symbol = arg._stf_ld.symbol + # Fallback to _name for Warp arrays + elif hasattr(arg, "_name") and arg._name: + symbol = arg._name + + if is_logical_data: + if symbol: + print(f" [{i}]: '{symbol}' 
[logical_data]") + else: + print(f" [{i}]: logical_data") + else: + # Regular arguments (scalars, etc.) + if hasattr(arg, "shape"): # Array-like but not logical data + print(f" [{i}]: {type(arg).__name__}") + else: # Scalar value + print(f" [{i}]: {arg}") + print(" ]") + else: + print(f" args={args}") + + print(f" kwargs={kwargs}") return wp.stf.launch(kernel, dim=dim, inputs=args, stream=stream, **kwargs) # monkey-patch a method onto the kernel object @@ -43,11 +83,49 @@ def _stf_call(*args, dim=None, stream=None, **kwargs): return kernel + def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): print(f"[STF TRACE] launching kernel: {getattr(kernel, '__name__', kernel)}") print(f" dim = {dim}") print(f" stream = {stream}") - print(f" inputs = {inputs}") + + # Enhanced input display with logical data detection + if inputs: + print(" inputs = [") + for i, inp in enumerate(inputs): + # Detect if input is or contains STF logical data + is_logical_data = False + symbol = None + + # Check if inp is directly STF logical data + if hasattr(inp, "__class__") and "logical_data" in str(type(inp)): + is_logical_data = True + if hasattr(inp, "symbol") and inp.symbol: + symbol = inp.symbol + # Check if inp has attached STF logical data (Warp array) + elif hasattr(inp, "_stf_ld"): + is_logical_data = True + if hasattr(inp._stf_ld, "symbol") and inp._stf_ld.symbol: + symbol = inp._stf_ld.symbol + # Fallback to _name for Warp arrays + elif hasattr(inp, "_name") and inp._name: + symbol = inp._name + + if is_logical_data: + if symbol: + print(f" [{i}]: '{symbol}' [logical_data]") + else: + print(f" [{i}]: logical_data") + else: + # Regular arguments (scalars, etc.) + if hasattr(inp, "shape"): # Array-like but not logical data + print(f" [{i}]: {type(inp).__name__}") + else: # Scalar value + print(f" [{i}]: {inp}") + print(" ]") + else: + print(f" inputs = {inputs}") + print(f" kwargs = {kwargs}") # just forward to warp for now @@ -62,8 +140,10 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): # put it under wp.stf if not hasattr(wp, "stf"): + class _stf: pass + wp.stf = _stf() @@ -159,7 +239,11 @@ def divergence(u: wp.array2d(dtype=wp.vec2), div: wp.array2d(dtype=float)): @wp.stf.kernel -def pressure_solve(p0: wp.array2d(dtype=float), p1: wp.array2d(dtype=float), div: wp.array2d(dtype=float)): +def pressure_solve( + p0: wp.array2d(dtype=float), + p1: wp.array2d(dtype=float), + div: wp.array2d(dtype=float), +): i, j = wp.tid() s1 = lookup_float(p0, i - 1, j) @@ -203,7 +287,12 @@ def integrate(u: wp.array2d(dtype=wp.vec2), rho: wp.array2d(dtype=float), dt: fl @wp.stf.kernel -def init(rho: wp.array2d(dtype=float), u: wp.array2d(dtype=wp.vec2), radius: int, dir: wp.vec2): +def init( + rho: wp.array2d(dtype=float), + u: wp.array2d(dtype=wp.vec2), + radius: int, + dir: wp.vec2, +): i, j = wp.tid() d = wp.length(wp.vec2(float(i - grid_width / 2), float(j - grid_height / 2))) @@ -236,14 +325,43 @@ def __init__(self): self.p1 = wp.zeros(shape, dtype=float) self.div = wp.zeros(shape, dtype=float) - self.u0._stf_ld = self._stf_ctx.logical_data(self.u0) + # Create STF logical data from Warp arrays with explicit data place + # Warp arrays are on GPU device memory, so specify data_place.device() + + # For regular float arrays, specify device data place + device_place = cudastf.data_place.device(0) + + self.rho0._stf_ld = self._stf_ctx.logical_data(self.rho0, device_place) + self.rho1._stf_ld = self._stf_ctx.logical_data(self.rho1, device_place) + self.p0._stf_ld = 
self._stf_ctx.logical_data(self.p0, device_place) + self.p1._stf_ld = self._stf_ctx.logical_data(self.p1, device_place) + self.div._stf_ld = self._stf_ctx.logical_data(self.div, device_place) + + # vec2 arrays - STF now automatically handles vector type flattening + # Store STF logical data consistently with other arrays + self.u0._stf_ld = self._stf_ctx.logical_data(self.u0, device_place) + self.u1._stf_ld = self._stf_ctx.logical_data(self.u1, device_place) + print( + "✅ Successfully created vec2 STF logical data (automatically flattened by STF)!" + ) + + print("✅ All arrays created with explicit data place specification!") + + # Set descriptive symbols for STF logical data (for enhanced tracing) + self.rho0._stf_ld.set_symbol("density_current") + self.rho1._stf_ld.set_symbol("density_next") + self.p0._stf_ld.set_symbol("pressure_current") + self.p1._stf_ld.set_symbol("pressure_next") + self.div._stf_ld.set_symbol("velocity_divergence") + self.u0._stf_ld.set_symbol("velocity_current") + self.u1._stf_ld.set_symbol("velocity_next") + print("✅ Set descriptive symbols for STF logical data!") + + # Set Warp array names (for Warp tracing) self.u0._name = "u0" - self.u1._name = "u1" - self.rho0._name = "rho0" self.rho1._name = "rho1" - self.p0._name = "p0" self.p1._name = "p1" self.div._name = "div" @@ -276,17 +394,21 @@ def step(self): self.p0.zero_() self.p1.zero_() - # if self.use_cuda_graph: - #  wp.capture_launch(self.graph) - # else: - #  self.pressure_iterations() + # if self.use_cuda_graph: + # wp.capture_launch(self.graph) + # else: + # self.pressure_iterations() self.pressure_iterations() # velocity update wp.stf.launch(pressure_apply, dim=shape, inputs=[self.p0, self.u0]) # semi-Lagrangian advection - wp.stf.launch(advect, dim=shape, inputs=[self.u0, self.u1, self.rho0, self.rho1, dt]) + wp.stf.launch( + advect, + dim=shape, + inputs=[self.u0, self.u1, self.rho0, self.rho1, dt], + ) # swap buffers (self.u0, self.u1) = (self.u1, self.u0) @@ -296,7 +418,9 @@ def step(self): def pressure_iterations(self): for _ in range(self.iterations): - wp.stf.launch(pressure_solve, dim=self.p0.shape, inputs=[self.p0, self.p1, self.div]) + wp.stf.launch( + pressure_solve, dim=self.p0.shape, inputs=[self.p0, self.p1, self.div] + ) # swap pressure fields (self.p0, self.p1) = (self.p1, self.p0) @@ -314,9 +438,15 @@ def step_and_render_frame(self, frame_num=None, img=None): if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("--device", type=str, default=None, help="Override the default Warp device.") - parser.add_argument("--num_frames", type=int, default=100000, help="Total number of frames.") + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument( + "--device", type=str, default=None, help="Override the default Warp device." + ) + parser.add_argument( + "--num_frames", type=int, default=100000, help="Total number of frames." 
+ ) parser.add_argument( "--headless", action="store_true", diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py index 85f7b856bd4..659fc43bfd4 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py @@ -210,4 +210,4 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # Run simplified FDTD simulation using pytorch_task print("Running FDTD simulation with pytorch_task syntax...") - test_fdtd_3d_pytorch_simplified(timesteps=1000, output_freq=5) \ No newline at end of file + test_fdtd_3d_pytorch_simplified(timesteps=1000, output_freq=5) From 5c1d50e46be335c72636e0b88061d13e2aa61f83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 25 Sep 2025 20:56:54 +0200 Subject: [PATCH 184/221] Save WIP: add access modes --- .../cuda_cccl/tests/stf/example_fluid_warp.py | 217 ++++++++++++++---- 1 file changed, 168 insertions(+), 49 deletions(-) diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py index 7797d30a2d5..56c7800e1ea 100644 --- a/python/cuda_cccl/tests/stf/example_fluid_warp.py +++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py @@ -89,50 +89,100 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): print(f" dim = {dim}") print(f" stream = {stream}") - # Enhanced input display with logical data detection + # Process STF dependencies and extract arrays for wp.launch + processed_inputs = [] + stf_dependencies = [] + + # Enhanced input display with STF dependency detection if inputs: print(" inputs = [") for i, inp in enumerate(inputs): - # Detect if input is or contains STF logical data - is_logical_data = False - symbol = None - - # Check if inp is directly STF logical data - if hasattr(inp, "__class__") and "logical_data" in str(type(inp)): - is_logical_data = True - if hasattr(inp, "symbol") and inp.symbol: - symbol = inp.symbol - # Check if inp has attached STF logical data (Warp array) - elif hasattr(inp, "_stf_ld"): - is_logical_data = True - if hasattr(inp._stf_ld, "symbol") and inp._stf_ld.symbol: - symbol = inp._stf_ld.symbol - # Fallback to _name for Warp arrays - elif hasattr(inp, "_name") and inp._name: - symbol = inp._name - - if is_logical_data: + # Check if input is STF dependency wrapper + if isinstance(inp, STFDependency): + # Extract STF dependency information + stf_dependencies.append({ + 'index': i, + 'array': inp.array, + 'mode': inp.mode, + 'data_place': inp.data_place + }) + + # Get symbol for display (STF deps ALWAYS have _stf_ld) + symbol = None + if hasattr(inp.array._stf_ld, 'symbol') and inp.array._stf_ld.symbol: + symbol = inp.array._stf_ld.symbol + elif hasattr(inp.array, '_name') and inp.array._name: + symbol = inp.array._name + if symbol: - print(f" [{i}]: '{symbol}' [logical_data]") + print(f" [{i}]: '{symbol}' [{inp.mode}] [stf_dep]") else: - print(f" [{i}]: logical_data") + print(f" [{i}]: logical_data [{inp.mode}] [stf_dep]") + + # Add unwrapped array to processed inputs + processed_inputs.append(inp.array) + else: - # Regular arguments (scalars, etc.) 
- if hasattr(inp, "shape"): # Array-like but not logical data - print(f" [{i}]: {type(inp).__name__}") - else: # Scalar value - print(f" [{i}]: {inp}") + # Regular input - detect logical data for display + is_logical_data = False + symbol = None + + # Check if inp is directly STF logical data + if hasattr(inp, "__class__") and "logical_data" in str(type(inp)): + is_logical_data = True + if hasattr(inp, "symbol") and inp.symbol: + symbol = inp.symbol + # Check if inp has attached STF logical data (Warp array) + elif hasattr(inp, "_stf_ld"): + is_logical_data = True + if hasattr(inp._stf_ld, "symbol") and inp._stf_ld.symbol: + symbol = inp._stf_ld.symbol + # Fallback to _name for Warp arrays + elif hasattr(inp, "_name") and inp._name: + symbol = inp._name + + if is_logical_data: + if symbol: + print(f" [{i}]: '{symbol}' [logical_data]") + else: + print(f" [{i}]: logical_data") + else: + # Regular arguments (scalars, etc.) + if hasattr(inp, "shape"): # Array-like but not logical data + print(f" [{i}]: {type(inp).__name__}") + else: # Scalar value + print(f" [{i}]: {inp}") + + processed_inputs.append(inp) + print(" ]") else: print(f" inputs = {inputs}") + # Show STF dependency summary + if stf_dependencies: + print(" stf_deps = [") + for dep in stf_dependencies: + # All STF dependencies are guaranteed to have _stf_ld + symbol = None + if hasattr(dep['array']._stf_ld, 'symbol') and dep['array']._stf_ld.symbol: + symbol = dep['array']._stf_ld.symbol + elif hasattr(dep['array'], '_name') and dep['array']._name: + symbol = dep['array']._name + + if symbol: + print(f" {dep['mode'].upper()}: '{symbol}'") + else: + print(f" {dep['mode'].upper()}: logical_data") + print(" ]") + print(f" kwargs = {kwargs}") - # just forward to warp for now + # Launch with processed (unwrapped) inputs return wp.launch( kernel, dim=dim, - inputs=inputs, + inputs=processed_inputs, stream=stream, **kwargs, ) @@ -147,11 +197,63 @@ class _stf: wp.stf = _stf() +# STF dependency wrapper class +class STFDependency: + """Wrapper for STF task dependencies with access mode specification.""" + def __init__(self, array, mode, data_place=None): + # CRITICAL: STF dependencies MUST have logical data attached + if not hasattr(array, '_stf_ld'): + raise ValueError( + f"STF dependency requires array with logical data (_stf_ld). " + f"Array {type(array).__name__} does not have STF logical data. " + f"Create logical data first: array._stf_ld = ctx.logical_data(array, data_place)" + ) + + self.array = array + self.mode = mode # 'read', 'write', 'rw' + self.data_place = data_place + + def __repr__(self): + symbol = None + if hasattr(self.array._stf_ld, 'symbol'): + symbol = self.array._stf_ld.symbol + elif hasattr(self.array, '_name'): + symbol = self.array._name + + if symbol: + return f"STFDependency('{symbol}', {self.mode})" + else: + return f"STFDependency({type(self.array).__name__}, {self.mode})" + +def stf_read(array, data_place=None): + """Mark array as read-only dependency for STF task. + + REQUIRES: array must have _stf_ld (STF logical data) attached. + """ + return STFDependency(array, 'read', data_place) + +def stf_write(array, data_place=None): + """Mark array as write-only dependency for STF task. + + REQUIRES: array must have _stf_ld (STF logical data) attached. + """ + return STFDependency(array, 'write', data_place) + +def stf_rw(array, data_place=None): + """Mark array as read-write dependency for STF task. + + REQUIRES: array must have _stf_ld (STF logical data) attached. 
+ """ + return STFDependency(array, 'rw', data_place) + wp.stf.kernel = stf_kernel wp.stf.launch = stf_launch +wp.stf.read = stf_read +wp.stf.write = stf_write +wp.stf.rw = stf_rw -grid_width = wp.constant(256) -grid_height = wp.constant(128) +grid_width = wp.constant(256*4) +grid_height = wp.constant(128*4) @wp.func @@ -307,7 +409,7 @@ def __init__(self): fps = 60 self.frame_dt = 1.0 / fps self.sim_substeps = 2 - self.iterations = 100 # Number of pressure iterations + self.iterations = 4 #100 # Number of pressure iterations self.sim_dt = self.frame_dt / self.sim_substeps self.sim_time = 0.0 @@ -384,31 +486,46 @@ def step(self): vel = wp.vec2(math.cos(angle) * speed, math.sin(angle) * speed) # update emitters - wp.stf.launch(init, dim=shape, inputs=[self.rho0, self.u0, 5, vel]) + wp.stf.launch(init, dim=shape, inputs=[ + wp.stf.write(self.rho0), # Only writes: rho[i, j] = 1.0 + wp.stf.write(self.u0), # Only writes: u[i, j] = dir + 5, vel + ]) # force integrate - wp.stf.launch(integrate, dim=shape, inputs=[self.u0, self.rho0, dt]) - wp.stf.launch(divergence, dim=shape, inputs=[self.u0, self.div]) + wp.stf.launch(integrate, dim=shape, inputs=[ + wp.stf.rw(self.u0), # Read then write: u[i, j] = u[i, j] + dt * f_g + wp.stf.rw(self.rho0), # Read then write: rho[i, j] = rho[i, j] * (1.0 - 0.1 * dt) + dt + ]) + wp.stf.launch(divergence, dim=shape, inputs=[ + wp.stf.read(self.u0), # Only reads: u[i + 1, j], u[i, j], etc. + wp.stf.write(self.div) # Only writes: div[i, j] = dx + dy + ]) # pressure solve self.p0.zero_() self.p1.zero_() - # if self.use_cuda_graph: - # wp.capture_launch(self.graph) - # else: - # self.pressure_iterations() - self.pressure_iterations() + if self.use_cuda_graph: + wp.capture_launch(self.graph) + else: + self.pressure_iterations() # velocity update - wp.stf.launch(pressure_apply, dim=shape, inputs=[self.p0, self.u0]) + wp.stf.launch(pressure_apply, dim=shape, inputs=[ + wp.stf.read(self.p0), # Only reads: p[i + 1, j] - p[i - 1, j], etc. + wp.stf.rw(self.u0) # Read then write: u[i, j] = u[i, j] - f_p + ]) # semi-Lagrangian advection - wp.stf.launch( - advect, - dim=shape, - inputs=[self.u0, self.u1, self.rho0, self.rho1, dt], - ) + wp.stf.launch(advect, dim=shape, inputs=[ + wp.stf.read(self.u0), # Only reads: u0[i, j] and sample_vel(u0, ...) + wp.stf.write(self.u1), # Only writes: u1[i, j] = sample_vel(...) + wp.stf.read(self.rho0), # Only reads: sample_float(rho0, ...) + wp.stf.write(self.rho1), # Only writes: rho1[i, j] = sample_float(...) + dt + ]) # swap buffers (self.u0, self.u1) = (self.u1, self.u0) @@ -418,9 +535,11 @@ def step(self): def pressure_iterations(self): for _ in range(self.iterations): - wp.stf.launch( - pressure_solve, dim=self.p0.shape, inputs=[self.p0, self.p1, self.div] - ) + wp.stf.launch(pressure_solve, dim=self.p0.shape, inputs=[ + wp.stf.read(self.p0), # Only reads: lookup_float(p0, ...) 
+ wp.stf.write(self.p1), # Only writes: p1[i, j] = err * 0.25 + wp.stf.read(self.div) # Only reads: div[i, j] + ]) # swap pressure fields (self.p0, self.p1) = (self.p1, self.p0) From 9f31b1e82500d5a60ea13062fe631f5c42edf02b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 25 Sep 2025 22:19:50 +0200 Subject: [PATCH 185/221] cleanups --- .../cuda_cccl/tests/stf/example_fluid_warp.py | 250 ++++++++++++------ 1 file changed, 170 insertions(+), 80 deletions(-) diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py index 56c7800e1ea..5b6bd40464d 100644 --- a/python/cuda_cccl/tests/stf/example_fluid_warp.py +++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py @@ -100,28 +100,30 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): # Check if input is STF dependency wrapper if isinstance(inp, STFDependency): # Extract STF dependency information - stf_dependencies.append({ - 'index': i, - 'array': inp.array, - 'mode': inp.mode, - 'data_place': inp.data_place - }) - + stf_dependencies.append( + { + "index": i, + "array": inp.array, + "mode": inp.mode, + "data_place": inp.data_place, + } + ) + # Get symbol for display (STF deps ALWAYS have _stf_ld) symbol = None - if hasattr(inp.array._stf_ld, 'symbol') and inp.array._stf_ld.symbol: + if hasattr(inp.array._stf_ld, "symbol") and inp.array._stf_ld.symbol: symbol = inp.array._stf_ld.symbol - elif hasattr(inp.array, '_name') and inp.array._name: + elif hasattr(inp.array, "_name") and inp.array._name: symbol = inp.array._name - + if symbol: print(f" [{i}]: '{symbol}' [{inp.mode}] [stf_dep]") else: print(f" [{i}]: logical_data [{inp.mode}] [stf_dep]") - + # Add unwrapped array to processed inputs processed_inputs.append(inp.array) - + else: # Regular input - detect logical data for display is_logical_data = False @@ -152,9 +154,9 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): print(f" [{i}]: {type(inp).__name__}") else: # Scalar value print(f" [{i}]: {inp}") - + processed_inputs.append(inp) - + print(" ]") else: print(f" inputs = {inputs}") @@ -165,11 +167,11 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): for dep in stf_dependencies: # All STF dependencies are guaranteed to have _stf_ld symbol = None - if hasattr(dep['array']._stf_ld, 'symbol') and dep['array']._stf_ld.symbol: - symbol = dep['array']._stf_ld.symbol - elif hasattr(dep['array'], '_name') and dep['array']._name: - symbol = dep['array']._name - + if hasattr(dep["array"]._stf_ld, "symbol") and dep["array"]._stf_ld.symbol: + symbol = dep["array"]._stf_ld.symbol + elif hasattr(dep["array"], "_name") and dep["array"]._name: + symbol = dep["array"]._name + if symbol: print(f" {dep['mode'].upper()}: '{symbol}'") else: @@ -178,14 +180,68 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): print(f" kwargs = {kwargs}") - # Launch with processed (unwrapped) inputs - return wp.launch( - kernel, - dim=dim, - inputs=processed_inputs, - stream=stream, - **kwargs, - ) + # STF launch REQUIRES STF dependencies - otherwise use regular wp.launch + if not stf_dependencies: + raise ValueError( + "wp.stf.launch() requires STF dependencies (wp.stf.read/write/rw). " + f"Found {len(inputs)} inputs but none are STF dependencies. " + "Either use regular wp.launch() or wrap arrays with wp.stf.read/write/rw(array)." 
+ ) + + # STF Task-based launch with automatic dependency management + print(" → Creating STF task with dependencies") + + # Extract the STF context from the first dependency + first_dep = stf_dependencies[0] + stf_ctx = first_dep["array"]._stf_ld.borrow_ctx_handle() + + # Create STF dependency objects for the task + stf_task_deps = [] + for dep in stf_dependencies: + stf_ld = dep["array"]._stf_ld + if dep["mode"] == "read": + stf_task_deps.append(stf_ld.read()) + elif dep["mode"] == "write": + stf_task_deps.append(stf_ld.write()) + elif dep["mode"] == "rw": + stf_task_deps.append(stf_ld.rw()) + + # Create and execute STF task + with stf_ctx.task(*stf_task_deps) as stf_task: + # Get raw CUDA stream pointer from STF task + stf_stream_ptr = stf_task.stream_ptr() + + print(f" → STF task stream ptr: {stf_stream_ptr}") + print(" → Launching kernel within STF task context") + + # Wrap STF stream via PyTorch ExternalStream -> Warp conversion + print(f" → STF task stream ptr: {stf_stream_ptr}") + print(" → Creating PyTorch ExternalStream from STF stream") + + # Import PyTorch for stream conversion + import torch + + # Get the current CUDA device for PyTorch + warp_device = wp.get_device() + device_id = warp_device.ordinal # Get device number (e.g., 0 for cuda:0) + torch_device = torch.device(f"cuda:{device_id}") + + # Create PyTorch ExternalStream from STF stream pointer with explicit device + torch_stream = torch.cuda.ExternalStream(stf_stream_ptr, device=torch_device) + + # Convert PyTorch stream to Warp stream + warp_stream = wp.stream_from_torch(torch_stream) + + print(f" → Successfully wrapped STF stream via PyTorch: {warp_stream}") + + # Launch with properly wrapped STF stream + return wp.launch( + kernel, + dim=dim, + inputs=processed_inputs, + stream=warp_stream, + **kwargs, + ) # put it under wp.stf @@ -197,63 +253,68 @@ class _stf: wp.stf = _stf() -# STF dependency wrapper class +# STF dependency wrapper class class STFDependency: """Wrapper for STF task dependencies with access mode specification.""" + def __init__(self, array, mode, data_place=None): # CRITICAL: STF dependencies MUST have logical data attached - if not hasattr(array, '_stf_ld'): + if not hasattr(array, "_stf_ld"): raise ValueError( f"STF dependency requires array with logical data (_stf_ld). " f"Array {type(array).__name__} does not have STF logical data. " f"Create logical data first: array._stf_ld = ctx.logical_data(array, data_place)" ) - + self.array = array self.mode = mode # 'read', 'write', 'rw' self.data_place = data_place - + def __repr__(self): symbol = None - if hasattr(self.array._stf_ld, 'symbol'): + if hasattr(self.array._stf_ld, "symbol"): symbol = self.array._stf_ld.symbol - elif hasattr(self.array, '_name'): + elif hasattr(self.array, "_name"): symbol = self.array._name - + if symbol: return f"STFDependency('{symbol}', {self.mode})" else: return f"STFDependency({type(self.array).__name__}, {self.mode})" + def stf_read(array, data_place=None): """Mark array as read-only dependency for STF task. - + REQUIRES: array must have _stf_ld (STF logical data) attached. """ - return STFDependency(array, 'read', data_place) + return STFDependency(array, "read", data_place) + def stf_write(array, data_place=None): """Mark array as write-only dependency for STF task. - + REQUIRES: array must have _stf_ld (STF logical data) attached. 
- """ - return STFDependency(array, 'write', data_place) + """ + return STFDependency(array, "write", data_place) + def stf_rw(array, data_place=None): """Mark array as read-write dependency for STF task. - + REQUIRES: array must have _stf_ld (STF logical data) attached. """ - return STFDependency(array, 'rw', data_place) + return STFDependency(array, "rw", data_place) + wp.stf.kernel = stf_kernel wp.stf.launch = stf_launch wp.stf.read = stf_read -wp.stf.write = stf_write +wp.stf.write = stf_write wp.stf.rw = stf_rw -grid_width = wp.constant(256*4) -grid_height = wp.constant(128*4) +grid_width = wp.constant(256 * 4) +grid_height = wp.constant(128 * 4) @wp.func @@ -408,12 +469,17 @@ class Example: def __init__(self): fps = 60 self.frame_dt = 1.0 / fps - self.sim_substeps = 2 - self.iterations = 4 #100 # Number of pressure iterations + self.sim_substeps = 10 + self.iterations = 100 # Number of pressure iterations self.sim_dt = self.frame_dt / self.sim_substeps self.sim_time = 0.0 - self._stf_ctx = cudastf.context() + # Create STF context for task-based scheduling + # This enables automatic dependency management and stream orchestration + import torch + + torch.cuda.init() + self._stf_ctx = cudastf.context() # use_graph=True) shape = (grid_width, grid_height) @@ -439,8 +505,6 @@ def __init__(self): self.p1._stf_ld = self._stf_ctx.logical_data(self.p1, device_place) self.div._stf_ld = self._stf_ctx.logical_data(self.div, device_place) - # vec2 arrays - STF now automatically handles vector type flattening - # Store STF logical data consistently with other arrays self.u0._stf_ld = self._stf_ctx.logical_data(self.u0, device_place) self.u1._stf_ld = self._stf_ctx.logical_data(self.u1, device_place) print( @@ -469,7 +533,7 @@ def __init__(self): self.div._name = "div" # capture pressure solve as a CUDA graph - self.use_cuda_graph = wp.get_device().is_cuda + self.use_cuda_graph = False # wp.get_device().is_cuda if self.use_cuda_graph: with wp.ScopedCapture() as capture: self.pressure_iterations() @@ -486,24 +550,38 @@ def step(self): vel = wp.vec2(math.cos(angle) * speed, math.sin(angle) * speed) # update emitters - wp.stf.launch(init, dim=shape, inputs=[ - wp.stf.write(self.rho0), # Only writes: rho[i, j] = 1.0 - wp.stf.write(self.u0), # Only writes: u[i, j] = dir - 5, vel - ]) + wp.stf.launch( + init, + dim=shape, + inputs=[ + wp.stf.write(self.rho0), + wp.stf.write(self.u0), + 5, + vel, + ], + ) # force integrate - wp.stf.launch(integrate, dim=shape, inputs=[ - wp.stf.rw(self.u0), # Read then write: u[i, j] = u[i, j] + dt * f_g - wp.stf.rw(self.rho0), # Read then write: rho[i, j] = rho[i, j] * (1.0 - 0.1 * dt) - dt - ]) - wp.stf.launch(divergence, dim=shape, inputs=[ - wp.stf.read(self.u0), # Only reads: u[i + 1, j], u[i, j], etc. - wp.stf.write(self.div) # Only writes: div[i, j] = dx + dy - ]) + wp.stf.launch( + integrate, + dim=shape, + inputs=[ + wp.stf.rw(self.u0), + wp.stf.rw(self.rho0), + dt, + ], + ) + wp.stf.launch( + divergence, + dim=shape, + inputs=[ + wp.stf.read(self.u0), + wp.stf.write(self.div), + ], + ) # pressure solve + # TODO tasks ? self.p0.zero_() self.p1.zero_() @@ -513,19 +591,27 @@ def step(self): self.pressure_iterations() # velocity update - wp.stf.launch(pressure_apply, dim=shape, inputs=[ - wp.stf.read(self.p0), # Only reads: p[i + 1, j] - p[i - 1, j], etc. 
- wp.stf.rw(self.u0) # Read then write: u[i, j] = u[i, j] - f_p - ]) + wp.stf.launch( + pressure_apply, + dim=shape, + inputs=[ + wp.stf.read(self.p0), + wp.stf.rw(self.u0), + ], + ) # semi-Lagrangian advection - wp.stf.launch(advect, dim=shape, inputs=[ - wp.stf.read(self.u0), # Only reads: u0[i, j] and sample_vel(u0, ...) - wp.stf.write(self.u1), # Only writes: u1[i, j] = sample_vel(...) - wp.stf.read(self.rho0), # Only reads: sample_float(rho0, ...) - wp.stf.write(self.rho1), # Only writes: rho1[i, j] = sample_float(...) - dt - ]) + wp.stf.launch( + advect, + dim=shape, + inputs=[ + wp.stf.read(self.u0), + wp.stf.write(self.u1), + wp.stf.read(self.rho0), + wp.stf.write(self.rho1), + dt, + ], + ) # swap buffers (self.u0, self.u1) = (self.u1, self.u0) @@ -535,11 +621,15 @@ def step(self): def pressure_iterations(self): for _ in range(self.iterations): - wp.stf.launch(pressure_solve, dim=self.p0.shape, inputs=[ - wp.stf.read(self.p0), # Only reads: lookup_float(p0, ...) - wp.stf.write(self.p1), # Only writes: p1[i, j] = err * 0.25 - wp.stf.read(self.div) # Only reads: div[i, j] - ]) + wp.stf.launch( + pressure_solve, + dim=self.p0.shape, + inputs=[ + wp.stf.read(self.p0), + wp.stf.write(self.p1), + wp.stf.read(self.div), + ], + ) # swap pressure fields (self.p0, self.p1) = (self.p1, self.p0) From c0bb0704cc9ce9c25ac8bcf2fa61a416704d6696 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 25 Sep 2025 22:34:19 +0200 Subject: [PATCH 186/221] Save WIP --- .../cuda_cccl/tests/stf/example_fluid_warp.py | 175 ++++++++++-------- 1 file changed, 102 insertions(+), 73 deletions(-) diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py index 5b6bd40464d..d3cb1f4e358 100644 --- a/python/cuda_cccl/tests/stf/example_fluid_warp.py +++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py @@ -85,17 +85,12 @@ def _stf_call(*args, dim=None, stream=None, **kwargs): def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): - print(f"[STF TRACE] launching kernel: {getattr(kernel, '__name__', kernel)}") - print(f" dim = {dim}") - print(f" stream = {stream}") - # Process STF dependencies and extract arrays for wp.launch processed_inputs = [] stf_dependencies = [] - # Enhanced input display with STF dependency detection + # Process inputs to separate STF dependencies from regular arguments if inputs: - print(" inputs = [") for i, inp in enumerate(inputs): # Check if input is STF dependency wrapper if isinstance(inp, STFDependency): @@ -108,7 +103,104 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): "data_place": inp.data_place, } ) + # Add unwrapped array to processed inputs + processed_inputs.append(inp.array) + else: + processed_inputs.append(inp) + + # STF launch REQUIRES STF dependencies - otherwise use regular wp.launch + if not stf_dependencies: + raise ValueError( + "wp.stf.launch() requires STF dependencies (wp.stf.read/write/rw). " + f"Found {len(inputs)} inputs but none are STF dependencies. " + "Either use regular wp.launch() or wrap arrays with wp.stf.read/write/rw(array)." 
+ ) + + # Print tracing information (controlled by STF_TRACE_ENABLED) + _trace_stf_launch(kernel, dim, stream, inputs, kwargs, stf_dependencies) + + # Extract the STF context from the first dependency + first_dep = stf_dependencies[0] + stf_ctx = first_dep["array"]._stf_ld.borrow_ctx_handle() + + # Create STF dependency objects for the task + stf_task_deps = [] + for dep in stf_dependencies: + stf_ld = dep["array"]._stf_ld + if dep["mode"] == "read": + stf_task_deps.append(stf_ld.read()) + elif dep["mode"] == "write": + stf_task_deps.append(stf_ld.write()) + elif dep["mode"] == "rw": + stf_task_deps.append(stf_ld.rw()) + + # Create and execute STF task + with stf_ctx.task(*stf_task_deps) as stf_task: + # Get raw CUDA stream pointer from STF task + stf_stream_ptr = stf_task.stream_ptr() + + # Import PyTorch for stream conversion + import torch + + # Get the current CUDA device for PyTorch + warp_device = wp.get_device() + device_id = warp_device.ordinal # Get device number (e.g., 0 for cuda:0) + torch_device = torch.device(f"cuda:{device_id}") + + # Create PyTorch ExternalStream from STF stream pointer with explicit device + torch_stream = torch.cuda.ExternalStream(stf_stream_ptr, device=torch_device) + + # Convert PyTorch stream to Warp stream + warp_stream = wp.stream_from_torch(torch_stream) + + # Launch with properly wrapped STF stream + return wp.launch( + kernel, + dim=dim, + inputs=processed_inputs, + stream=warp_stream, + **kwargs, + ) + + +# STF tracing configuration +STF_TRACE_ENABLED = True # Set to False to disable STF tracing + + +def set_stf_trace(enabled: bool): + """Enable or disable STF tracing output. + + Args: + enabled: True to enable tracing, False to disable + """ + global STF_TRACE_ENABLED + STF_TRACE_ENABLED = enabled + + +def get_stf_trace() -> bool: + """Get current STF tracing state. + + Returns: + True if tracing is enabled, False otherwise + """ + return STF_TRACE_ENABLED + +def _trace_stf_launch(kernel, dim, stream, inputs, kwargs, stf_dependencies): + """Print STF launch tracing information if enabled.""" + if not STF_TRACE_ENABLED: + return + + print(f"[STF TRACE] launching kernel: {getattr(kernel, '__name__', kernel)}") + print(f" dim = {dim}") + print(f" stream = {stream}") + + # Enhanced input display with STF dependency detection + if inputs: + print(" inputs = [") + for i, inp in enumerate(inputs): + # Check if input is STF dependency wrapper + if isinstance(inp, STFDependency): # Get symbol for display (STF deps ALWAYS have _stf_ld) symbol = None if hasattr(inp.array._stf_ld, "symbol") and inp.array._stf_ld.symbol: @@ -121,9 +213,6 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): else: print(f" [{i}]: logical_data [{inp.mode}] [stf_dep]") - # Add unwrapped array to processed inputs - processed_inputs.append(inp.array) - else: # Regular input - detect logical data for display is_logical_data = False @@ -155,8 +244,6 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): else: # Scalar value print(f" [{i}]: {inp}") - processed_inputs.append(inp) - print(" ]") else: print(f" inputs = {inputs}") @@ -179,70 +266,8 @@ def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): print(" ]") print(f" kwargs = {kwargs}") - - # STF launch REQUIRES STF dependencies - otherwise use regular wp.launch - if not stf_dependencies: - raise ValueError( - "wp.stf.launch() requires STF dependencies (wp.stf.read/write/rw). " - f"Found {len(inputs)} inputs but none are STF dependencies. 
" - "Either use regular wp.launch() or wrap arrays with wp.stf.read/write/rw(array)." - ) - - # STF Task-based launch with automatic dependency management print(" → Creating STF task with dependencies") - # Extract the STF context from the first dependency - first_dep = stf_dependencies[0] - stf_ctx = first_dep["array"]._stf_ld.borrow_ctx_handle() - - # Create STF dependency objects for the task - stf_task_deps = [] - for dep in stf_dependencies: - stf_ld = dep["array"]._stf_ld - if dep["mode"] == "read": - stf_task_deps.append(stf_ld.read()) - elif dep["mode"] == "write": - stf_task_deps.append(stf_ld.write()) - elif dep["mode"] == "rw": - stf_task_deps.append(stf_ld.rw()) - - # Create and execute STF task - with stf_ctx.task(*stf_task_deps) as stf_task: - # Get raw CUDA stream pointer from STF task - stf_stream_ptr = stf_task.stream_ptr() - - print(f" → STF task stream ptr: {stf_stream_ptr}") - print(" → Launching kernel within STF task context") - - # Wrap STF stream via PyTorch ExternalStream -> Warp conversion - print(f" → STF task stream ptr: {stf_stream_ptr}") - print(" → Creating PyTorch ExternalStream from STF stream") - - # Import PyTorch for stream conversion - import torch - - # Get the current CUDA device for PyTorch - warp_device = wp.get_device() - device_id = warp_device.ordinal # Get device number (e.g., 0 for cuda:0) - torch_device = torch.device(f"cuda:{device_id}") - - # Create PyTorch ExternalStream from STF stream pointer with explicit device - torch_stream = torch.cuda.ExternalStream(stf_stream_ptr, device=torch_device) - - # Convert PyTorch stream to Warp stream - warp_stream = wp.stream_from_torch(torch_stream) - - print(f" → Successfully wrapped STF stream via PyTorch: {warp_stream}") - - # Launch with properly wrapped STF stream - return wp.launch( - kernel, - dim=dim, - inputs=processed_inputs, - stream=warp_stream, - **kwargs, - ) - # put it under wp.stf if not hasattr(wp, "stf"): @@ -313,6 +338,10 @@ def stf_rw(array, data_place=None): wp.stf.write = stf_write wp.stf.rw = stf_rw +# STF tracing control functions +wp.stf.set_trace = set_stf_trace +wp.stf.get_trace = get_stf_trace + grid_width = wp.constant(256 * 4) grid_height = wp.constant(128 * 4) From 76d78b48db4642e829502798ca46159e1f2b6252 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 8 Oct 2025 08:41:25 +0200 Subject: [PATCH 187/221] Adopt to new python hierarchy --- python/cuda_cccl/cuda/{cccl/experimental => }/stf/__init__.py | 0 .../cuda/{cccl/experimental => }/stf/_adapters/numba_bridge.py | 0 .../cuda/{cccl/experimental => }/stf/_adapters/numba_utils.py | 0 .../cuda/{cccl/experimental => }/stf/_adapters/torch_bridge.py | 0 .../cuda_cccl/cuda/{cccl/experimental => }/stf/_stf_bindings.py | 0 .../cuda/{cccl/experimental => }/stf/_stf_bindings_impl.pyx | 0 python/cuda_cccl/cuda/{cccl/experimental => }/stf/decorator.py | 0 python/cuda_cccl/pyproject.toml | 2 +- 8 files changed, 1 insertion(+), 1 deletion(-) rename python/cuda_cccl/cuda/{cccl/experimental => }/stf/__init__.py (100%) rename python/cuda_cccl/cuda/{cccl/experimental => }/stf/_adapters/numba_bridge.py (100%) rename python/cuda_cccl/cuda/{cccl/experimental => }/stf/_adapters/numba_utils.py (100%) rename python/cuda_cccl/cuda/{cccl/experimental => }/stf/_adapters/torch_bridge.py (100%) rename python/cuda_cccl/cuda/{cccl/experimental => }/stf/_stf_bindings.py (100%) rename python/cuda_cccl/cuda/{cccl/experimental => }/stf/_stf_bindings_impl.pyx (100%) rename python/cuda_cccl/cuda/{cccl/experimental => 
}/stf/decorator.py (100%) diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py b/python/cuda_cccl/cuda/stf/__init__.py similarity index 100% rename from python/cuda_cccl/cuda/cccl/experimental/stf/__init__.py rename to python/cuda_cccl/cuda/stf/__init__.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py b/python/cuda_cccl/cuda/stf/_adapters/numba_bridge.py similarity index 100% rename from python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_bridge.py rename to python/cuda_cccl/cuda/stf/_adapters/numba_bridge.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py b/python/cuda_cccl/cuda/stf/_adapters/numba_utils.py similarity index 100% rename from python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/numba_utils.py rename to python/cuda_cccl/cuda/stf/_adapters/numba_utils.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py b/python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py similarity index 100% rename from python/cuda_cccl/cuda/cccl/experimental/stf/_adapters/torch_bridge.py rename to python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings.py b/python/cuda_cccl/cuda/stf/_stf_bindings.py similarity index 100% rename from python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings.py rename to python/cuda_cccl/cuda/stf/_stf_bindings.py diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx similarity index 100% rename from python/cuda_cccl/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx rename to python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx diff --git a/python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py b/python/cuda_cccl/cuda/stf/decorator.py similarity index 100% rename from python/cuda_cccl/cuda/cccl/experimental/stf/decorator.py rename to python/cuda_cccl/cuda/stf/decorator.py diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index 6fdb0ff854f..14561449098 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "numpy", "cuda-pathfinder>=1.2.3", "cuda-core", - "numba-cuda @ git+https://github.com/caugonnet/numba-cuda.git@cuda_graph_future_memory", # TODO: remove this once numba-cuda 0.19.2 is released + "numba-cuda @ git+https://github.com/caugonnet/numba-cuda.git@cuda_graph_future_memory", ] dynamic = ["version"] From 0c11b6a2f839c4ba74d506ed4383e4b5f3d589a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 8 Oct 2025 09:27:19 +0200 Subject: [PATCH 188/221] fix errors in a previous merge --- python/cuda_cccl/CMakeLists.txt | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 6d81617cf46..b5c1c511324 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -65,6 +65,7 @@ install( file(MAKE_DIRECTORY "cuda/stf/${CUDA_VERSION_DIR}/cccl") file(MAKE_DIRECTORY "cuda/compute/${CUDA_VERSION_DIR}/cccl") +# Install version-specific binaries install( TARGETS cccl.c.experimental.stf DESTINATION cuda/stf/${CUDA_VERSION_DIR}/cccl @@ -117,8 +118,24 @@ set(pyx_source_file "${cuda_cccl_SOURCE_DIR}/cuda/compute/_bindings_impl.pyx") set(_generated_extension_src "${cuda_cccl_BINARY_DIR}/_bindings_impl.c") set(_depfile "${cuda_cccl_BINARY_DIR}/_bindings_impl.c.dep") +# Custom 
Cython compilation command for version-specific target +add_custom_command( + OUTPUT "${_generated_extension_src}" + COMMAND "${Python3_EXECUTABLE}" -m cython + ARGS ${CYTHON_FLAGS_LIST} "${pyx_source_file}" --output-file "${_generated_extension_src}" + DEPENDS "${pyx_source_file}" + DEPFILE "${_depfile}" + COMMENT "Cythonizing ${pyx_source_file} for CUDA ${CUDA_VERSION_MAJOR}" +) + +set_source_files_properties("${_generated_extension_src}" PROPERTIES GENERATED TRUE) +add_custom_target(cythonize_bindings_impl ALL + DEPENDS "${_generated_extension_src}" +) + + message(STATUS "STF Using Cython ${CYTHON_VERSION}") -set(stf_pyx_source_file "${cuda_cccl_SOURCE_DIR}/cuda/cccl/experimental/stf/_stf_bindings_impl.pyx") +set(stf_pyx_source_file "${cuda_cccl_SOURCE_DIR}/cuda/stf/_stf_bindings_impl.pyx") set(_stf_generated_extension_src "${cuda_cccl_BINARY_DIR}/_stf_bindings_impl.c") set(_stf_depfile "${cuda_cccl_BINARY_DIR}/_stf_bindings_impl.c.dep") add_custom_command( @@ -134,6 +151,11 @@ add_custom_target(cythonize_stf_bindings_impl ALL DEPENDS "${_stf_generated_extension_src}" ) +Python3_add_library(_bindings_impl MODULE WITH_SOABI "${_generated_extension_src}") +add_dependencies(_bindings_impl cythonize_bindings_impl) +target_link_libraries(_bindings_impl PRIVATE cccl.c.parallel CUDA::cuda_driver) +set_target_properties(_bindings_impl PROPERTIES INSTALL_RPATH "$ORIGIN/cccl") + Python3_add_library(_stf_bindings_impl MODULE WITH_SOABI "${_stf_generated_extension_src}") add_dependencies(_stf_bindings_impl cythonize_stf_bindings_impl) target_link_libraries(_stf_bindings_impl PRIVATE cccl.c.experimental.stf CUDA::cuda_driver) From f6c50e1e743e3943fb7e22ca10a74e0be858a8a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 8 Oct 2025 09:37:00 +0200 Subject: [PATCH 189/221] cuda.cccl.experimental.stf => cuda.stf --- python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx | 6 +++--- python/cuda_cccl/cuda/stf/decorator.py | 2 +- python/cuda_cccl/tests/stf/example_fluid_warp.py | 2 +- python/cuda_cccl/tests/stf/test_context.py | 2 +- python/cuda_cccl/tests/stf/test_decorator.py | 2 +- python/cuda_cccl/tests/stf/test_fdtd_pytorch.py | 2 +- python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py | 2 +- python/cuda_cccl/tests/stf/test_fhe.py | 2 +- python/cuda_cccl/tests/stf/test_fhe_decorator.py | 2 +- python/cuda_cccl/tests/stf/test_numba.py | 2 +- python/cuda_cccl/tests/stf/test_pytorch.py | 2 +- python/cuda_cccl/tests/stf/test_stencil_decorator.py | 2 +- 12 files changed, 14 insertions(+), 14 deletions(-) diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx index f8551b83da8..099f6869c46 100644 --- a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx @@ -502,7 +502,7 @@ cdef class task: def get_arg_numba(self, index): cai = self.get_arg_cai(index) try: - from cuda.cccl.experimental.stf._adapters.numba_bridge import cai_to_numba + from cuda.stf._adapters.numba_bridge import cai_to_numba except Exception as e: raise RuntimeError("numba support is not available") from e return cai_to_numba(cai) @@ -516,7 +516,7 @@ cdef class task: def get_arg_as_tensor(self, index): cai = self.get_arg_cai(index) try: - from cuda.cccl.experimental.stf._adapters.torch_bridge import cai_to_torch + from cuda.stf._adapters.torch_bridge import cai_to_torch except Exception as e: raise RuntimeError("PyTorch support is not available") from e return cai_to_torch(cai) @@ -755,7 +755,7 @@ cdef class 
context: # Initialize with the specified value using NUMBA # The numba code already handles None properly by calling ld.write() without data place try: - from cuda.cccl.experimental.stf._adapters.numba_utils import init_logical_data + from cuda.stf._adapters.numba_utils import init_logical_data init_logical_data(self, ld, fill_value, where, exec_place) except ImportError as e: raise RuntimeError("NUMBA support is not available for logical_data_full") from e diff --git a/python/cuda_cccl/cuda/stf/decorator.py b/python/cuda_cccl/cuda/stf/decorator.py index c7179d2a6fc..50575229a0e 100644 --- a/python/cuda_cccl/cuda/stf/decorator.py +++ b/python/cuda_cccl/cuda/stf/decorator.py @@ -1,7 +1,7 @@ import numba from numba import cuda -from cuda.cccl.experimental.stf import context, dep, exec_place +from cuda.stf import context, dep, exec_place numba.config.CUDA_ENABLE_PYNVJITLINK = 1 diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py index d3cb1f4e358..66fc1f4bc7c 100644 --- a/python/cuda_cccl/tests/stf/example_fluid_warp.py +++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py @@ -26,7 +26,7 @@ import warp as wp import warp.render -import cuda.cccl.experimental.stf as cudastf +import cuda.stf as cudastf def stf_kernel(pyfunc): diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py index b306cf3571f..f4a583de351 100644 --- a/python/cuda_cccl/tests/stf/test_context.py +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -4,7 +4,7 @@ import numpy as np -from cuda.cccl.experimental.stf._stf_bindings import context, read, rw +from cuda.stf._stf_bindings import context, read, rw def test_ctx(): diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 49605ced878..7920f9eb83d 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -6,7 +6,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf +import cuda.stf as cudastf @cudastf.jit diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index b5209c9d04c..a64845055ce 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -6,7 +6,7 @@ import torch import torch.cuda as tc -from cuda.cccl.experimental.stf._stf_bindings import ( +from cuda.stf._stf_bindings import ( context, ) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py index 659fc43bfd4..24af8361162 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py @@ -5,7 +5,7 @@ import numpy as np import torch -from cuda.cccl.experimental.stf._stf_bindings import ( +from cuda.stf._stf_bindings import ( context, ) diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index 7331eeba658..fdd2b1e0259 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -10,7 +10,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf +import cuda.stf as cudastf class Plaintext: diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index 
0bde583bafe..d20b7280c75 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -10,7 +10,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf +import cuda.stf as cudastf class Plaintext: diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 72b9609276f..6d46cdf2829 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -11,7 +11,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -from cuda.cccl.experimental.stf._stf_bindings import ( +from cuda.stf._stf_bindings import ( context, data_place, exec_place, diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index 4c0d180b407..001a7002d08 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -12,7 +12,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -from cuda.cccl.experimental.stf._stf_bindings import ( # noqa: E402 +from cuda.stf._stf_bindings import ( # noqa: E402 context, rw, ) diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index ebfd71de46e..16d0ec0e055 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -5,7 +5,7 @@ numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.cccl.experimental.stf as cudastf +import cuda.stf as cudastf @cudastf.jit From efea184143dfa88f077c6b2778574391a283d1b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 8 Oct 2025 10:02:04 +0200 Subject: [PATCH 190/221] Misc stf python tests improvements --- python/cuda_cccl/cuda/stf/decorator.py | 4 +- python/cuda_cccl/tests/stf/test_decorator.py | 16 +++--- python/cuda_cccl/tests/stf/test_fhe.py | 9 ++-- .../cuda_cccl/tests/stf/test_fhe_decorator.py | 5 +- python/cuda_cccl/tests/stf/test_numba.py | 49 +++++++++---------- .../tests/stf/test_stencil_decorator.py | 4 +- 6 files changed, 42 insertions(+), 45 deletions(-) diff --git a/python/cuda_cccl/cuda/stf/decorator.py b/python/cuda_cccl/cuda/stf/decorator.py index 50575229a0e..65af9734f44 100644 --- a/python/cuda_cccl/cuda/stf/decorator.py +++ b/python/cuda_cccl/cuda/stf/decorator.py @@ -3,7 +3,7 @@ from cuda.stf import context, dep, exec_place -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 class stf_kernel_decorator: @@ -59,7 +59,7 @@ def __call__(self, *args, **kwargs): for i, a in enumerate(args): # print(f"got one arg {a} is dep ? 
{isinstance(a, dep)}") if isinstance(a, dep): - if ctx == None: + if ctx is None: ld = a.get_ld() # This context will be used in the __call__ method itself # so we can create a temporary object from the handle diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 7920f9eb83d..16bc2539538 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -3,20 +3,20 @@ import pytest from numba import cuda -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 -numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +import cuda.stf as stf -import cuda.stf as cudastf +numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -@cudastf.jit +@stf.jit def axpy(a, x, y): i = cuda.grid(1) if i < x.size: y[i] = a * x[i] + y[i] -@cudastf.jit +@stf.jit def scale(a, x): i = cuda.grid(1) if i < x.size: @@ -27,18 +27,18 @@ def scale(a, x): def test_decorator(use_graph): X, Y, Z = (np.ones(16, np.float32) for _ in range(3)) - ctx = cudastf.context(use_graph=use_graph) + ctx = stf.context(use_graph=use_graph) lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) scale[32, 64](2.0, lX.rw()) axpy[32, 64](2.0, lX.read(), lY.rw()) - axpy[32, 64, cudastf.exec_place.device(0)]( + axpy[32, 64, stf.exec_place.device(0)]( 2.0, lX.read(), lZ.rw() ) # explicit exec place axpy[32, 64]( - 2.0, lY.read(), lZ.rw(cudastf.data_place.device(0)) + 2.0, lY.read(), lZ.rw(stf.data_place.device(0)) ) # per-dep placement override diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index fdd2b1e0259..d0bbdd3d596 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -7,11 +7,11 @@ import numba from numba import cuda +import cuda.stf as stf + numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.stf as cudastf - class Plaintext: # Initialize from actual values, or from a logical data @@ -34,9 +34,10 @@ def encrypt(self) -> "Ciphertext": def print_values(self): with ctx.task( - cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) + stf.exec_place.host(), self.l.read(stf.data_place.managed()) ) as t: nb_stream = cuda.external_stream(t.stream_ptr()) + nb_stream.synchronize() hvalues = t.numba_arguments() print([v for v in hvalues]) @@ -145,7 +146,7 @@ def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: def test_fhe(): """Test Fully Homomorphic Encryption (FHE) example with logical operations.""" global ctx # Make ctx accessible to the classes - ctx = cudastf.context(use_graph=False) + ctx = stf.context(use_graph=False) vA = [3, 3, 2, 2, 17] pA = Plaintext(ctx, vA) diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index d20b7280c75..571ff8013ea 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -7,11 +7,11 @@ import numba from numba import cuda +import cuda.stf as cudastf + numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.stf as cudastf - class Plaintext: # Initialize from actual values, or from a logical data @@ -37,6 +37,7 @@ def print_values(self): cudastf.exec_place.host(), self.l.read(cudastf.data_place.managed()) ) as t: nb_stream = cuda.external_stream(t.stream_ptr()) + nb_stream.synchronize() hvalues = t.numba_arguments() print([v for v in hvalues]) diff --git 
a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index 6d46cdf2829..d15ae639bda 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -8,17 +8,10 @@ import pytest from numba import cuda -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 -numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +import cuda.stf as stf -from cuda.stf._stf_bindings import ( - context, - data_place, - exec_place, - read, - rw, - write, -) +numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 @cuda.jit @@ -38,9 +31,9 @@ def scale(a, x): # One test with a single kernel in a CUDA graph def test_numba_graph(): X = np.ones(16, dtype=np.float32) - ctx = context(use_graph=True) + ctx = stf.context(use_graph=True) lX = ctx.logical_data(X) - with ctx.task(rw(lX)) as t: + with ctx.task(lX.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.numba_arguments() scale[32, 64, nb_stream](2.0, dX) @@ -61,30 +54,30 @@ def test_numba(): Y = np.ones(n, dtype=np.float32) Z = np.ones(n, dtype=np.float32) - ctx = context() + ctx = stf.context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) - with ctx.task(rw(lX)) as t: + with ctx.task(lX.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.numba_arguments() # dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) scale[32, 64, nb_stream](2.0, dX) - with ctx.task(read(lX), rw(lY)) as t: + with ctx.task(lX.read(), lY.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) print(nb_stream) dX = t.get_arg_numba(0) dY = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dY) - with ctx.task(read(lX), rw(lZ)) as t: + with ctx.task(lX.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX, dZ = t.numba_arguments() axpy[32, 64, nb_stream](2.0, dX, dZ) - with ctx.task(read(lY), rw(lZ)) as t: + with ctx.task(lY.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dY, dZ = t.numba_arguments() axpy[32, 64, nb_stream](2.0, dY, dZ) @@ -170,11 +163,11 @@ def test_numba2d(): u = np.sin(x)[:, None] * np.cos(y)[None, :] # shape = (nx, ny) u_out = np.zeros_like(u) - ctx = context() + ctx = stf.context() lu = ctx.logical_data(u) lu_out = ctx.logical_data(u_out) - with ctx.task(read(lu), write(lu_out)) as t: + with ctx.task(lu.read(), lu_out.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) du = t.get_arg_numba(0) du_out = t.get_arg_numba(1) @@ -213,18 +206,18 @@ def test_numba_exec_place(): Y = np.ones(16, dtype=np.float32) Z = np.ones(16, dtype=np.float32) - ctx = context() + ctx = stf.context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) - with ctx.task(exec_place.device(0), lX.rw()) as t: + with ctx.task(stf.exec_place.device(0), lX.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) # dX = t.get_arg_numba(0) dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) scale[32, 64, nb_stream](2.0, dX) - with ctx.task(exec_place.device(0), lX.read(), lY.rw()) as t: + with ctx.task(stf.exec_place.device(0), lX.read(), lY.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) print(nb_stream) dX = t.get_arg_numba(0) @@ -232,14 +225,16 @@ def test_numba_exec_place(): axpy[32, 64, nb_stream](2.0, dX, dY) with ctx.task( - exec_place.device(0), lX.read(data_place.managed()), lZ.rw(data_place.managed()) + stf.exec_place.device(0), + lX.read(stf.data_place.managed()), + 
lZ.rw(stf.data_place.managed()), ) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dZ) - with ctx.task(exec_place.device(0), lY.read(), lZ.rw()) as t: + with ctx.task(stf.exec_place.device(0), lY.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dY = t.get_arg_numba(0) dZ = t.get_arg_numba(1) @@ -255,7 +250,7 @@ def test_numba_places(): Y = np.ones(16, dtype=np.float32) Z = np.ones(16, dtype=np.float32) - ctx = context() + ctx = stf.context() lX = ctx.logical_data(X) lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) @@ -272,13 +267,13 @@ def test_numba_places(): dY = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dY) - with ctx.task(exec_place.device(1), lX.read(), lZ.rw()) as t: + with ctx.task(stf.exec_place.device(1), lX.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) dZ = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dZ) - with ctx.task(lY.read(), lZ.rw(data_place.device(1))) as t: + with ctx.task(lY.read(), lZ.rw(stf.data_place.device(1))) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dY = t.get_arg_numba(0) dZ = t.get_arg_numba(1) diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index 16d0ec0e055..b4155c8b46b 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -2,11 +2,11 @@ import numpy as np from numba import cuda +import cuda.stf as cudastf + numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -import cuda.stf as cudastf - @cudastf.jit def laplacian_5pt_kernel(u_in, u_out, dx, dy): From c0d3592426a91df9bba6fae07eafeecdd5b672ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 8 Oct 2025 10:13:35 +0200 Subject: [PATCH 191/221] Save WIP on this warp example --- .../cuda_cccl/tests/stf/example_fluid_warp.py | 344 +++--------------- 1 file changed, 53 insertions(+), 291 deletions(-) diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py index 66fc1f4bc7c..72c33e860de 100644 --- a/python/cuda_cccl/tests/stf/example_fluid_warp.py +++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py @@ -85,188 +85,57 @@ def _stf_call(*args, dim=None, stream=None, **kwargs): def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): - # Process STF dependencies and extract arrays for wp.launch - processed_inputs = [] - stf_dependencies = [] - - # Process inputs to separate STF dependencies from regular arguments - if inputs: - for i, inp in enumerate(inputs): - # Check if input is STF dependency wrapper - if isinstance(inp, STFDependency): - # Extract STF dependency information - stf_dependencies.append( - { - "index": i, - "array": inp.array, - "mode": inp.mode, - "data_place": inp.data_place, - } - ) - # Add unwrapped array to processed inputs - processed_inputs.append(inp.array) - else: - processed_inputs.append(inp) - - # STF launch REQUIRES STF dependencies - otherwise use regular wp.launch - if not stf_dependencies: - raise ValueError( - "wp.stf.launch() requires STF dependencies (wp.stf.read/write/rw). " - f"Found {len(inputs)} inputs but none are STF dependencies. " - "Either use regular wp.launch() or wrap arrays with wp.stf.read/write/rw(array)." 
- ) - - # Print tracing information (controlled by STF_TRACE_ENABLED) - _trace_stf_launch(kernel, dim, stream, inputs, kwargs, stf_dependencies) - - # Extract the STF context from the first dependency - first_dep = stf_dependencies[0] - stf_ctx = first_dep["array"]._stf_ld.borrow_ctx_handle() - - # Create STF dependency objects for the task - stf_task_deps = [] - for dep in stf_dependencies: - stf_ld = dep["array"]._stf_ld - if dep["mode"] == "read": - stf_task_deps.append(stf_ld.read()) - elif dep["mode"] == "write": - stf_task_deps.append(stf_ld.write()) - elif dep["mode"] == "rw": - stf_task_deps.append(stf_ld.rw()) - - # Create and execute STF task - with stf_ctx.task(*stf_task_deps) as stf_task: - # Get raw CUDA stream pointer from STF task - stf_stream_ptr = stf_task.stream_ptr() - - # Import PyTorch for stream conversion - import torch - - # Get the current CUDA device for PyTorch - warp_device = wp.get_device() - device_id = warp_device.ordinal # Get device number (e.g., 0 for cuda:0) - torch_device = torch.device(f"cuda:{device_id}") - - # Create PyTorch ExternalStream from STF stream pointer with explicit device - torch_stream = torch.cuda.ExternalStream(stf_stream_ptr, device=torch_device) - - # Convert PyTorch stream to Warp stream - warp_stream = wp.stream_from_torch(torch_stream) - - # Launch with properly wrapped STF stream - return wp.launch( - kernel, - dim=dim, - inputs=processed_inputs, - stream=warp_stream, - **kwargs, - ) - - -# STF tracing configuration -STF_TRACE_ENABLED = True # Set to False to disable STF tracing - - -def set_stf_trace(enabled: bool): - """Enable or disable STF tracing output. - - Args: - enabled: True to enable tracing, False to disable - """ - global STF_TRACE_ENABLED - STF_TRACE_ENABLED = enabled - - -def get_stf_trace() -> bool: - """Get current STF tracing state. 
- - Returns: - True if tracing is enabled, False otherwise - """ - return STF_TRACE_ENABLED - - -def _trace_stf_launch(kernel, dim, stream, inputs, kwargs, stf_dependencies): - """Print STF launch tracing information if enabled.""" - if not STF_TRACE_ENABLED: - return - print(f"[STF TRACE] launching kernel: {getattr(kernel, '__name__', kernel)}") print(f" dim = {dim}") print(f" stream = {stream}") - # Enhanced input display with STF dependency detection + # Enhanced input display with logical data detection if inputs: print(" inputs = [") for i, inp in enumerate(inputs): - # Check if input is STF dependency wrapper - if isinstance(inp, STFDependency): - # Get symbol for display (STF deps ALWAYS have _stf_ld) - symbol = None - if hasattr(inp.array._stf_ld, "symbol") and inp.array._stf_ld.symbol: - symbol = inp.array._stf_ld.symbol - elif hasattr(inp.array, "_name") and inp.array._name: - symbol = inp.array._name + # Detect if input is or contains STF logical data + is_logical_data = False + symbol = None + # Check if inp is directly STF logical data + if hasattr(inp, "__class__") and "logical_data" in str(type(inp)): + is_logical_data = True + if hasattr(inp, "symbol") and inp.symbol: + symbol = inp.symbol + # Check if inp has attached STF logical data (Warp array) + elif hasattr(inp, "_stf_ld"): + is_logical_data = True + if hasattr(inp._stf_ld, "symbol") and inp._stf_ld.symbol: + symbol = inp._stf_ld.symbol + # Fallback to _name for Warp arrays + elif hasattr(inp, "_name") and inp._name: + symbol = inp._name + + if is_logical_data: if symbol: - print(f" [{i}]: '{symbol}' [{inp.mode}] [stf_dep]") + print(f" [{i}]: '{symbol}' [logical_data]") else: - print(f" [{i}]: logical_data [{inp.mode}] [stf_dep]") - + print(f" [{i}]: logical_data") else: - # Regular input - detect logical data for display - is_logical_data = False - symbol = None - - # Check if inp is directly STF logical data - if hasattr(inp, "__class__") and "logical_data" in str(type(inp)): - is_logical_data = True - if hasattr(inp, "symbol") and inp.symbol: - symbol = inp.symbol - # Check if inp has attached STF logical data (Warp array) - elif hasattr(inp, "_stf_ld"): - is_logical_data = True - if hasattr(inp._stf_ld, "symbol") and inp._stf_ld.symbol: - symbol = inp._stf_ld.symbol - # Fallback to _name for Warp arrays - elif hasattr(inp, "_name") and inp._name: - symbol = inp._name - - if is_logical_data: - if symbol: - print(f" [{i}]: '{symbol}' [logical_data]") - else: - print(f" [{i}]: logical_data") - else: - # Regular arguments (scalars, etc.) - if hasattr(inp, "shape"): # Array-like but not logical data - print(f" [{i}]: {type(inp).__name__}") - else: # Scalar value - print(f" [{i}]: {inp}") - + # Regular arguments (scalars, etc.) 
+ if hasattr(inp, "shape"): # Array-like but not logical data + print(f" [{i}]: {type(inp).__name__}") + else: # Scalar value + print(f" [{i}]: {inp}") print(" ]") else: print(f" inputs = {inputs}") - # Show STF dependency summary - if stf_dependencies: - print(" stf_deps = [") - for dep in stf_dependencies: - # All STF dependencies are guaranteed to have _stf_ld - symbol = None - if hasattr(dep["array"]._stf_ld, "symbol") and dep["array"]._stf_ld.symbol: - symbol = dep["array"]._stf_ld.symbol - elif hasattr(dep["array"], "_name") and dep["array"]._name: - symbol = dep["array"]._name - - if symbol: - print(f" {dep['mode'].upper()}: '{symbol}'") - else: - print(f" {dep['mode'].upper()}: logical_data") - print(" ]") - print(f" kwargs = {kwargs}") - print(" → Creating STF task with dependencies") + + # just forward to warp for now + return wp.launch( + kernel, + dim=dim, + inputs=inputs, + stream=stream, + **kwargs, + ) # put it under wp.stf @@ -278,72 +147,11 @@ class _stf: wp.stf = _stf() -# STF dependency wrapper class -class STFDependency: - """Wrapper for STF task dependencies with access mode specification.""" - - def __init__(self, array, mode, data_place=None): - # CRITICAL: STF dependencies MUST have logical data attached - if not hasattr(array, "_stf_ld"): - raise ValueError( - f"STF dependency requires array with logical data (_stf_ld). " - f"Array {type(array).__name__} does not have STF logical data. " - f"Create logical data first: array._stf_ld = ctx.logical_data(array, data_place)" - ) - - self.array = array - self.mode = mode # 'read', 'write', 'rw' - self.data_place = data_place - - def __repr__(self): - symbol = None - if hasattr(self.array._stf_ld, "symbol"): - symbol = self.array._stf_ld.symbol - elif hasattr(self.array, "_name"): - symbol = self.array._name - - if symbol: - return f"STFDependency('{symbol}', {self.mode})" - else: - return f"STFDependency({type(self.array).__name__}, {self.mode})" - - -def stf_read(array, data_place=None): - """Mark array as read-only dependency for STF task. - - REQUIRES: array must have _stf_ld (STF logical data) attached. - """ - return STFDependency(array, "read", data_place) - - -def stf_write(array, data_place=None): - """Mark array as write-only dependency for STF task. - - REQUIRES: array must have _stf_ld (STF logical data) attached. - """ - return STFDependency(array, "write", data_place) - - -def stf_rw(array, data_place=None): - """Mark array as read-write dependency for STF task. - - REQUIRES: array must have _stf_ld (STF logical data) attached. 
- """ - return STFDependency(array, "rw", data_place) - - wp.stf.kernel = stf_kernel wp.stf.launch = stf_launch -wp.stf.read = stf_read -wp.stf.write = stf_write -wp.stf.rw = stf_rw -# STF tracing control functions -wp.stf.set_trace = set_stf_trace -wp.stf.get_trace = get_stf_trace - -grid_width = wp.constant(256 * 4) -grid_height = wp.constant(128 * 4) +grid_width = wp.constant(256) +grid_height = wp.constant(128) @wp.func @@ -498,17 +306,12 @@ class Example: def __init__(self): fps = 60 self.frame_dt = 1.0 / fps - self.sim_substeps = 10 + self.sim_substeps = 2 self.iterations = 100 # Number of pressure iterations self.sim_dt = self.frame_dt / self.sim_substeps self.sim_time = 0.0 - # Create STF context for task-based scheduling - # This enables automatic dependency management and stream orchestration - import torch - - torch.cuda.init() - self._stf_ctx = cudastf.context() # use_graph=True) + self._stf_ctx = cudastf.context() shape = (grid_width, grid_height) @@ -534,6 +337,8 @@ def __init__(self): self.p1._stf_ld = self._stf_ctx.logical_data(self.p1, device_place) self.div._stf_ld = self._stf_ctx.logical_data(self.div, device_place) + # vec2 arrays - STF now automatically handles vector type flattening + # Store STF logical data consistently with other arrays self.u0._stf_ld = self._stf_ctx.logical_data(self.u0, device_place) self.u1._stf_ld = self._stf_ctx.logical_data(self.u1, device_place) print( @@ -562,7 +367,7 @@ def __init__(self): self.div._name = "div" # capture pressure solve as a CUDA graph - self.use_cuda_graph = False # wp.get_device().is_cuda + self.use_cuda_graph = wp.get_device().is_cuda if self.use_cuda_graph: with wp.ScopedCapture() as capture: self.pressure_iterations() @@ -579,67 +384,30 @@ def step(self): vel = wp.vec2(math.cos(angle) * speed, math.sin(angle) * speed) # update emitters - wp.stf.launch( - init, - dim=shape, - inputs=[ - wp.stf.write(self.rho0), - wp.stf.write(self.u0), - 5, - vel, - ], - ) + wp.stf.launch(init, dim=shape, inputs=[self.rho0, self.u0, 5, vel]) # force integrate - wp.stf.launch( - integrate, - dim=shape, - inputs=[ - wp.stf.rw(self.u0), - wp.stf.rw(self.rho0), - dt, - ], - ) - wp.stf.launch( - divergence, - dim=shape, - inputs=[ - wp.stf.read(self.u0), - wp.stf.write(self.div), - ], - ) + wp.stf.launch(integrate, dim=shape, inputs=[self.u0, self.rho0, dt]) + wp.stf.launch(divergence, dim=shape, inputs=[self.u0, self.div]) # pressure solve - # TODO tasks ? 
self.p0.zero_() self.p1.zero_() - if self.use_cuda_graph: - wp.capture_launch(self.graph) - else: - self.pressure_iterations() + # if self.use_cuda_graph: + # wp.capture_launch(self.graph) + # else: + # self.pressure_iterations() + self.pressure_iterations() # velocity update - wp.stf.launch( - pressure_apply, - dim=shape, - inputs=[ - wp.stf.read(self.p0), - wp.stf.rw(self.u0), - ], - ) + wp.stf.launch(pressure_apply, dim=shape, inputs=[self.p0, self.u0]) # semi-Lagrangian advection wp.stf.launch( advect, dim=shape, - inputs=[ - wp.stf.read(self.u0), - wp.stf.write(self.u1), - wp.stf.read(self.rho0), - wp.stf.write(self.rho1), - dt, - ], + inputs=[self.u0, self.u1, self.rho0, self.rho1, dt], ) # swap buffers @@ -651,13 +419,7 @@ def step(self): def pressure_iterations(self): for _ in range(self.iterations): wp.stf.launch( - pressure_solve, - dim=self.p0.shape, - inputs=[ - wp.stf.read(self.p0), - wp.stf.write(self.p1), - wp.stf.read(self.div), - ], + pressure_solve, dim=self.p0.shape, inputs=[self.p0, self.p1, self.div] ) # swap pressure fields From eba61eb4d6286efb2b4b04623dc1972f1654df7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 8 Oct 2025 13:39:05 +0200 Subject: [PATCH 192/221] Add sanity checks to test the is_void_interface() API --- cudax/examples/stf/void_data_interface.cu | 4 ++++ cudax/test/stf/cpp/task_get_stream.cu | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/cudax/examples/stf/void_data_interface.cu b/cudax/examples/stf/void_data_interface.cu index 0340b16bf4a..bf429f23dfe 100644 --- a/cudax/examples/stf/void_data_interface.cu +++ b/cudax/examples/stf/void_data_interface.cu @@ -49,5 +49,9 @@ int main() return cuda_kernel_desc{dummy_kernel, 16, 128, 0}; }; + EXPECT(token.is_void_interface()); + EXPECT(token2.is_void_interface()); + EXPECT(token3.is_void_interface()); + ctx.finalize(); } diff --git a/cudax/test/stf/cpp/task_get_stream.cu b/cudax/test/stf/cpp/task_get_stream.cu index 89fa74e7490..2d6509e5a73 100644 --- a/cudax/test/stf/cpp/task_get_stream.cu +++ b/cudax/test/stf/cpp/task_get_stream.cu @@ -24,7 +24,8 @@ void test_stream() context ctx; auto token = ctx.token(); - auto t = ctx.task(token.write()); + EXPECT(token.is_void_interface()); + auto t = ctx.task(token.write()); t.start(); cudaStream_t s = t.get_stream(); EXPECT(s != nullptr); From e17c261c582e89d044c6fc97034b808bd9373813 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 8 Oct 2025 22:04:36 +0200 Subject: [PATCH 193/221] support tokens in python --- .../cuda_cccl/cuda/stf/_stf_bindings_impl.pyx | 59 +++++++++++-- python/cuda_cccl/tests/stf/test_token.py | 86 +++++++++++++++++++ 2 files changed, 137 insertions(+), 8 deletions(-) create mode 100644 python/cuda_cccl/tests/stf/test_token.py diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx index 099f6869c46..00b8dd39ed5 100644 --- a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx @@ -119,6 +119,8 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_logical_data_destroy(stf_logical_data_handle ld) void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle *to) + void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld); + ctypedef struct stf_task_handle_t ctypedef stf_task_handle_t* stf_task_handle void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) @@ -170,6 +172,7 @@ cdef class logical_data: cdef int _ndim cdef 
size_t _len cdef str _symbol # Store symbol for display purposes + cdef readonly bint _is_token # readonly makes it accessible from Python def __cinit__(self, context ctx=None, object buf=None, data_place dplace=None, shape=None, dtype=None): if ctx is None or buf is None: @@ -181,10 +184,12 @@ cdef class logical_data: self._shape = () self._ndim = 0 self._symbol = None + self._is_token = False return self._ctx = ctx._ctx self._symbol = None # Initialize symbol + self._is_token = False # Initialize token flag # Default to host data place if not specified (matches C++ API) if dplace is None: @@ -270,6 +275,12 @@ cdef class logical_data: stf_logical_data_destroy(self._ld) self._ld = NULL + def __repr__(self): + """Return a detailed string representation of the logical_data object.""" + return (f"logical_data(shape={self._shape}, dtype={self._dtype}, " + f"is_token={self._is_token}, symbol={self._symbol!r}, " + f"len={self._len}, ndim={self._ndim})") + @property def dtype(self): """Return the dtype of the logical data.""" @@ -305,6 +316,21 @@ cdef class logical_data: out._ndim = self._ndim out._len = self._len out._symbol = None # New object has no symbol initially + out._is_token = False + + return out + + @staticmethod + def token(context ctx): + cdef logical_data out = logical_data.__new__(logical_data) + out._ctx = ctx._ctx + out._dtype = None + out._shape = None + out._ndim = 0 + out._len = 0 + out._symbol = None # New object has no symbol initially + out._is_token = True + stf_token(ctx._ctx, &out._ld) return out @@ -320,6 +346,7 @@ cdef class logical_data: out._ndim = len(shape) out._len = math.prod(shape) * out._dtype.itemsize out._symbol = None # New object has no symbol initially + out._is_token = False stf_logical_data_empty(ctx._ctx, out._len, &out._ld) return out @@ -492,6 +519,9 @@ cdef class task: return s # cast pointer -> Py int def get_arg(self, index) -> int: + if self._lds_args[index]._is_token: + raise RuntimeError("cannot materialize a token argument") + cdef void *ptr = stf_task_get(self._t, index) return ptr @@ -508,10 +538,15 @@ cdef class task: return cai_to_numba(cai) def numba_arguments(self): - arg_cnt=len(self._lds_args) - if arg_cnt == 1: - return self.get_arg_numba(0) - return tuple(self.get_arg_numba(i) for i in range(arg_cnt)) + # Only include non-token arguments in the tuple + non_token_args = [self.get_arg_numba(i) for i in range(len(self._lds_args)) + if not self._lds_args[i]._is_token] + + if len(non_token_args) == 0: + return None + elif len(non_token_args) == 1: + return non_token_args[0] + return tuple(non_token_args) def get_arg_as_tensor(self, index): cai = self.get_arg_cai(index) @@ -522,10 +557,15 @@ cdef class task: return cai_to_torch(cai) def tensor_arguments(self): - arg_cnt=len(self._lds_args) - if arg_cnt == 1: - return self.get_arg_as_tensor(0) - return tuple(self.get_arg_as_tensor(i) for i in range(arg_cnt)) + # Only include non-token arguments in the tuple + non_token_args = [self.get_arg_as_tensor(i) for i in range(len(self._lds_args)) + if not self._lds_args[i]._is_token] + + if len(non_token_args) == 0: + return None + elif len(non_token_args) == 1: + return non_token_args[0] + return tuple(non_token_args) # ---- context‑manager helpers ------------------------------- def __enter__(self): @@ -830,6 +870,9 @@ cdef class context: dtype = np.float64 return self.logical_data_full(shape, 1.0, dtype, where, exec_place) + def token(self): + return logical_data.token(self) + def task(self, *args): """ Create a `task` diff --git 
a/python/cuda_cccl/tests/stf/test_token.py b/python/cuda_cccl/tests/stf/test_token.py new file mode 100644 index 00000000000..acef5e34f3e --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_token.py @@ -0,0 +1,86 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import numba +import numpy as np +from numba import cuda + +import cuda.stf as stf + +numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 +numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + + +def test_token(): + ctx = stf.context() + lX = ctx.token() + lY = ctx.token() + lZ = ctx.token() + + with ctx.task(lX.rw()): + pass + + with ctx.task(lX.read(), lY.rw()): + pass + + with ctx.task(lX.read(), lZ.rw()): + pass + + with ctx.task(lY.read(), lZ.rw()): + pass + + ctx.finalize() + + +@cuda.jit +def axpy(a, x, y): + start = cuda.grid(1) + stride = cuda.gridsize(1) + for i in range(start, x.size, stride): + y[i] = a * x[i] + y[i] + + +def test_numba_token(): + n = 1024 * 1024 + X = np.ones(n, dtype=np.float32) + Y = np.ones(n, dtype=np.float32) + + ctx = stf.context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + token = ctx.token() + + # Use a reasonable grid size - kernel loop will handle all elements + blocks = 32 + threads_per_block = 256 + + with ctx.task(lX.read(), lY.rw(), token.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + print(nb_stream) + dX = t.get_arg_numba(0) + dY = t.get_arg_numba(1) + axpy[blocks, threads_per_block, nb_stream](2.0, dX, dY) + + with ctx.task(lX.read(), lY.rw(), token.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + print(nb_stream) + dX, dY = t.numba_arguments() + axpy[blocks, threads_per_block, nb_stream](2.0, dX, dY) + + ctx.finalize() + + # Sanity checks: verify the results after finalize + # First task: Y = 2.0 * X + Y = 2.0 * 1.0 + 1.0 = 3.0 + # Second task: Y = 2.0 * X + Y = 2.0 * 1.0 + 3.0 = 5.0 + assert np.allclose(X, 1.0), f"X should still be 1.0 (read-only), but got {X[0]}" + assert np.allclose(Y, 5.0), ( + f"Y should be 5.0 after two axpy operations, but got {Y[0]}" + ) + print(f"✓ X = {X[0]} (expected 1.0)") + print(f"✓ Y = {Y[0]} (expected 5.0)") + + +if __name__ == "__main__": + print("Running CUDASTF examples...") + test_token() From ec9c9553f68ea70cb363e894baea8e50171dd3e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 8 Oct 2025 23:04:18 +0200 Subject: [PATCH 194/221] remove debug print --- python/cuda_cccl/tests/stf/example_fluid_warp.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py index 72c33e860de..c1d903b9be7 100644 --- a/python/cuda_cccl/tests/stf/example_fluid_warp.py +++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py @@ -341,13 +341,7 @@ def __init__(self): # Store STF logical data consistently with other arrays self.u0._stf_ld = self._stf_ctx.logical_data(self.u0, device_place) self.u1._stf_ld = self._stf_ctx.logical_data(self.u1, device_place) - print( - "✅ Successfully created vec2 STF logical data (automatically flattened by STF)!" 
- ) - print("✅ All arrays created with explicit data place specification!") - - # Set descriptive symbols for STF logical data (for enhanced tracing) self.rho0._stf_ld.set_symbol("density_current") self.rho1._stf_ld.set_symbol("density_next") self.p0._stf_ld.set_symbol("pressure_current") @@ -355,7 +349,6 @@ def __init__(self): self.div._stf_ld.set_symbol("velocity_divergence") self.u0._stf_ld.set_symbol("velocity_current") self.u1._stf_ld.set_symbol("velocity_next") - print("✅ Set descriptive symbols for STF logical data!") # Set Warp array names (for Warp tracing) self.u0._name = "u0" From 52f48230fa8fc5adfed25e8bcfc9179e85324f16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 8 Oct 2025 23:51:57 +0200 Subject: [PATCH 195/221] python cholesky with cupy --- .../cuda_cccl/tests/stf/example_cholesky.py | 529 ++++++++++++++++++ 1 file changed, 529 insertions(+) create mode 100755 python/cuda_cccl/tests/stf/example_cholesky.py diff --git a/python/cuda_cccl/tests/stf/example_cholesky.py b/python/cuda_cccl/tests/stf/example_cholesky.py new file mode 100755 index 00000000000..d696967c3e7 --- /dev/null +++ b/python/cuda_cccl/tests/stf/example_cholesky.py @@ -0,0 +1,529 @@ +#!/usr/bin/env python3 +""" +Python implementation of Cholesky decomposition using CUDA STF and CuPy (CUBLAS/CUSOLVER). + +This example demonstrates: +- Tiled matrix operations with STF logical data +- Integration of CuPy's CUBLAS and CUSOLVER functions with STF tasks +- Multi-device execution with automatic data placement +- Task-based parallelism for linear algebra operations + +Note: CUDASTF automatically manages device context within tasks via exec_place.device(). +There's no need to manually set the current device in task bodies - just use the STF stream. +""" + +import sys +import numpy as np +import cupy as cp +from cupyx.scipy import linalg as cp_linalg +import cuda.stf as stf + + +class TiledMatrix: + """ + Tiled matrix class that splits a matrix into blocks for parallel processing. + Each block is managed as an STF logical data object. + """ + + def __init__(self, ctx, nrows, ncols, block_rows, block_cols, is_symmetric=False, symbol="matrix", dtype=np.float64): + """ + Initialize a tiled matrix. 
+ + Args: + ctx: STF context + nrows: Total number of rows + ncols: Total number of columns + block_rows: Block size (rows) + block_cols: Block size (columns) + is_symmetric: If True, only stores lower triangular blocks + symbol: Name/symbol for the matrix + dtype: Data type (default: np.float64) + """ + self.ctx = ctx + self.symbol = symbol + self.dtype = dtype + + self.m = nrows + self.n = ncols + self.mb = block_rows + self.nb = block_cols + self.sym_matrix = is_symmetric + + assert self.m % self.mb == 0, f"nrows ({self.m}) must be divisible by block_rows ({self.mb})" + assert self.n % self.nb == 0, f"ncols ({self.n}) must be divisible by block_cols ({self.nb})" + + # Number of blocks + self.mt = self.m // self.mb + self.nt = self.n // self.nb + + # Allocate host memory (pinned for faster transfers) + self.h_array = cp.cuda.alloc_pinned_memory(self.m * self.n * np.dtype(dtype).itemsize) + self.h_array_np = np.frombuffer(self.h_array, dtype=dtype).reshape(self.m, self.n) + + # Create logical data handles for each block + self.handles = {} + + # Get available devices for mapping + self.ndevs = cp.cuda.runtime.getDeviceCount() + self.grid_p, self.grid_q = self._compute_device_grid(self.ndevs) + + print(f"[{symbol}] {self.m}x{self.n} matrix, {self.mt}x{self.nt} blocks of {self.mb}x{self.nb}") + print(f"[{symbol}] Using {self.ndevs} devices in {self.grid_p}x{self.grid_q} grid") + + # Create blocks + for colb in range(self.nt): + low_rowb = colb if self.sym_matrix else 0 + for rowb in range(low_rowb, self.mt): + # Get tile data from host array (using tiled storage) + tile_data = self._get_block_h(rowb, colb) + + # Create CuPy array on the preferred device + devid = self.get_preferred_devid(rowb, colb) + with cp.cuda.Device(devid): + # Create device array for this block + d_block = cp.asarray(tile_data) + + # Create STF logical data + device_place = stf.data_place.device(devid) + handle = self.ctx.logical_data(d_block, device_place) + handle.set_symbol(f"{symbol}_{rowb}_{colb}") + + self.handles[(rowb, colb)] = (handle, d_block) + + def _compute_device_grid(self, ndevs): + """Compute 2D device grid dimensions (as close to square as possible)""" + grid_p = 1 + grid_q = ndevs + for a in range(1, int(np.sqrt(ndevs)) + 1): + if ndevs % a == 0: + grid_p = a + grid_q = ndevs // a + return grid_p, grid_q + + def get_preferred_devid(self, row, col): + """Get preferred device ID for a given block using cyclic distribution""" + return (row % self.grid_p) + (col % self.grid_q) * self.grid_p + + def handle(self, row, col): + """Get the logical data handle for block (row, col)""" + return self.handles[(row, col)][0] + + def device_array(self, row, col): + """Get the CuPy device array for block (row, col)""" + return self.handles[(row, col)][1] + + def _get_index(self, row, col): + """Convert (row, col) to linear index in tiled storage""" + # Find which tile contains this element + tile_row = row // self.mb + tile_col = col // self.nb + + tile_size = self.mb * self.nb + + # Index of the beginning of the tile + tile_start = (tile_row + self.mt * tile_col) * tile_size + + # Offset within the tile + offset = (row % self.mb) + (col % self.nb) * self.mb + + return tile_start + offset + + def _get_block_h(self, brow, bcol): + """Get a view of the host data for block (brow, bcol)""" + # For tiled storage, blocks are stored contiguously + start_idx = (brow + self.mt * bcol) * self.mb * self.nb + end_idx = start_idx + self.mb * self.nb + flat_view = self.h_array_np.ravel() + return 
flat_view[start_idx:end_idx].reshape(self.mb, self.nb) + + def fill(self, func): + """Fill matrix using a function func(row, col) -> value""" + print(f"[{self.symbol}] Filling matrix...") + + for colb in range(self.nt): + low_rowb = colb if self.sym_matrix else 0 + for rowb in range(low_rowb, self.mt): + devid = self.get_preferred_devid(rowb, colb) + handle = self.handle(rowb, colb) + d_array = self.device_array(rowb, colb) + + # Fill on host then copy to device + h_block = self._get_block_h(rowb, colb) + for lrow in range(self.mb): + for lcol in range(self.nb): + row = lrow + rowb * self.mb + col = lcol + colb * self.nb + h_block[lrow, lcol] = func(row, col) + + # Copy to device + with cp.cuda.Device(devid): + cp.copyto(d_array, cp.asarray(h_block)) + + def finalize(self): + """Copy all blocks back to host memory""" + print(f"[{self.symbol}] Finalizing (copying back to host)...") + for colb in range(self.nt): + low_rowb = colb if self.sym_matrix else 0 + for rowb in range(low_rowb, self.mt): + devid = self.get_preferred_devid(rowb, colb) + d_array = self.device_array(rowb, colb) + h_block = self._get_block_h(rowb, colb) + + with cp.cuda.Device(devid): + cp.copyto(h_block, cp.asnumpy(d_array)) + + +# BLAS/LAPACK operations wrapped in STF tasks + +def DPOTRF(ctx, A, row, col): + """Cholesky factorization of block (row, col) using CUSOLVER""" + handle = A.handle(row, col) + d_block = A.device_array(row, col) + devid = A.get_preferred_devid(row, col) + + with ctx.task(stf.exec_place.device(devid), handle.rw()) as t: + # STF automatically sets the current device, just use the stream + stream_ptr = t.stream_ptr() + cp_stream = cp.cuda.ExternalStream(stream_ptr) + + with cp_stream: + # Perform Cholesky factorization (lower triangular) - IN PLACE + # CuPy's cholesky returns L where A = L @ L.T + d_block[:] = cp.linalg.cholesky(d_block) + +def DTRSM(ctx, A, a_row, a_col, B, b_row, b_col, side='L', uplo='L', transa='T', diag='N', alpha=1.0): + """Triangular solve: B = alpha * op(A)^{-1} @ B or B = alpha * B @ op(A)^{-1}""" + handle_a = A.handle(a_row, a_col) + handle_b = B.handle(b_row, b_col) + d_a = A.device_array(a_row, a_col) + d_b = B.device_array(b_row, b_col) + devid = B.get_preferred_devid(b_row, b_col) + + with ctx.task(stf.exec_place.device(devid), handle_a.read(), handle_b.rw()) as t: + # STF automatically sets the current device + stream_ptr = t.stream_ptr() + cp_stream = cp.cuda.ExternalStream(stream_ptr) + + with cp_stream: + # Triangular solve using CuPy + # For side='L': solve op(A) @ X = B for X, then X = alpha * X + if side == 'L': + if transa == 'N': + # Solve A @ X = B where A is lower/upper triangular + d_b[:] = cp_linalg.solve_triangular(d_a, d_b, lower=(uplo == 'L')) + else: + # Solve A^T @ X = B where A is lower triangular + # This is equivalent to solving U @ X = B where U = A^T is upper + d_b[:] = cp_linalg.solve_triangular(d_a.T, d_b, lower=(uplo != 'L')) + if alpha != 1.0: + d_b *= alpha + else: + # For side='R': solve X @ op(A) = B + # Rewrite as op(A)^T @ X^T = B^T + if transa == 'N': + d_b[:] = cp_linalg.solve_triangular(d_a.T, d_b.T, lower=(uplo != 'L')).T + else: + d_b[:] = cp_linalg.solve_triangular(d_a, d_b.T, lower=(uplo == 'L')).T + if alpha != 1.0: + d_b *= alpha + +def DGEMM(ctx, A, a_row, a_col, B, b_row, b_col, C, c_row, c_col, + transa='N', transb='N', alpha=1.0, beta=1.0): + """Matrix multiplication: C = alpha * op(A) @ op(B) + beta * C""" + handle_a = A.handle(a_row, a_col) + handle_b = B.handle(b_row, b_col) + handle_c = C.handle(c_row, c_col) + d_a = 
A.device_array(a_row, a_col) + d_b = B.device_array(b_row, b_col) + d_c = C.device_array(c_row, c_col) + devid = C.get_preferred_devid(c_row, c_col) + + with ctx.task(stf.exec_place.device(devid), handle_a.read(), handle_b.read(), handle_c.rw()) as t: + # STF automatically sets the current device + stream_ptr = t.stream_ptr() + cp_stream = cp.cuda.ExternalStream(stream_ptr) + + with cp_stream: + # Apply transposes + op_a = d_a.T if transa != 'N' else d_a + op_b = d_b.T if transb != 'N' else d_b + + # C = alpha * op(A) @ op(B) + beta * C (IN PLACE) + if beta == 0.0: + d_c[:] = alpha * (op_a @ op_b) + elif beta == 1.0: + d_c[:] += alpha * (op_a @ op_b) + else: + d_c[:] = alpha * (op_a @ op_b) + beta * d_c + +def DSYRK(ctx, A, a_row, a_col, C, c_row, c_col, uplo='L', trans='N', alpha=1.0, beta=1.0): + """Symmetric rank-k update: C = alpha * op(A) @ op(A).T + beta * C""" + handle_a = A.handle(a_row, a_col) + handle_c = C.handle(c_row, c_col) + d_a = A.device_array(a_row, a_col) + d_c = C.device_array(c_row, c_col) + devid = C.get_preferred_devid(c_row, c_col) + + with ctx.task(stf.exec_place.device(devid), handle_a.read(), handle_c.rw()) as t: + # STF automatically sets the current device + stream_ptr = t.stream_ptr() + cp_stream = cp.cuda.ExternalStream(stream_ptr) + + with cp_stream: + # Apply transpose + op_a = d_a.T if trans != 'N' else d_a + + # C = alpha * op(A) @ op(A).T + beta * C (IN PLACE) + if beta == 0.0: + d_c[:] = alpha * (op_a @ op_a.T) + elif beta == 1.0: + d_c[:] += alpha * (op_a @ op_a.T) + else: + d_c[:] = alpha * (op_a @ op_a.T) + beta * d_c + + +# High-level algorithms + +def PDPOTRF(ctx, A): + """Parallel tiled Cholesky factorization (blocked algorithm)""" + print(f"\n[PDPOTRF] Starting Cholesky factorization...") + + assert A.m == A.n, "Matrix must be square" + assert A.mt == A.nt, "Block grid must be square" + assert A.sym_matrix, "Matrix must be symmetric" + + nblocks = A.mt + + for k in range(nblocks): + # Factor diagonal block + DPOTRF(ctx, A, k, k) + + # Solve triangular systems for blocks in column k + for row in range(k + 1, nblocks): + DTRSM(ctx, A, k, k, A, row, k, side='R', uplo='L', transa='T', diag='N', alpha=1.0) + + # Update trailing matrix + for col in range(k + 1, row): + DGEMM(ctx, A, row, k, A, col, k, A, row, col, transa='N', transb='T', alpha=-1.0, beta=1.0) + + # Symmetric rank-k update of diagonal block + DSYRK(ctx, A, row, k, A, row, row, uplo='L', trans='N', alpha=-1.0, beta=1.0) + + print(f"[PDPOTRF] Completed") + +def PDTRSM(ctx, A, B, side='L', uplo='L', trans='N', diag='N', alpha=1.0): + """Parallel tiled triangular solve""" + print(f"\n[PDTRSM] Starting triangular solve...") + + if side == 'L': + if uplo == 'L': + if trans == 'N': + # Forward substitution + for k in range(B.mt): + lalpha = alpha if k == 0 else 1.0 + for n in range(B.nt): + DTRSM(ctx, A, k, k, B, k, n, side='L', uplo='L', transa='N', diag=diag, alpha=lalpha) + for m in range(k + 1, B.mt): + for n in range(B.nt): + DGEMM(ctx, A, m, k, B, k, n, B, m, n, transa='N', transb='N', alpha=-1.0, beta=lalpha) + else: # trans == 'T' or 'C' + # Backward substitution + for k in range(B.mt): + lalpha = alpha if k == 0 else 1.0 + for n in range(B.nt): + DTRSM(ctx, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n, + side='L', uplo='L', transa='T', diag=diag, alpha=lalpha) + for m in range(k + 1, B.mt): + for n in range(B.nt): + DGEMM(ctx, A, B.mt - k - 1, B.mt - 1 - m, B, B.mt - k - 1, n, + B, B.mt - 1 - m, n, transa='T', transb='N', alpha=-1.0, beta=lalpha) + + print(f"[PDTRSM] Completed") 
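+
+# The tiled routines above express the classic right-looking blocked Cholesky
+# (POTRF on the diagonal tile, TRSM on the panel, then GEMM/SYRK trailing
+# updates) as STF tasks. The helper below is a minimal, host-only NumPy sketch
+# of the same tile-level math, kept purely as a reference: it is not called by
+# the task-based path, ignores streams and devices, and assumes a square
+# symmetric positive-definite matrix whose size is a multiple of the tile size.
+def _reference_blocked_cholesky(M, nb):
+    """Return the lower Cholesky factor of M, computed tile by tile (reference only)."""
+    nt = M.shape[0] // nb
+    A = [[M[i * nb:(i + 1) * nb, j * nb:(j + 1) * nb].copy() for j in range(nt)]
+         for i in range(nt)]
+    for k in range(nt):
+        A[k][k] = np.linalg.cholesky(A[k][k])  # POTRF on the diagonal tile
+        for i in range(k + 1, nt):
+            # TRSM: A_ik <- A_ik @ L_kk^{-T} (dense solve used for clarity, not speed)
+            A[i][k] = np.linalg.solve(A[k][k], A[i][k].T).T
+        for i in range(k + 1, nt):
+            for j in range(k + 1, i):
+                A[i][j] -= A[i][k] @ A[j][k].T  # GEMM trailing update
+            A[i][i] -= A[i][k] @ A[i][k].T  # SYRK update of the diagonal tile
+    return np.tril(np.block(A))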
+ +def PDPOTRS(ctx, A, B, uplo='L'): + """Solve A @ X = B where A is factored by Cholesky (A = L @ L.T)""" + print(f"\n[PDPOTRS] Solving linear system...") + + # First solve: L @ Y = B + PDTRSM(ctx, A, B, side='L', uplo=uplo, trans='N' if uplo == 'L' else 'T', diag='N', alpha=1.0) + + # Second solve: L.T @ X = Y + PDTRSM(ctx, A, B, side='L', uplo=uplo, trans='T' if uplo == 'L' else 'N', diag='N', alpha=1.0) + + print(f"[PDPOTRS] Completed") + +def PDGEMM(ctx, A, B, C, transa='N', transb='N', alpha=1.0, beta=1.0): + """Parallel tiled matrix multiplication""" + print(f"\n[PDGEMM] Starting matrix multiplication...") + + for m in range(C.mt): + for n in range(C.nt): + inner_k = A.nt if transa == 'N' else A.mt + + if alpha == 0.0 or inner_k == 0: + # Just scale C + DGEMM(ctx, A, 0, 0, B, 0, 0, C, m, n, transa=transa, transb=transb, alpha=0.0, beta=beta) + elif transa == 'N': + if transb == 'N': + for k in range(A.nt): + zbeta = beta if k == 0 else 1.0 + DGEMM(ctx, A, m, k, B, k, n, C, m, n, transa='N', transb='N', alpha=alpha, beta=zbeta) + else: + for k in range(A.nt): + zbeta = beta if k == 0 else 1.0 + DGEMM(ctx, A, m, k, B, n, k, C, m, n, transa='N', transb='T', alpha=alpha, beta=zbeta) + else: + if transb == 'N': + for k in range(A.mt): + zbeta = beta if k == 0 else 1.0 + DGEMM(ctx, A, k, m, B, k, n, C, m, n, transa='T', transb='N', alpha=alpha, beta=zbeta) + else: + for k in range(A.mt): + zbeta = beta if k == 0 else 1.0 + DGEMM(ctx, A, k, m, B, n, k, C, m, n, transa='T', transb='T', alpha=alpha, beta=zbeta) + + print(f"[PDGEMM] Completed") + +def compute_norm(matrix): + """Compute Frobenius norm of matrix""" + norm_sq = 0.0 + for colb in range(matrix.nt): + low_rowb = colb if matrix.sym_matrix else 0 + for rowb in range(low_rowb, matrix.mt): + d_block = matrix.device_array(rowb, colb) + norm_sq += float(cp.sum(d_block * d_block)) + return np.sqrt(norm_sq) + + +def main(): + import argparse + + parser = argparse.ArgumentParser(description='Tiled Cholesky decomposition with CUDA STF') + parser.add_argument('N', type=int, nargs='?', default=1024, help='Matrix size (default: 1024)') + parser.add_argument('NB', type=int, nargs='?', default=128, help='Block size (default: 128)') + parser.add_argument('--check', action='store_true', help='Check result (slower)') + args = parser.parse_args() + + N = args.N + NB = args.NB + check_result = args.check + + assert N % NB == 0, f"Matrix size {N} must be divisible by block size {NB}" + + print("="*60) + print("Tiled Cholesky Decomposition with CUDA STF + CuPy") + print("="*60) + print(f"Matrix size: {N}x{N}") + print(f"Block size: {NB}x{NB}") + print(f"Number of blocks: {N//NB}x{N//NB}") + print(f"Check result: {check_result}") + print("="*60) + + # Create STF context + ctx = stf.context() + + # Create matrices + A = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=True, symbol="A") + + if check_result: + Aref = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=False, symbol="Aref") + + # Fill with Hilbert matrix + diagonal dominance + # H_{i,j} = 1/(i+j+1) + 2*N if i==j + def hilbert(row, col): + return 1.0 / (row + col + 1.0) + (2.0 * N if row == col else 0.0) + + print("\n" + "="*60) + print("Initializing matrices...") + print("="*60) + + A.fill(hilbert) + if check_result: + Aref.fill(hilbert) + + # Create right-hand side + if check_result: + B = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="B") + Bref = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="Bref") + + def rhs_vals(row, col): + return 1.0 * (row + 1) + + B.fill(rhs_vals) + 
Bref.fill(rhs_vals) + + # Compute ||B|| for residual calculation + Bref_norm = compute_norm(Bref) + + # Synchronize before timing + cp.cuda.runtime.deviceSynchronize() + + # Record start time + start_event = cp.cuda.Event() + stop_event = cp.cuda.Event() + start_event.record() + + # Perform Cholesky factorization + print("\n" + "="*60) + print("Performing Cholesky factorization...") + print("="*60) + PDPOTRF(ctx, A) + + # Record stop time + stop_event.record() + + # Solve system if checking + if check_result: + print("\n" + "="*60) + print("Solving linear system...") + print("="*60) + PDPOTRS(ctx, A, B, uplo='L') + + print("\n" + "="*60) + print("Computing residual...") + print("="*60) + # Compute residual: Bref = Aref @ B - Bref + PDGEMM(ctx, Aref, B, Bref, transa='N', transb='N', alpha=1.0, beta=-1.0) + + # Compute ||residual|| + res_norm = compute_norm(Bref) + + # Finalize STF context + print("\n" + "="*60) + print("Finalizing STF context...") + print("="*60) + ctx.finalize() + + # Wait for completion + stop_event.synchronize() + + # Compute timing + elapsed_ms = cp.cuda.get_elapsed_time(start_event, stop_event) + gflops = (1.0/3.0 * N * N * N) / 1e9 + gflops_per_sec = gflops / (elapsed_ms / 1000.0) + + print("\n" + "="*60) + print("Results") + print("="*60) + print(f"[PDPOTRF] Elapsed time: {elapsed_ms:.2f} ms") + print(f"[PDPOTRF] Performance: {gflops_per_sec:.2f} GFLOPS") + + if check_result: + residual = res_norm / Bref_norm + print(f"\n[POTRS] ||AX - B||: {res_norm:.6e}") + print(f"[POTRS] ||B||: {Bref_norm:.6e}") + print(f"[POTRS] Residual (||AX - B||/||B||): {residual:.6e}") + + if residual >= 0.01: + print("\n❌ Algorithm did not converge (residual >= 0.01)") + return 1 + else: + print("\n✅ Algorithm converged successfully!") + + print("="*60) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) + From 5a3288188cb2ad0561a9d4db44be93e6b7a5615b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 9 Oct 2025 00:26:16 +0200 Subject: [PATCH 196/221] improve cholesky example --- .../cuda_cccl/tests/stf/example_cholesky.py | 737 ++++++++++++------ 1 file changed, 490 insertions(+), 247 deletions(-) diff --git a/python/cuda_cccl/tests/stf/example_cholesky.py b/python/cuda_cccl/tests/stf/example_cholesky.py index d696967c3e7..faef64d1d73 100755 --- a/python/cuda_cccl/tests/stf/example_cholesky.py +++ b/python/cuda_cccl/tests/stf/example_cholesky.py @@ -13,22 +13,84 @@ """ import sys -import numpy as np + import cupy as cp +import numpy as np from cupyx.scipy import linalg as cp_linalg + import cuda.stf as stf +class CAIWrapper: + """Wrapper to expose CUDA Array Interface dict as a proper CAI object.""" + + def __init__(self, cai_dict): + self.__cuda_array_interface__ = cai_dict + + +def get_cupy_arrays(task): + """ + Get all CuPy arrays from STF task arguments. 
+ + Usage: + d_a, d_b, d_c = get_cupy_arrays(t) + """ + arrays = [] + idx = 0 + while True: + try: + arrays.append(cp.asarray(CAIWrapper(task.get_arg_cai(idx)))) + idx += 1 + except: + break + return tuple(arrays) if len(arrays) > 1 else arrays[0] if arrays else None + + +def cai_to_numpy(cai_dict): + """Convert CUDA Array Interface dict to NumPy array (for host memory).""" + import ctypes + + # Extract CAI fields + data_ptr, readonly = cai_dict["data"] + shape = cai_dict["shape"] + typestr = cai_dict["typestr"] + + # Convert typestr to NumPy dtype + dtype = np.dtype(typestr) + + # Calculate total size in bytes + itemsize = dtype.itemsize + size = np.prod(shape) * itemsize + + # Create ctypes buffer from pointer + buffer = (ctypes.c_byte * size).from_address(data_ptr) + + # Create NumPy array from buffer + arr = np.frombuffer(buffer, dtype=dtype).reshape(shape) + + return arr + + class TiledMatrix: """ Tiled matrix class that splits a matrix into blocks for parallel processing. Each block is managed as an STF logical data object. """ - - def __init__(self, ctx, nrows, ncols, block_rows, block_cols, is_symmetric=False, symbol="matrix", dtype=np.float64): + + def __init__( + self, + ctx, + nrows, + ncols, + block_rows, + block_cols, + is_symmetric=False, + symbol="matrix", + dtype=np.float64, + ): """ Initialize a tiled matrix. - + Args: ctx: STF context nrows: Total number of rows @@ -42,54 +104,49 @@ def __init__(self, ctx, nrows, ncols, block_rows, block_cols, is_symmetric=False self.ctx = ctx self.symbol = symbol self.dtype = dtype - + self.m = nrows self.n = ncols self.mb = block_rows self.nb = block_cols self.sym_matrix = is_symmetric - - assert self.m % self.mb == 0, f"nrows ({self.m}) must be divisible by block_rows ({self.mb})" - assert self.n % self.nb == 0, f"ncols ({self.n}) must be divisible by block_cols ({self.nb})" - + + assert self.m % self.mb == 0, ( + f"nrows ({self.m}) must be divisible by block_rows ({self.mb})" + ) + assert self.n % self.nb == 0, ( + f"ncols ({self.n}) must be divisible by block_cols ({self.nb})" + ) + # Number of blocks self.mt = self.m // self.mb self.nt = self.n // self.nb - + # Allocate host memory (pinned for faster transfers) - self.h_array = cp.cuda.alloc_pinned_memory(self.m * self.n * np.dtype(dtype).itemsize) - self.h_array_np = np.frombuffer(self.h_array, dtype=dtype).reshape(self.m, self.n) - + self.h_array = cp.cuda.alloc_pinned_memory( + self.m * self.n * np.dtype(dtype).itemsize + ) + self.h_array_np = np.frombuffer(self.h_array, dtype=dtype).reshape( + self.m, self.n + ) + # Create logical data handles for each block self.handles = {} - + # Get available devices for mapping self.ndevs = cp.cuda.runtime.getDeviceCount() self.grid_p, self.grid_q = self._compute_device_grid(self.ndevs) - - print(f"[{symbol}] {self.m}x{self.n} matrix, {self.mt}x{self.nt} blocks of {self.mb}x{self.nb}") - print(f"[{symbol}] Using {self.ndevs} devices in {self.grid_p}x{self.grid_q} grid") - - # Create blocks - for colb in range(self.nt): - low_rowb = colb if self.sym_matrix else 0 - for rowb in range(low_rowb, self.mt): - # Get tile data from host array (using tiled storage) - tile_data = self._get_block_h(rowb, colb) - - # Create CuPy array on the preferred device - devid = self.get_preferred_devid(rowb, colb) - with cp.cuda.Device(devid): - # Create device array for this block - d_block = cp.asarray(tile_data) - - # Create STF logical data - device_place = stf.data_place.device(devid) - handle = self.ctx.logical_data(d_block, device_place) - 
handle.set_symbol(f"{symbol}_{rowb}_{colb}") - - self.handles[(rowb, colb)] = (handle, d_block) - + + print( + f"[{symbol}] {self.m}x{self.n} matrix, {self.mt}x{self.nt} blocks of {self.mb}x{self.nb}" + ) + print( + f"[{symbol}] Using {self.ndevs} devices in {self.grid_p}x{self.grid_q} grid" + ) + + # Note: We DON'T create logical data here yet - that happens in fill() + # after the host data is initialized + def _compute_device_grid(self, ndevs): """Compute 2D device grid dimensions (as close to square as possible)""" grid_p = 1 @@ -99,35 +156,31 @@ def _compute_device_grid(self, ndevs): grid_p = a grid_q = ndevs // a return grid_p, grid_q - + def get_preferred_devid(self, row, col): """Get preferred device ID for a given block using cyclic distribution""" return (row % self.grid_p) + (col % self.grid_q) * self.grid_p - + def handle(self, row, col): """Get the logical data handle for block (row, col)""" - return self.handles[(row, col)][0] - - def device_array(self, row, col): - """Get the CuPy device array for block (row, col)""" - return self.handles[(row, col)][1] - + return self.handles[(row, col)] + def _get_index(self, row, col): """Convert (row, col) to linear index in tiled storage""" # Find which tile contains this element tile_row = row // self.mb tile_col = col // self.nb - + tile_size = self.mb * self.nb - + # Index of the beginning of the tile tile_start = (tile_row + self.mt * tile_col) * tile_size - + # Offset within the tile offset = (row % self.mb) + (col % self.nb) * self.mb - + return tile_start + offset - + def _get_block_h(self, brow, bcol): """Get a view of the host data for block (brow, bcol)""" # For tiled storage, blocks are stored contiguously @@ -135,120 +188,114 @@ def _get_block_h(self, brow, bcol): end_idx = start_idx + self.mb * self.nb flat_view = self.h_array_np.ravel() return flat_view[start_idx:end_idx].reshape(self.mb, self.nb) - + def fill(self, func): - """Fill matrix using a function func(row, col) -> value""" - print(f"[{self.symbol}] Filling matrix...") - + """Fill matrix on host, then create STF logical data that will transfer automatically""" + print(f"[{self.symbol}] Filling matrix on host...") + for colb in range(self.nt): low_rowb = colb if self.sym_matrix else 0 for rowb in range(low_rowb, self.mt): - devid = self.get_preferred_devid(rowb, colb) - handle = self.handle(rowb, colb) - d_array = self.device_array(rowb, colb) - - # Fill on host then copy to device + # Fill host block h_block = self._get_block_h(rowb, colb) for lrow in range(self.mb): for lcol in range(self.nb): row = lrow + rowb * self.mb col = lcol + colb * self.nb h_block[lrow, lcol] = func(row, col) - - # Copy to device - with cp.cuda.Device(devid): - cp.copyto(d_array, cp.asarray(h_block)) - - def finalize(self): - """Copy all blocks back to host memory""" - print(f"[{self.symbol}] Finalizing (copying back to host)...") - for colb in range(self.nt): - low_rowb = colb if self.sym_matrix else 0 - for rowb in range(low_rowb, self.mt): - devid = self.get_preferred_devid(rowb, colb) - d_array = self.device_array(rowb, colb) - h_block = self._get_block_h(rowb, colb) - - with cp.cuda.Device(devid): - cp.copyto(h_block, cp.asnumpy(d_array)) + + handle = self.ctx.logical_data(h_block) + handle.set_symbol(f"{self.symbol}_{rowb}_{colb}") + + self.handles[(rowb, colb)] = handle # BLAS/LAPACK operations wrapped in STF tasks + def DPOTRF(ctx, A, row, col): """Cholesky factorization of block (row, col) using CUSOLVER""" handle = A.handle(row, col) - d_block = A.device_array(row, col) devid = 
A.get_preferred_devid(row, col) - + with ctx.task(stf.exec_place.device(devid), handle.rw()) as t: - # STF automatically sets the current device, just use the stream - stream_ptr = t.stream_ptr() - cp_stream = cp.cuda.ExternalStream(stream_ptr) - - with cp_stream: - # Perform Cholesky factorization (lower triangular) - IN PLACE - # CuPy's cholesky returns L where A = L @ L.T + d_block = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): d_block[:] = cp.linalg.cholesky(d_block) -def DTRSM(ctx, A, a_row, a_col, B, b_row, b_col, side='L', uplo='L', transa='T', diag='N', alpha=1.0): + +def DTRSM( + ctx, + A, + a_row, + a_col, + B, + b_row, + b_col, + side="L", + uplo="L", + transa="T", + diag="N", + alpha=1.0, +): """Triangular solve: B = alpha * op(A)^{-1} @ B or B = alpha * B @ op(A)^{-1}""" handle_a = A.handle(a_row, a_col) handle_b = B.handle(b_row, b_col) - d_a = A.device_array(a_row, a_col) - d_b = B.device_array(b_row, b_col) devid = B.get_preferred_devid(b_row, b_col) - + with ctx.task(stf.exec_place.device(devid), handle_a.read(), handle_b.rw()) as t: - # STF automatically sets the current device - stream_ptr = t.stream_ptr() - cp_stream = cp.cuda.ExternalStream(stream_ptr) - - with cp_stream: - # Triangular solve using CuPy - # For side='L': solve op(A) @ X = B for X, then X = alpha * X - if side == 'L': - if transa == 'N': - # Solve A @ X = B where A is lower/upper triangular - d_b[:] = cp_linalg.solve_triangular(d_a, d_b, lower=(uplo == 'L')) + d_a, d_b = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + if side == "L": + if transa == "N": + d_b[:] = cp_linalg.solve_triangular(d_a, d_b, lower=(uplo == "L")) else: - # Solve A^T @ X = B where A is lower triangular - # This is equivalent to solving U @ X = B where U = A^T is upper - d_b[:] = cp_linalg.solve_triangular(d_a.T, d_b, lower=(uplo != 'L')) + d_b[:] = cp_linalg.solve_triangular(d_a.T, d_b, lower=(uplo != "L")) if alpha != 1.0: d_b *= alpha else: - # For side='R': solve X @ op(A) = B - # Rewrite as op(A)^T @ X^T = B^T - if transa == 'N': - d_b[:] = cp_linalg.solve_triangular(d_a.T, d_b.T, lower=(uplo != 'L')).T + if transa == "N": + d_b[:] = cp_linalg.solve_triangular( + d_a.T, d_b.T, lower=(uplo != "L") + ).T else: - d_b[:] = cp_linalg.solve_triangular(d_a, d_b.T, lower=(uplo == 'L')).T + d_b[:] = cp_linalg.solve_triangular( + d_a, d_b.T, lower=(uplo == "L") + ).T if alpha != 1.0: d_b *= alpha -def DGEMM(ctx, A, a_row, a_col, B, b_row, b_col, C, c_row, c_col, - transa='N', transb='N', alpha=1.0, beta=1.0): + +def DGEMM( + ctx, + A, + a_row, + a_col, + B, + b_row, + b_col, + C, + c_row, + c_col, + transa="N", + transb="N", + alpha=1.0, + beta=1.0, +): """Matrix multiplication: C = alpha * op(A) @ op(B) + beta * C""" handle_a = A.handle(a_row, a_col) handle_b = B.handle(b_row, b_col) handle_c = C.handle(c_row, c_col) - d_a = A.device_array(a_row, a_col) - d_b = B.device_array(b_row, b_col) - d_c = C.device_array(c_row, c_col) devid = C.get_preferred_devid(c_row, c_col) - - with ctx.task(stf.exec_place.device(devid), handle_a.read(), handle_b.read(), handle_c.rw()) as t: - # STF automatically sets the current device - stream_ptr = t.stream_ptr() - cp_stream = cp.cuda.ExternalStream(stream_ptr) - - with cp_stream: - # Apply transposes - op_a = d_a.T if transa != 'N' else d_a - op_b = d_b.T if transb != 'N' else d_b - - # C = alpha * op(A) @ op(B) + beta * C (IN PLACE) + + with ctx.task( + stf.exec_place.device(devid), handle_a.read(), handle_b.read(), handle_c.rw() + ) as t: + d_a, d_b, d_c = 
get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + op_a = d_a.T if transa != "N" else d_a + op_b = d_b.T if transb != "N" else d_b + if beta == 0.0: d_c[:] = alpha * (op_a @ op_b) elif beta == 1.0: @@ -256,24 +303,20 @@ def DGEMM(ctx, A, a_row, a_col, B, b_row, b_col, C, c_row, c_col, else: d_c[:] = alpha * (op_a @ op_b) + beta * d_c -def DSYRK(ctx, A, a_row, a_col, C, c_row, c_col, uplo='L', trans='N', alpha=1.0, beta=1.0): + +def DSYRK( + ctx, A, a_row, a_col, C, c_row, c_col, uplo="L", trans="N", alpha=1.0, beta=1.0 +): """Symmetric rank-k update: C = alpha * op(A) @ op(A).T + beta * C""" handle_a = A.handle(a_row, a_col) handle_c = C.handle(c_row, c_col) - d_a = A.device_array(a_row, a_col) - d_c = C.device_array(c_row, c_col) devid = C.get_preferred_devid(c_row, c_col) - + with ctx.task(stf.exec_place.device(devid), handle_a.read(), handle_c.rw()) as t: - # STF automatically sets the current device - stream_ptr = t.stream_ptr() - cp_stream = cp.cuda.ExternalStream(stream_ptr) - - with cp_stream: - # Apply transpose - op_a = d_a.T if trans != 'N' else d_a - - # C = alpha * op(A) @ op(A).T + beta * C (IN PLACE) + d_a, d_c = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + op_a = d_a.T if trans != "N" else d_a + if beta == 0.0: d_c[:] = alpha * (op_a @ op_a.T) elif beta == 1.0: @@ -284,246 +327,446 @@ def DSYRK(ctx, A, a_row, a_col, C, c_row, c_col, uplo='L', trans='N', alpha=1.0, # High-level algorithms + def PDPOTRF(ctx, A): """Parallel tiled Cholesky factorization (blocked algorithm)""" - print(f"\n[PDPOTRF] Starting Cholesky factorization...") - + print("\n[PDPOTRF] Starting Cholesky factorization...") + assert A.m == A.n, "Matrix must be square" assert A.mt == A.nt, "Block grid must be square" assert A.sym_matrix, "Matrix must be symmetric" - + nblocks = A.mt - + for k in range(nblocks): # Factor diagonal block DPOTRF(ctx, A, k, k) - + # Solve triangular systems for blocks in column k for row in range(k + 1, nblocks): - DTRSM(ctx, A, k, k, A, row, k, side='R', uplo='L', transa='T', diag='N', alpha=1.0) - + DTRSM( + ctx, + A, + k, + k, + A, + row, + k, + side="R", + uplo="L", + transa="T", + diag="N", + alpha=1.0, + ) + # Update trailing matrix for col in range(k + 1, row): - DGEMM(ctx, A, row, k, A, col, k, A, row, col, transa='N', transb='T', alpha=-1.0, beta=1.0) - + DGEMM( + ctx, + A, + row, + k, + A, + col, + k, + A, + row, + col, + transa="N", + transb="T", + alpha=-1.0, + beta=1.0, + ) + # Symmetric rank-k update of diagonal block - DSYRK(ctx, A, row, k, A, row, row, uplo='L', trans='N', alpha=-1.0, beta=1.0) - - print(f"[PDPOTRF] Completed") + DSYRK( + ctx, A, row, k, A, row, row, uplo="L", trans="N", alpha=-1.0, beta=1.0 + ) + + print("[PDPOTRF] Completed") + -def PDTRSM(ctx, A, B, side='L', uplo='L', trans='N', diag='N', alpha=1.0): +def PDTRSM(ctx, A, B, side="L", uplo="L", trans="N", diag="N", alpha=1.0): """Parallel tiled triangular solve""" - print(f"\n[PDTRSM] Starting triangular solve...") - - if side == 'L': - if uplo == 'L': - if trans == 'N': + print("\n[PDTRSM] Starting triangular solve...") + + if side == "L": + if uplo == "L": + if trans == "N": # Forward substitution for k in range(B.mt): lalpha = alpha if k == 0 else 1.0 for n in range(B.nt): - DTRSM(ctx, A, k, k, B, k, n, side='L', uplo='L', transa='N', diag=diag, alpha=lalpha) + DTRSM( + ctx, + A, + k, + k, + B, + k, + n, + side="L", + uplo="L", + transa="N", + diag=diag, + alpha=lalpha, + ) for m in range(k + 1, B.mt): for n in range(B.nt): - DGEMM(ctx, A, m, k, B, k, n, 
B, m, n, transa='N', transb='N', alpha=-1.0, beta=lalpha) + DGEMM( + ctx, + A, + m, + k, + B, + k, + n, + B, + m, + n, + transa="N", + transb="N", + alpha=-1.0, + beta=lalpha, + ) else: # trans == 'T' or 'C' # Backward substitution for k in range(B.mt): lalpha = alpha if k == 0 else 1.0 for n in range(B.nt): - DTRSM(ctx, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n, - side='L', uplo='L', transa='T', diag=diag, alpha=lalpha) + DTRSM( + ctx, + A, + B.mt - k - 1, + B.mt - k - 1, + B, + B.mt - k - 1, + n, + side="L", + uplo="L", + transa="T", + diag=diag, + alpha=lalpha, + ) for m in range(k + 1, B.mt): for n in range(B.nt): - DGEMM(ctx, A, B.mt - k - 1, B.mt - 1 - m, B, B.mt - k - 1, n, - B, B.mt - 1 - m, n, transa='T', transb='N', alpha=-1.0, beta=lalpha) - - print(f"[PDTRSM] Completed") + DGEMM( + ctx, + A, + B.mt - k - 1, + B.mt - 1 - m, + B, + B.mt - k - 1, + n, + B, + B.mt - 1 - m, + n, + transa="T", + transb="N", + alpha=-1.0, + beta=lalpha, + ) + + print("[PDTRSM] Completed") + -def PDPOTRS(ctx, A, B, uplo='L'): +def PDPOTRS(ctx, A, B, uplo="L"): """Solve A @ X = B where A is factored by Cholesky (A = L @ L.T)""" - print(f"\n[PDPOTRS] Solving linear system...") - + print("\n[PDPOTRS] Solving linear system...") + # First solve: L @ Y = B - PDTRSM(ctx, A, B, side='L', uplo=uplo, trans='N' if uplo == 'L' else 'T', diag='N', alpha=1.0) - + PDTRSM( + ctx, + A, + B, + side="L", + uplo=uplo, + trans="N" if uplo == "L" else "T", + diag="N", + alpha=1.0, + ) + # Second solve: L.T @ X = Y - PDTRSM(ctx, A, B, side='L', uplo=uplo, trans='T' if uplo == 'L' else 'N', diag='N', alpha=1.0) - - print(f"[PDPOTRS] Completed") + PDTRSM( + ctx, + A, + B, + side="L", + uplo=uplo, + trans="T" if uplo == "L" else "N", + diag="N", + alpha=1.0, + ) + + print("[PDPOTRS] Completed") + -def PDGEMM(ctx, A, B, C, transa='N', transb='N', alpha=1.0, beta=1.0): +def PDGEMM(ctx, A, B, C, transa="N", transb="N", alpha=1.0, beta=1.0): """Parallel tiled matrix multiplication""" - print(f"\n[PDGEMM] Starting matrix multiplication...") - + print("\n[PDGEMM] Starting matrix multiplication...") + for m in range(C.mt): for n in range(C.nt): - inner_k = A.nt if transa == 'N' else A.mt - + inner_k = A.nt if transa == "N" else A.mt + if alpha == 0.0 or inner_k == 0: # Just scale C - DGEMM(ctx, A, 0, 0, B, 0, 0, C, m, n, transa=transa, transb=transb, alpha=0.0, beta=beta) - elif transa == 'N': - if transb == 'N': + DGEMM( + ctx, + A, + 0, + 0, + B, + 0, + 0, + C, + m, + n, + transa=transa, + transb=transb, + alpha=0.0, + beta=beta, + ) + elif transa == "N": + if transb == "N": for k in range(A.nt): zbeta = beta if k == 0 else 1.0 - DGEMM(ctx, A, m, k, B, k, n, C, m, n, transa='N', transb='N', alpha=alpha, beta=zbeta) + DGEMM( + ctx, + A, + m, + k, + B, + k, + n, + C, + m, + n, + transa="N", + transb="N", + alpha=alpha, + beta=zbeta, + ) else: for k in range(A.nt): zbeta = beta if k == 0 else 1.0 - DGEMM(ctx, A, m, k, B, n, k, C, m, n, transa='N', transb='T', alpha=alpha, beta=zbeta) + DGEMM( + ctx, + A, + m, + k, + B, + n, + k, + C, + m, + n, + transa="N", + transb="T", + alpha=alpha, + beta=zbeta, + ) else: - if transb == 'N': + if transb == "N": for k in range(A.mt): zbeta = beta if k == 0 else 1.0 - DGEMM(ctx, A, k, m, B, k, n, C, m, n, transa='T', transb='N', alpha=alpha, beta=zbeta) + DGEMM( + ctx, + A, + k, + m, + B, + k, + n, + C, + m, + n, + transa="T", + transb="N", + alpha=alpha, + beta=zbeta, + ) else: for k in range(A.mt): zbeta = beta if k == 0 else 1.0 - DGEMM(ctx, A, k, m, B, n, k, C, m, n, transa='T', 
transb='T', alpha=alpha, beta=zbeta) - - print(f"[PDGEMM] Completed") + DGEMM( + ctx, + A, + k, + m, + B, + n, + k, + C, + m, + n, + transa="T", + transb="T", + alpha=alpha, + beta=zbeta, + ) + + print("[PDGEMM] Completed") -def compute_norm(matrix): - """Compute Frobenius norm of matrix""" + +def compute_norm(ctx, matrix): + """Compute Frobenius norm of matrix using host tasks""" norm_sq = 0.0 + for colb in range(matrix.nt): low_rowb = colb if matrix.sym_matrix else 0 for rowb in range(low_rowb, matrix.mt): - d_block = matrix.device_array(rowb, colb) - norm_sq += float(cp.sum(d_block * d_block)) + handle = matrix.handle(rowb, colb) + + # Host task to read the block and compute norm + def compute_block_norm(h_block): + nonlocal norm_sq + norm_sq += np.sum(h_block * h_block) + + with ctx.task(stf.exec_place.host(), handle.read()) as t: + # Synchronize the stream before reading data + cp.cuda.runtime.streamSynchronize(t.stream_ptr()) + + h_block = cai_to_numpy(t.get_arg_cai(0)) + compute_block_norm(h_block) + return np.sqrt(norm_sq) def main(): import argparse - - parser = argparse.ArgumentParser(description='Tiled Cholesky decomposition with CUDA STF') - parser.add_argument('N', type=int, nargs='?', default=1024, help='Matrix size (default: 1024)') - parser.add_argument('NB', type=int, nargs='?', default=128, help='Block size (default: 128)') - parser.add_argument('--check', action='store_true', help='Check result (slower)') + + parser = argparse.ArgumentParser( + description="Tiled Cholesky decomposition with CUDA STF" + ) + parser.add_argument( + "N", type=int, nargs="?", default=1024, help="Matrix size (default: 1024)" + ) + parser.add_argument( + "NB", type=int, nargs="?", default=128, help="Block size (default: 128)" + ) + parser.add_argument("--check", action="store_true", help="Check result (slower)") args = parser.parse_args() - + N = args.N NB = args.NB check_result = args.check - + assert N % NB == 0, f"Matrix size {N} must be divisible by block size {NB}" - - print("="*60) + + print("=" * 60) print("Tiled Cholesky Decomposition with CUDA STF + CuPy") - print("="*60) + print("=" * 60) print(f"Matrix size: {N}x{N}") print(f"Block size: {NB}x{NB}") - print(f"Number of blocks: {N//NB}x{N//NB}") + print(f"Number of blocks: {N // NB}x{N // NB}") print(f"Check result: {check_result}") - print("="*60) - + print("=" * 60) + # Create STF context ctx = stf.context() - + # Create matrices A = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=True, symbol="A") - + if check_result: Aref = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=False, symbol="Aref") - + # Fill with Hilbert matrix + diagonal dominance # H_{i,j} = 1/(i+j+1) + 2*N if i==j def hilbert(row, col): return 1.0 / (row + col + 1.0) + (2.0 * N if row == col else 0.0) - - print("\n" + "="*60) + + print("\n" + "=" * 60) print("Initializing matrices...") - print("="*60) - + print("=" * 60) + A.fill(hilbert) if check_result: Aref.fill(hilbert) - + # Create right-hand side if check_result: B = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="B") Bref = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="Bref") - + def rhs_vals(row, col): return 1.0 * (row + 1) - + B.fill(rhs_vals) Bref.fill(rhs_vals) - + # Compute ||B|| for residual calculation - Bref_norm = compute_norm(Bref) - + Bref_norm = compute_norm(ctx, Bref) + # Synchronize before timing cp.cuda.runtime.deviceSynchronize() - + # Record start time start_event = cp.cuda.Event() stop_event = cp.cuda.Event() start_event.record() - + # Perform Cholesky factorization - 
print("\n" + "="*60) + print("\n" + "=" * 60) print("Performing Cholesky factorization...") - print("="*60) + print("=" * 60) PDPOTRF(ctx, A) - + # Record stop time stop_event.record() - + # Solve system if checking if check_result: - print("\n" + "="*60) + print("\n" + "=" * 60) print("Solving linear system...") - print("="*60) - PDPOTRS(ctx, A, B, uplo='L') - - print("\n" + "="*60) + print("=" * 60) + PDPOTRS(ctx, A, B, uplo="L") + + print("\n" + "=" * 60) print("Computing residual...") - print("="*60) + print("=" * 60) # Compute residual: Bref = Aref @ B - Bref - PDGEMM(ctx, Aref, B, Bref, transa='N', transb='N', alpha=1.0, beta=-1.0) - + PDGEMM(ctx, Aref, B, Bref, transa="N", transb="N", alpha=1.0, beta=-1.0) + # Compute ||residual|| - res_norm = compute_norm(Bref) - + res_norm = compute_norm(ctx, Bref) + # Finalize STF context - print("\n" + "="*60) + print("\n" + "=" * 60) print("Finalizing STF context...") - print("="*60) + print("=" * 60) ctx.finalize() - + # Wait for completion stop_event.synchronize() - + # Compute timing elapsed_ms = cp.cuda.get_elapsed_time(start_event, stop_event) - gflops = (1.0/3.0 * N * N * N) / 1e9 + gflops = (1.0 / 3.0 * N * N * N) / 1e9 gflops_per_sec = gflops / (elapsed_ms / 1000.0) - - print("\n" + "="*60) + + print("\n" + "=" * 60) print("Results") - print("="*60) + print("=" * 60) print(f"[PDPOTRF] Elapsed time: {elapsed_ms:.2f} ms") print(f"[PDPOTRF] Performance: {gflops_per_sec:.2f} GFLOPS") - + if check_result: residual = res_norm / Bref_norm print(f"\n[POTRS] ||AX - B||: {res_norm:.6e}") print(f"[POTRS] ||B||: {Bref_norm:.6e}") print(f"[POTRS] Residual (||AX - B||/||B||): {residual:.6e}") - + if residual >= 0.01: print("\n❌ Algorithm did not converge (residual >= 0.01)") return 1 else: print("\n✅ Algorithm converged successfully!") - - print("="*60) + + print("=" * 60) return 0 if __name__ == "__main__": sys.exit(main()) - From abd577817bcecce9150c20842cabc8173cc69fa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 9 Oct 2025 15:04:36 +0200 Subject: [PATCH 197/221] POTRI and Cholesky --- python/cuda_cccl/tests/stf/example_potri.py | 629 ++++++++++++++++++++ 1 file changed, 629 insertions(+) create mode 100644 python/cuda_cccl/tests/stf/example_potri.py diff --git a/python/cuda_cccl/tests/stf/example_potri.py b/python/cuda_cccl/tests/stf/example_potri.py new file mode 100644 index 00000000000..0be70fc9338 --- /dev/null +++ b/python/cuda_cccl/tests/stf/example_potri.py @@ -0,0 +1,629 @@ +#!/usr/bin/env python3 +""" +Python implementation of POTRI (matrix inversion via Cholesky) using CUDA STF and CuPy. + +POTRI computes the inverse of a symmetric positive definite matrix using its Cholesky factorization: +1. Cholesky factorization: A = L*L^T +2. Triangular inversion: L^(-1) +3. Compute A^(-1) = L^(-T) * L^(-1) + +This example demonstrates: +- Tiled matrix operations with STF logical data +- Integration of CuPy's CUBLAS and CUSOLVER functions with STF tasks +- Multi-device execution with automatic data placement +- Task-based parallelism for linear algebra operations +""" + +import sys +import numpy as np +import cupy as cp +from cupyx.scipy import linalg as cp_linalg +import cuda.stf as stf + + +class CAIWrapper: + """Wrapper to expose CUDA Array Interface dict as a proper CAI object.""" + def __init__(self, cai_dict): + self.__cuda_array_interface__ = cai_dict + + +def get_cupy_arrays(task): + """ + Get all CuPy arrays from STF task arguments. 
+ + Usage: + d_a, d_b, d_c = get_cupy_arrays(t) + """ + arrays = [] + idx = 0 + while True: + try: + arrays.append(cp.asarray(CAIWrapper(task.get_arg_cai(idx)))) + idx += 1 + except: + break + return tuple(arrays) if len(arrays) > 1 else arrays[0] if arrays else None + + +def cai_to_numpy(cai_dict): + """Convert CUDA Array Interface dict to NumPy array (for host memory).""" + import ctypes + + # Extract CAI fields + data_ptr, readonly = cai_dict['data'] + shape = cai_dict['shape'] + typestr = cai_dict['typestr'] + + # Convert typestr to NumPy dtype + dtype = np.dtype(typestr) + + # Calculate total size in bytes + itemsize = dtype.itemsize + size = np.prod(shape) * itemsize + + # Create ctypes buffer from pointer + buffer = (ctypes.c_byte * size).from_address(data_ptr) + + # Create NumPy array from buffer + arr = np.frombuffer(buffer, dtype=dtype).reshape(shape) + + return arr + + +class BlockRef: + """Reference to a specific block in a tiled matrix.""" + def __init__(self, matrix, row, col): + self.matrix = matrix + self.row = row + self.col = col + self._handle = matrix.handle(row, col) + self._devid = matrix.get_preferred_devid(row, col) + + def handle(self): + """Get the STF logical data handle for this block.""" + return self._handle + + def devid(self): + """Get the preferred device ID for this block.""" + return self._devid + + def __repr__(self): + return f"BlockRef({self.matrix.symbol}[{self.row},{self.col}])" + + +class TiledMatrix: + """ + Tiled matrix class that splits a matrix into blocks for parallel processing. + Each block is managed as an STF logical data object. + Uses tiled storage format for contiguous blocks. + """ + def __init__(self, ctx, nrows, ncols, blocksize_rows, blocksize_cols, + is_symmetric=False, symbol="matrix", dtype=np.float64): + self.ctx = ctx + self.symbol = symbol + self.dtype = dtype + self.sym_matrix = is_symmetric + + self.m = nrows + self.n = ncols + self.mb = blocksize_rows + self.nb = blocksize_cols + + assert self.m % self.mb == 0, f"nrows {nrows} must be divisible by blocksize_rows {blocksize_rows}" + assert self.n % self.nb == 0, f"ncols {ncols} must be divisible by blocksize_cols {blocksize_cols}" + + # Number of blocks + self.mt = self.m // self.mb + self.nt = self.n // self.nb + + # Allocate pinned host memory for faster transfers (in tiled format) + self.h_array = cp.cuda.alloc_pinned_memory( + self.m * self.n * np.dtype(dtype).itemsize + ) + self.h_array_np = np.frombuffer(self.h_array, dtype=dtype).reshape( + self.m, self.n + ) + + # Dictionary to store logical data handles for each block + self.handles = {} + + # Determine device layout + self.ndevs = cp.cuda.runtime.getDeviceCount() + self.grid_p, self.grid_q = self._compute_device_grid(self.ndevs) + + print(f"[{self.symbol}] {self.m}x{self.n} matrix, {self.mt}x{self.nt} blocks of {self.mb}x{self.nb}") + print(f"[{self.symbol}] Using {self.ndevs} devices in {self.grid_p}x{self.grid_q} grid") + + def _compute_device_grid(self, ndevs): + """Compute 2D device grid dimensions (as close to square as possible)""" + grid_p = 1 + grid_q = ndevs + for a in range(1, int(np.sqrt(ndevs)) + 1): + if ndevs % a == 0: + grid_p = a + grid_q = ndevs // a + return grid_p, grid_q + + def get_preferred_devid(self, row, col): + """Get preferred device ID for a given block using cyclic distribution""" + return (row % self.grid_p) + (col % self.grid_q) * self.grid_p + + def handle(self, row, col): + """Get the logical data handle for a block.""" + return self.handles[(row, col)] + + def block(self, row, col): 
+ """Get a BlockRef for block (row, col)""" + return BlockRef(self, row, col) + + def _get_index(self, row, col): + """Convert (row, col) to linear index in tiled storage""" + tile_row = row // self.mb + tile_col = col // self.nb + tile_size = self.mb * self.nb + tile_start = (tile_row + self.mt * tile_col) * tile_size + offset = (row % self.mb) + (col % self.nb) * self.mb + return tile_start + offset + + def _get_block_h(self, brow, bcol): + """Get a view of the host data for block (brow, bcol)""" + # For tiled storage, blocks are stored contiguously + start_idx = (brow + self.mt * bcol) * self.mb * self.nb + end_idx = start_idx + self.mb * self.nb + flat_view = self.h_array_np.ravel() + return flat_view[start_idx:end_idx].reshape(self.mb, self.nb) + + def fill(self, func): + """ + Fill the matrix blocks using a function func(row, col) -> value. + Creates STF logical data from host arrays and lets STF handle transfers. + """ + print(f"[{self.symbol}] Filling matrix on host...") + for colb in range(self.nt): + low_rowb = colb if self.sym_matrix else 0 + for rowb in range(low_rowb, self.mt): + # Fill host block + h_block = self._get_block_h(rowb, colb) + for lrow in range(self.mb): + for lcol in range(self.nb): + row = lrow + rowb * self.mb + col = lcol + colb * self.nb + h_block[lrow, lcol] = func(row, col) + + handle = self.ctx.logical_data(h_block) + handle.set_symbol(f"{self.symbol}_{rowb}_{colb}") + + self.handles[(rowb, colb)] = handle + + +# ============================================================================ +# Block-level operations (BLAS/LAPACK) +# ============================================================================ + +def DPOTRF(ctx, a): + """Cholesky factorization of a diagonal block: A = L*L^T (lower triangular)""" + with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t: + d_block = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + d_block[:] = cp.linalg.cholesky(d_block) + + +def DTRSM(ctx, a, b, side='L', uplo='L', transa='N', diag='N', alpha=1.0): + """Triangular solve: B = alpha * op(A)^(-1) * B""" + with ctx.task(stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw()) as t: + d_a, d_b = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + lower = (uplo == 'L') + trans = (transa != 'N') + result = cp_linalg.solve_triangular(d_a, d_b, lower=lower, trans=trans) + if alpha != 1.0: + d_b[:] = alpha * result + else: + d_b[:] = result + + +def DTRTRI(ctx, a, uplo='L', diag='N'): + """Triangular matrix inversion: A = A^(-1)""" + with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t: + d_block = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + lower = (uplo == 'L') + unit_diagonal = (diag == 'U') + # CuPy doesn't have trtri directly, use solve with identity + n = d_block.shape[0] + identity = cp.eye(n, dtype=d_block.dtype) + d_block[:] = cp_linalg.solve_triangular(d_block, identity, lower=lower, unit_diagonal=unit_diagonal) + + +def DGEMM(ctx, a, b, c, transa='N', transb='N', alpha=1.0, beta=1.0): + """General matrix multiplication: C = alpha * op(A) * op(B) + beta * C""" + with ctx.task(stf.exec_place.device(c.devid()), a.handle().read(), b.handle().read(), c.handle().rw()) as t: + d_a, d_b, d_c = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + op_a = d_a.T if transa != 'N' else d_a + op_b = d_b.T if transb != 'N' else d_b + + if beta == 0.0: + d_c[:] = alpha * (op_a @ op_b) + elif beta == 1.0: + d_c[:] += alpha * (op_a @ op_b) + else: + d_c[:] = 
alpha * (op_a @ op_b) + beta * d_c + + +def DSYRK(ctx, a, c, uplo='L', trans='N', alpha=1.0, beta=1.0): + """Symmetric rank-k update: C = alpha * op(A) @ op(A).T + beta * C""" + with ctx.task(stf.exec_place.device(c.devid()), a.handle().read(), c.handle().rw()) as t: + d_a, d_c = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + op_a = d_a.T if trans != 'N' else d_a + + if beta == 0.0: + d_c[:] = alpha * (op_a @ op_a.T) + elif beta == 1.0: + d_c[:] += alpha * (op_a @ op_a.T) + else: + d_c[:] = alpha * (op_a @ op_a.T) + beta * d_c + + +def DTRMM(ctx, a, b, side='L', uplo='L', transa='N', diag='N', alpha=1.0): + """Triangular matrix multiplication: B = alpha * op(A) * B (side='L') or B = alpha * B * op(A) (side='R')""" + with ctx.task(stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw()) as t: + d_a, d_b = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + lower = (uplo == 'L') + trans = (transa != 'N') + + # Extract triangle from A + if lower: + tri_a = cp.tril(d_a) + else: + tri_a = cp.triu(d_a) + + if trans: + tri_a = tri_a.T + + if side == 'L': + d_b[:] = alpha * (tri_a @ d_b) + else: # side == 'R' + d_b[:] = alpha * (d_b @ tri_a) + + +def DSYMM(ctx, a, b, c, side='L', uplo='L', alpha=1.0, beta=1.0): + """Symmetric matrix multiplication: C = alpha * A * B + beta * C (side='L') or C = alpha * B * A + beta * C (side='R') + where A is symmetric.""" + with ctx.task(stf.exec_place.device(c.devid()), a.handle().read(), b.handle().read(), c.handle().rw()) as t: + d_a, d_b, d_c = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + # Reconstruct full symmetric matrix from lower/upper triangle + if uplo == 'L': + # Lower triangle is stored + sym_a = cp.tril(d_a) + cp.tril(d_a, -1).T + else: + # Upper triangle is stored + sym_a = cp.triu(d_a) + cp.triu(d_a, 1).T + + if side == 'L': + result = alpha * (sym_a @ d_b) + else: # side == 'R' + result = alpha * (d_b @ sym_a) + + if beta == 0.0: + d_c[:] = result + elif beta == 1.0: + d_c[:] += result + else: + d_c[:] = result + beta * d_c + + +# ============================================================================ +# Tiled operations +# ============================================================================ + +def PDPOTRF(ctx, A, uplo='L'): + """Parallel tiled Cholesky factorization""" + print(f"\n[PDPOTRF] Starting Cholesky factorization...") + assert uplo == 'L', "Only lower triangular factorization supported" + + for k in range(A.nt): + # Factorize diagonal block + DPOTRF(ctx, A.block(k, k)) + + # Update column below diagonal + for m in range(k + 1, A.mt): + DTRSM(ctx, A.block(k, k), A.block(m, k), side='R', uplo='L', transa='T', diag='N', alpha=1.0) + + # Update trailing submatrix + for n in range(k + 1, A.nt): + DSYRK(ctx, A.block(n, k), A.block(n, n), uplo='L', trans='N', alpha=-1.0, beta=1.0) + + for m in range(n + 1, A.mt): + DGEMM(ctx, A.block(m, k), A.block(n, k), A.block(m, n), transa='N', transb='T', alpha=-1.0, beta=1.0) + + print(f"[PDPOTRF] Completed") + + +def PDTRTRI(ctx, A, uplo='L', diag='N'): + """Parallel tiled triangular matrix inversion""" + print(f"\n[PDTRTRI] Starting triangular inversion...") + assert uplo == 'L', "Only lower triangular inversion supported" + + for k in range(A.nt): + # Step 1: Update A[m,k] for m > k + for m in range(k + 1, A.mt): + DTRSM(ctx, A.block(k, k), A.block(m, k), side='R', uplo='L', transa='N', diag=diag, alpha=-1.0) + + # Step 2: Update A[m,n] for m > k, n < k + for m in range(k + 1, A.mt): + for n in range(k): + DGEMM(ctx, 
A.block(m, k), A.block(k, n), A.block(m, n), transa='N', transb='N', alpha=1.0, beta=1.0) + + # Step 3: Update A[k,n] for n < k + for n in range(k): + DTRSM(ctx, A.block(k, k), A.block(k, n), side='L', uplo='L', transa='N', diag=diag, alpha=1.0) + + # Step 4: Invert diagonal block A[k,k] + DTRTRI(ctx, A.block(k, k), uplo=uplo, diag=diag) + + print(f"[PDTRTRI] Completed") + + +def DLAAUM(ctx, a, uplo='L'): + """Compute A^T * A for a triangular block (lauum operation)""" + with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t: + d_block = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + # lauum: compute L * L^T for lower triangular L + if uplo == 'L': + L = cp.tril(d_block) + d_block[:] = L @ L.T + else: + U = cp.triu(d_block) + d_block[:] = U.T @ U + + +def PDLAUUM(ctx, A, uplo='L'): + """Parallel tiled computation of A^T * A for lower triangular A""" + print(f"\n[PDLAUUM] Starting LAUUM (A^T * A)...") + assert uplo == 'L', "Only lower triangular LAUUM supported" + + for k in range(A.mt): + # Step 1: Update off-diagonal blocks + for n in range(k): + # Update A[n,n] with A[k,n]^T * A[k,n] + DSYRK(ctx, A.block(k, n), A.block(n, n), uplo='L', trans='T', alpha=1.0, beta=1.0) + + # Update A[m,n] with A[k,m]^T * A[k,n] + for m in range(n + 1, k): + DGEMM(ctx, A.block(k, m), A.block(k, n), A.block(m, n), transa='T', transb='N', alpha=1.0, beta=1.0) + + # Step 2: Update A[k,n] = A[k,k]^T * A[k,n] + for n in range(k): + DTRMM(ctx, A.block(k, k), A.block(k, n), side='L', uplo='L', transa='T', diag='N', alpha=1.0) + + # Step 3: Update diagonal block A[k,k] = A[k,k]^T * A[k,k] + DLAAUM(ctx, A.block(k, k), uplo=uplo) + + print(f"[PDLAUUM] Completed") + + +def PDGEMM(ctx, A, B, C, transa='N', transb='N', alpha=1.0, beta=1.0): + """Parallel tiled matrix multiplication""" + print(f"\n[PDGEMM] Starting matrix multiplication...") + + for m in range(C.mt): + for n in range(C.nt): + inner_k = A.nt if transa == 'N' else A.mt + + if alpha == 0.0 or inner_k == 0: + # Just scale C + DGEMM(ctx, A.block(0, 0), B.block(0, 0), C.block(m, n), transa=transa, transb=transb, alpha=0.0, beta=beta) + elif transa == 'N': + if transb == 'N': + for k in range(A.nt): + zbeta = beta if k == 0 else 1.0 + DGEMM(ctx, A.block(m, k), B.block(k, n), C.block(m, n), transa='N', transb='N', alpha=alpha, beta=zbeta) + else: + for k in range(A.nt): + zbeta = beta if k == 0 else 1.0 + DGEMM(ctx, A.block(m, k), B.block(n, k), C.block(m, n), transa='N', transb='T', alpha=alpha, beta=zbeta) + else: # transa in ['T', 'C'] + if transb == 'N': + for k in range(A.mt): + zbeta = beta if k == 0 else 1.0 + DGEMM(ctx, A.block(k, m), B.block(k, n), C.block(m, n), transa='T', transb='N', alpha=alpha, beta=zbeta) + else: + for k in range(A.mt): + zbeta = beta if k == 0 else 1.0 + DGEMM(ctx, A.block(k, m), B.block(n, k), C.block(m, n), transa='T', transb='T', alpha=alpha, beta=zbeta) + + print(f"[PDGEMM] Completed") + + +def PDSYMM(ctx, A, B, C, side='L', uplo='L', alpha=1.0, beta=1.0): + """Parallel tiled symmetric matrix multiplication""" + print(f"\n[PDSYMM] Starting symmetric matrix multiplication...") + + for m in range(C.mt): + for n in range(C.nt): + if side == 'L': + if uplo == 'L': + for k in range(C.mt): + zbeta = beta if k == 0 else 1.0 + if k < m: + DGEMM(ctx, A.block(m, k), B.block(k, n), C.block(m, n), transa='N', transb='N', alpha=alpha, beta=zbeta) + else: + if k == m: + DSYMM(ctx, A.block(k, k), B.block(k, n), C.block(m, n), side=side, uplo=uplo, alpha=alpha, beta=zbeta) + else: + DGEMM(ctx, A.block(k, 
m), B.block(k, n), C.block(m, n), transa='T', transb='N', alpha=alpha, beta=zbeta) + else: # side == 'R' + # Similar logic for right multiplication + pass + + print(f"[PDSYMM] Completed") + + +def compute_norm(ctx, matrix): + """Compute Frobenius norm of matrix using host tasks""" + norm_sq = 0.0 + + for colb in range(matrix.nt): + low_rowb = colb if matrix.sym_matrix else 0 + for rowb in range(low_rowb, matrix.mt): + handle = matrix.handle(rowb, colb) + + # Host task to read the block and compute norm + def compute_block_norm(h_block): + nonlocal norm_sq + norm_sq += np.sum(h_block * h_block) + + with ctx.task(stf.exec_place.host(), handle.read()) as t: + # Synchronize the stream before reading data + cp.cuda.runtime.streamSynchronize(t.stream_ptr()) + + h_block = cai_to_numpy(t.get_arg_cai(0)) + compute_block_norm(h_block) + + return np.sqrt(norm_sq) + + +def main(): + import argparse + + parser = argparse.ArgumentParser(description='Tiled POTRI (matrix inversion via Cholesky) with CUDA STF') + parser.add_argument('N', type=int, nargs='?', default=512, help='Matrix size (default: 512)') + parser.add_argument('NB', type=int, nargs='?', default=128, help='Block size (default: 128)') + parser.add_argument('--check', action='store_true', help='Check result (slower)') + args = parser.parse_args() + + N = args.N + NB = args.NB + check_result = args.check + + assert N % NB == 0, f"Matrix size {N} must be divisible by block size {NB}" + + print("="*60) + print("Tiled POTRI (Matrix Inversion) with CUDA STF + CuPy") + print("="*60) + print(f"Matrix size: {N}x{N}") + print(f"Block size: {NB}x{NB}") + print(f"Number of blocks: {N//NB}x{N//NB}") + print(f"Check result: {check_result}") + print("="*60) + + # Create STF context + ctx = stf.context() + + # Create matrices + A = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=True, symbol="A") + + if check_result: + Aref = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=False, symbol="Aref") + + print("\n" + "="*60) + print("Initializing matrices...") + print("="*60) + + # Hilbert matrix + diagonal dominance for numerical stability + def hilbert(row, col): + return 1.0 / (col + row + 1.0) + 2.0 * N * (col == row) + + A.fill(hilbert) + if check_result: + Aref.fill(hilbert) + + # Measure performance + import time + start_time = time.time() + + print("\n" + "="*60) + print("Performing POTRI (inversion via Cholesky)...") + print("="*60) + + # Step 1: Cholesky factorization A = L*L^T + PDPOTRF(ctx, A, uplo='L') + + # Step 2: Triangular inversion L^(-1) + PDTRTRI(ctx, A, uplo='L', diag='N') + + # Step 3: Compute A^(-1) = L^(-T) * L^(-1) + PDLAUUM(ctx, A, uplo='L') + + if check_result: + print("\n" + "="*60) + print("Verifying result...") + print("="*60) + + # Create test vector B + B_potri = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="B_potri") + Bref_potri = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="Bref_potri") + + def rhs_vals(row, col): + return 1.0 * (row + 1) + + B_potri.fill(rhs_vals) + Bref_potri.fill(rhs_vals) + + # Compute norm of B + b_norm = compute_norm(ctx, Bref_potri) + + # Create temporary matrix for result + B_tmp = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="B_tmp") + + def zero_vals(row, col): + return 0.0 + + B_tmp.fill(zero_vals) + + # Compute B_tmp = A^(-1) * B + PDSYMM(ctx, A, B_potri, B_tmp, side='L', uplo='L', alpha=1.0, beta=0.0) + + # Compute residual: Bref = Aref * B_tmp - Bref + PDGEMM(ctx, Aref, B_tmp, Bref_potri, transa='N', transb='N', alpha=1.0, beta=-1.0) + + # Compute residual norm + 
res_norm = compute_norm(ctx, Bref_potri) + + print("\n" + "="*60) + print("Finalizing STF context...") + print("="*60) + ctx.finalize() + + end_time = time.time() + elapsed_ms = (end_time - start_time) * 1000.0 + + # Compute FLOPS for POTRI + # POTRF: (1/3) * N^3 + # TRTRI: (1/3) * N^3 + # LAUUM: (1/3) * N^3 + # Total: N^3 + flops = float(N) ** 3 + gflops = flops / (elapsed_ms / 1000.0) / 1e9 + + print("\n" + "="*60) + print("Results") + print("="*60) + print(f"[POTRI] Elapsed time: {elapsed_ms:.2f} ms") + print(f"[POTRI] Performance: {gflops:.2f} GFLOPS") + + if check_result: + residual = res_norm / b_norm + print(f"\n[POTRI] ||A * (A^(-1) * B) - B||: {res_norm:.6e}") + print(f"[POTRI] ||B||: {b_norm:.6e}") + print(f"[POTRI] Residual (||A * (A^(-1) * B) - B||/||B||): {residual:.6e}") + + if residual < 0.01: + print("\n✅ Algorithm converged successfully!") + return 0 + else: + print(f"\n❌ Algorithm did not converge (residual {residual:.6e} >= 0.01)") + return 1 + + print("="*60) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) + From 80e1085ce4f3507df4e2e5ef9dc250c87f0e399f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 9 Oct 2025 15:38:30 +0200 Subject: [PATCH 198/221] clang-format --- .../cuda_cccl/tests/stf/example_cholesky.py | 229 +++++++----------- 1 file changed, 82 insertions(+), 147 deletions(-) diff --git a/python/cuda_cccl/tests/stf/example_cholesky.py b/python/cuda_cccl/tests/stf/example_cholesky.py index faef64d1d73..1dac4b5a4fa 100755 --- a/python/cuda_cccl/tests/stf/example_cholesky.py +++ b/python/cuda_cccl/tests/stf/example_cholesky.py @@ -71,6 +71,28 @@ def cai_to_numpy(cai_dict): return arr +class BlockRef: + """Reference to a specific block in a tiled matrix.""" + + def __init__(self, matrix, row, col): + self.matrix = matrix + self.row = row + self.col = col + self._handle = matrix.handle(row, col) + self._devid = matrix.get_preferred_devid(row, col) + + def handle(self): + """Get the STF logical data handle for this block.""" + return self._handle + + def devid(self): + """Get the preferred device ID for this block.""" + return self._devid + + def __repr__(self): + return f"BlockRef({self.matrix.symbol}[{self.row},{self.col}])" + + class TiledMatrix: """ Tiled matrix class that splits a matrix into blocks for parallel processing. 
@@ -165,6 +187,10 @@ def handle(self, row, col): """Get the logical data handle for block (row, col)""" return self.handles[(row, col)] + def block(self, row, col): + """Get a BlockRef for block (row, col)""" + return BlockRef(self, row, col) + def _get_index(self, row, col): """Convert (row, col) to linear index in tiled storage""" # Find which tile contains this element @@ -213,37 +239,19 @@ def fill(self, func): # BLAS/LAPACK operations wrapped in STF tasks -def DPOTRF(ctx, A, row, col): - """Cholesky factorization of block (row, col) using CUSOLVER""" - handle = A.handle(row, col) - devid = A.get_preferred_devid(row, col) - - with ctx.task(stf.exec_place.device(devid), handle.rw()) as t: +def DPOTRF(ctx, a): + """Cholesky factorization of a diagonal block: A = L*L^T (lower triangular)""" + with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t: d_block = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): d_block[:] = cp.linalg.cholesky(d_block) -def DTRSM( - ctx, - A, - a_row, - a_col, - B, - b_row, - b_col, - side="L", - uplo="L", - transa="T", - diag="N", - alpha=1.0, -): +def DTRSM(ctx, a, b, side="L", uplo="L", transa="T", diag="N", alpha=1.0): """Triangular solve: B = alpha * op(A)^{-1} @ B or B = alpha * B @ op(A)^{-1}""" - handle_a = A.handle(a_row, a_col) - handle_b = B.handle(b_row, b_col) - devid = B.get_preferred_devid(b_row, b_col) - - with ctx.task(stf.exec_place.device(devid), handle_a.read(), handle_b.rw()) as t: + with ctx.task( + stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw() + ) as t: d_a, d_b = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): if side == "L": @@ -266,30 +274,13 @@ def DTRSM( d_b *= alpha -def DGEMM( - ctx, - A, - a_row, - a_col, - B, - b_row, - b_col, - C, - c_row, - c_col, - transa="N", - transb="N", - alpha=1.0, - beta=1.0, -): +def DGEMM(ctx, a, b, c, transa="N", transb="N", alpha=1.0, beta=1.0): """Matrix multiplication: C = alpha * op(A) @ op(B) + beta * C""" - handle_a = A.handle(a_row, a_col) - handle_b = B.handle(b_row, b_col) - handle_c = C.handle(c_row, c_col) - devid = C.get_preferred_devid(c_row, c_col) - with ctx.task( - stf.exec_place.device(devid), handle_a.read(), handle_b.read(), handle_c.rw() + stf.exec_place.device(c.devid()), + a.handle().read(), + b.handle().read(), + c.handle().rw(), ) as t: d_a, d_b, d_c = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): @@ -304,15 +295,11 @@ def DGEMM( d_c[:] = alpha * (op_a @ op_b) + beta * d_c -def DSYRK( - ctx, A, a_row, a_col, C, c_row, c_col, uplo="L", trans="N", alpha=1.0, beta=1.0 -): +def DSYRK(ctx, a, c, uplo="L", trans="N", alpha=1.0, beta=1.0): """Symmetric rank-k update: C = alpha * op(A) @ op(A).T + beta * C""" - handle_a = A.handle(a_row, a_col) - handle_c = C.handle(c_row, c_col) - devid = C.get_preferred_devid(c_row, c_col) - - with ctx.task(stf.exec_place.device(devid), handle_a.read(), handle_c.rw()) as t: + with ctx.task( + stf.exec_place.device(c.devid()), a.handle().read(), c.handle().rw() + ) as t: d_a, d_c = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): op_a = d_a.T if trans != "N" else d_a @@ -340,18 +327,14 @@ def PDPOTRF(ctx, A): for k in range(nblocks): # Factor diagonal block - DPOTRF(ctx, A, k, k) + DPOTRF(ctx, A.block(k, k)) # Solve triangular systems for blocks in column k for row in range(k + 1, nblocks): DTRSM( ctx, - A, - k, - k, - A, - row, - k, + A.block(k, k), + A.block(row, k), side="R", uplo="L", transa="T", @@ -363,15 +346,9 @@ def PDPOTRF(ctx, A): for col in 
range(k + 1, row): DGEMM( ctx, - A, - row, - k, - A, - col, - k, - A, - row, - col, + A.block(row, k), + A.block(col, k), + A.block(row, col), transa="N", transb="T", alpha=-1.0, @@ -380,7 +357,13 @@ def PDPOTRF(ctx, A): # Symmetric rank-k update of diagonal block DSYRK( - ctx, A, row, k, A, row, row, uplo="L", trans="N", alpha=-1.0, beta=1.0 + ctx, + A.block(row, k), + A.block(row, row), + uplo="L", + trans="N", + alpha=-1.0, + beta=1.0, ) print("[PDPOTRF] Completed") @@ -399,12 +382,8 @@ def PDTRSM(ctx, A, B, side="L", uplo="L", trans="N", diag="N", alpha=1.0): for n in range(B.nt): DTRSM( ctx, - A, - k, - k, - B, - k, - n, + A.block(k, k), + B.block(k, n), side="L", uplo="L", transa="N", @@ -415,15 +394,9 @@ def PDTRSM(ctx, A, B, side="L", uplo="L", trans="N", diag="N", alpha=1.0): for n in range(B.nt): DGEMM( ctx, - A, - m, - k, - B, - k, - n, - B, - m, - n, + A.block(m, k), + B.block(k, n), + B.block(m, n), transa="N", transb="N", alpha=-1.0, @@ -433,15 +406,12 @@ def PDTRSM(ctx, A, B, side="L", uplo="L", trans="N", diag="N", alpha=1.0): # Backward substitution for k in range(B.mt): lalpha = alpha if k == 0 else 1.0 + row_idx = B.mt - k - 1 for n in range(B.nt): DTRSM( ctx, - A, - B.mt - k - 1, - B.mt - k - 1, - B, - B.mt - k - 1, - n, + A.block(row_idx, row_idx), + B.block(row_idx, n), side="L", uplo="L", transa="T", @@ -449,18 +419,13 @@ def PDTRSM(ctx, A, B, side="L", uplo="L", trans="N", diag="N", alpha=1.0): alpha=lalpha, ) for m in range(k + 1, B.mt): + m_idx = B.mt - 1 - m for n in range(B.nt): DGEMM( ctx, - A, - B.mt - k - 1, - B.mt - 1 - m, - B, - B.mt - k - 1, - n, - B, - B.mt - 1 - m, - n, + A.block(row_idx, m_idx), + B.block(row_idx, n), + B.block(m_idx, n), transa="T", transb="N", alpha=-1.0, @@ -513,15 +478,9 @@ def PDGEMM(ctx, A, B, C, transa="N", transb="N", alpha=1.0, beta=1.0): # Just scale C DGEMM( ctx, - A, - 0, - 0, - B, - 0, - 0, - C, - m, - n, + A.block(0, 0), + B.block(0, 0), + C.block(m, n), transa=transa, transb=transb, alpha=0.0, @@ -533,15 +492,9 @@ def PDGEMM(ctx, A, B, C, transa="N", transb="N", alpha=1.0, beta=1.0): zbeta = beta if k == 0 else 1.0 DGEMM( ctx, - A, - m, - k, - B, - k, - n, - C, - m, - n, + A.block(m, k), + B.block(k, n), + C.block(m, n), transa="N", transb="N", alpha=alpha, @@ -552,15 +505,9 @@ def PDGEMM(ctx, A, B, C, transa="N", transb="N", alpha=1.0, beta=1.0): zbeta = beta if k == 0 else 1.0 DGEMM( ctx, - A, - m, - k, - B, - n, - k, - C, - m, - n, + A.block(m, k), + B.block(n, k), + C.block(m, n), transa="N", transb="T", alpha=alpha, @@ -572,15 +519,9 @@ def PDGEMM(ctx, A, B, C, transa="N", transb="N", alpha=1.0, beta=1.0): zbeta = beta if k == 0 else 1.0 DGEMM( ctx, - A, - k, - m, - B, - k, - n, - C, - m, - n, + A.block(k, m), + B.block(k, n), + C.block(m, n), transa="T", transb="N", alpha=alpha, @@ -591,15 +532,9 @@ def PDGEMM(ctx, A, B, C, transa="N", transb="N", alpha=1.0, beta=1.0): zbeta = beta if k == 0 else 1.0 DGEMM( ctx, - A, - k, - m, - B, - n, - k, - C, - m, - n, + A.block(k, m), + B.block(n, k), + C.block(m, n), transa="T", transb="T", alpha=alpha, From 4c1551ab80c84fc50895c500f6cb2b5334c4a402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Thu, 9 Oct 2025 21:45:21 +0200 Subject: [PATCH 199/221] how changes to numba-cuda have been merged --- python/cuda_cccl/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index 14561449098..f8dd51338d8 100644 --- a/python/cuda_cccl/pyproject.toml +++ 
b/python/cuda_cccl/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "numpy", "cuda-pathfinder>=1.2.3", "cuda-core", - "numba-cuda @ git+https://github.com/caugonnet/numba-cuda.git@cuda_graph_future_memory", + "numba-cuda @ git+https://github.com/NVIDIA/numba-cuda.git@main", ] dynamic = ["version"] From de333b28f2599deb8a61dd7a416633c3700da6c6 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Fri, 14 Nov 2025 13:48:06 -0500 Subject: [PATCH 200/221] Fix CI precommit --- python/cuda_cccl/CMakeLists.txt | 63 +- .../cuda_cccl/tests/stf/example_cholesky.py | 2 +- python/cuda_cccl/tests/stf/example_potri.py | 595 ++++++++++++------ 3 files changed, 442 insertions(+), 218 deletions(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 2c8cc0e43e4..e8ce363b6e7 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -25,10 +25,10 @@ cccl_build_compiler_targets() # Build and install C++ library first set(CCCL_ENABLE_C_PARALLEL ON) -set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) # Enable C experimental STF library (triggers c/ directory) -set(CCCL_C_PARALLEL_ENABLE_TESTING OFF) # Testing belongs in CI, not Python build -set(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING OFF) # Testing belongs in CI, not Python build -set(CCCL_ENABLE_UNSTABLE ON) # Enable unstable features +set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) # Enable C experimental STF library (triggers c/ directory) +set(CCCL_C_PARALLEL_ENABLE_TESTING OFF) # Testing belongs in CI, not Python build +set(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING OFF) # Testing belongs in CI, not Python build +set(CCCL_ENABLE_UNSTABLE ON) # Enable unstable features set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) @@ -60,13 +60,13 @@ file(MAKE_DIRECTORY "cuda/compute/${CUDA_VERSION_DIR}/cccl") # Install version-specific binaries install( - TARGETS cccl.c.experimental.stf - DESTINATION cuda/stf/${CUDA_VERSION_DIR}/cccl + TARGETS cccl.c.experimental.stf + DESTINATION cuda/stf/${CUDA_VERSION_DIR}/cccl ) install( - TARGETS cccl.c.parallel - DESTINATION cuda/compute/${CUDA_VERSION_DIR}/cccl + TARGETS cccl.c.parallel + DESTINATION cuda/compute/${CUDA_VERSION_DIR}/cccl ) # Build and install Cython extension @@ -138,20 +138,30 @@ add_custom_target( ) message(STATUS "STF Using Cython ${CYTHON_VERSION}") -set(stf_pyx_source_file "${cuda_cccl_SOURCE_DIR}/cuda/stf/_stf_bindings_impl.pyx") +set( + stf_pyx_source_file + "${cuda_cccl_SOURCE_DIR}/cuda/stf/_stf_bindings_impl.pyx" +) set(_stf_generated_extension_src "${cuda_cccl_BINARY_DIR}/_stf_bindings_impl.c") set(_stf_depfile "${cuda_cccl_BINARY_DIR}/_stf_bindings_impl.c.dep") add_custom_command( - OUTPUT "${_stf_generated_extension_src}" - COMMAND "${Python3_EXECUTABLE}" -m cython - ARGS ${CYTHON_FLAGS_LIST} "${stf_pyx_source_file}" --output-file ${_stf_generated_extension_src} - DEPENDS "${stf_pyx_source_file}" - DEPFILE "${_stf_depfile}" - COMMENT "Cythonizing ${pyx_source_file} for CUDA ${CUDA_VERSION_MAJOR}" + OUTPUT "${_stf_generated_extension_src}" + COMMAND "${Python3_EXECUTABLE}" -m cython + ARGS + ${CYTHON_FLAGS_LIST} "${stf_pyx_source_file}" --output-file + ${_stf_generated_extension_src} + DEPENDS "${stf_pyx_source_file}" + DEPFILE "${_stf_depfile}" + COMMENT "Cythonizing ${pyx_source_file} for CUDA ${CUDA_VERSION_MAJOR}" ) -set_source_files_properties("${_stf_generated_extension_src}" PROPERTIES GENERATED TRUE) -add_custom_target(cythonize_stf_bindings_impl ALL - 
DEPENDS "${_stf_generated_extension_src}" +set_source_files_properties( + "${_stf_generated_extension_src}" + PROPERTIES GENERATED TRUE +) +add_custom_target( + cythonize_stf_bindings_impl + ALL + DEPENDS "${_stf_generated_extension_src}" ) python3_add_library( @@ -170,10 +180,21 @@ target_link_libraries( ) set_target_properties(_bindings_impl PROPERTIES INSTALL_RPATH "$ORIGIN/cccl") -Python3_add_library(_stf_bindings_impl MODULE WITH_SOABI "${_stf_generated_extension_src}") +python3_add_library( + _stf_bindings_impl + MODULE + WITH_SOABI + "${_stf_generated_extension_src}" +) add_dependencies(_stf_bindings_impl cythonize_stf_bindings_impl) -target_link_libraries(_stf_bindings_impl PRIVATE cccl.c.experimental.stf CUDA::cuda_driver) -set_target_properties(_stf_bindings_impl PROPERTIES INSTALL_RPATH "$ORIGIN/cccl") +target_link_libraries( + _stf_bindings_impl + PRIVATE cccl.c.experimental.stf CUDA::cuda_driver +) +set_target_properties( + _stf_bindings_impl + PROPERTIES INSTALL_RPATH "$ORIGIN/cccl" +) install(TARGETS _stf_bindings_impl DESTINATION cuda/stf/${CUDA_VERSION_DIR}) install(TARGETS _bindings_impl DESTINATION cuda/compute/${CUDA_VERSION_DIR}) diff --git a/python/cuda_cccl/tests/stf/example_cholesky.py b/python/cuda_cccl/tests/stf/example_cholesky.py index 1dac4b5a4fa..7eded4a20b7 100755 --- a/python/cuda_cccl/tests/stf/example_cholesky.py +++ b/python/cuda_cccl/tests/stf/example_cholesky.py @@ -41,7 +41,7 @@ def get_cupy_arrays(task): try: arrays.append(cp.asarray(CAIWrapper(task.get_arg_cai(idx)))) idx += 1 - except: + except Exception: break return tuple(arrays) if len(arrays) > 1 else arrays[0] if arrays else None diff --git a/python/cuda_cccl/tests/stf/example_potri.py b/python/cuda_cccl/tests/stf/example_potri.py index 0be70fc9338..1e3c721a9c1 100644 --- a/python/cuda_cccl/tests/stf/example_potri.py +++ b/python/cuda_cccl/tests/stf/example_potri.py @@ -15,14 +15,17 @@ """ import sys -import numpy as np + import cupy as cp +import numpy as np from cupyx.scipy import linalg as cp_linalg + import cuda.stf as stf class CAIWrapper: """Wrapper to expose CUDA Array Interface dict as a proper CAI object.""" + def __init__(self, cai_dict): self.__cuda_array_interface__ = cai_dict @@ -30,7 +33,7 @@ def __init__(self, cai_dict): def get_cupy_arrays(task): """ Get all CuPy arrays from STF task arguments. 
- + Usage: d_a, d_b, d_c = get_cupy_arrays(t) """ @@ -40,7 +43,7 @@ def get_cupy_arrays(task): try: arrays.append(cp.asarray(CAIWrapper(task.get_arg_cai(idx)))) idx += 1 - except: + except Exception: break return tuple(arrays) if len(arrays) > 1 else arrays[0] if arrays else None @@ -48,45 +51,46 @@ def get_cupy_arrays(task): def cai_to_numpy(cai_dict): """Convert CUDA Array Interface dict to NumPy array (for host memory).""" import ctypes - + # Extract CAI fields - data_ptr, readonly = cai_dict['data'] - shape = cai_dict['shape'] - typestr = cai_dict['typestr'] - + data_ptr, readonly = cai_dict["data"] + shape = cai_dict["shape"] + typestr = cai_dict["typestr"] + # Convert typestr to NumPy dtype dtype = np.dtype(typestr) - + # Calculate total size in bytes itemsize = dtype.itemsize size = np.prod(shape) * itemsize - + # Create ctypes buffer from pointer buffer = (ctypes.c_byte * size).from_address(data_ptr) - + # Create NumPy array from buffer arr = np.frombuffer(buffer, dtype=dtype).reshape(shape) - + return arr class BlockRef: """Reference to a specific block in a tiled matrix.""" + def __init__(self, matrix, row, col): self.matrix = matrix self.row = row self.col = col self._handle = matrix.handle(row, col) self._devid = matrix.get_preferred_devid(row, col) - + def handle(self): """Get the STF logical data handle for this block.""" return self._handle - + def devid(self): """Get the preferred device ID for this block.""" return self._devid - + def __repr__(self): return f"BlockRef({self.matrix.symbol}[{self.row},{self.col}])" @@ -97,25 +101,39 @@ class TiledMatrix: Each block is managed as an STF logical data object. Uses tiled storage format for contiguous blocks. """ - def __init__(self, ctx, nrows, ncols, blocksize_rows, blocksize_cols, - is_symmetric=False, symbol="matrix", dtype=np.float64): + + def __init__( + self, + ctx, + nrows, + ncols, + blocksize_rows, + blocksize_cols, + is_symmetric=False, + symbol="matrix", + dtype=np.float64, + ): self.ctx = ctx self.symbol = symbol self.dtype = dtype self.sym_matrix = is_symmetric - + self.m = nrows self.n = ncols self.mb = blocksize_rows self.nb = blocksize_cols - - assert self.m % self.mb == 0, f"nrows {nrows} must be divisible by blocksize_rows {blocksize_rows}" - assert self.n % self.nb == 0, f"ncols {ncols} must be divisible by blocksize_cols {blocksize_cols}" - + + assert self.m % self.mb == 0, ( + f"nrows {nrows} must be divisible by blocksize_rows {blocksize_rows}" + ) + assert self.n % self.nb == 0, ( + f"ncols {ncols} must be divisible by blocksize_cols {blocksize_cols}" + ) + # Number of blocks self.mt = self.m // self.mb self.nt = self.n // self.nb - + # Allocate pinned host memory for faster transfers (in tiled format) self.h_array = cp.cuda.alloc_pinned_memory( self.m * self.n * np.dtype(dtype).itemsize @@ -123,17 +141,21 @@ def __init__(self, ctx, nrows, ncols, blocksize_rows, blocksize_cols, self.h_array_np = np.frombuffer(self.h_array, dtype=dtype).reshape( self.m, self.n ) - + # Dictionary to store logical data handles for each block self.handles = {} - + # Determine device layout self.ndevs = cp.cuda.runtime.getDeviceCount() self.grid_p, self.grid_q = self._compute_device_grid(self.ndevs) - - print(f"[{self.symbol}] {self.m}x{self.n} matrix, {self.mt}x{self.nt} blocks of {self.mb}x{self.nb}") - print(f"[{self.symbol}] Using {self.ndevs} devices in {self.grid_p}x{self.grid_q} grid") - + + print( + f"[{self.symbol}] {self.m}x{self.n} matrix, {self.mt}x{self.nt} blocks of {self.mb}x{self.nb}" + ) + print( + 
f"[{self.symbol}] Using {self.ndevs} devices in {self.grid_p}x{self.grid_q} grid" + ) + def _compute_device_grid(self, ndevs): """Compute 2D device grid dimensions (as close to square as possible)""" grid_p = 1 @@ -143,19 +165,19 @@ def _compute_device_grid(self, ndevs): grid_p = a grid_q = ndevs // a return grid_p, grid_q - + def get_preferred_devid(self, row, col): """Get preferred device ID for a given block using cyclic distribution""" return (row % self.grid_p) + (col % self.grid_q) * self.grid_p - + def handle(self, row, col): """Get the logical data handle for a block.""" return self.handles[(row, col)] - + def block(self, row, col): """Get a BlockRef for block (row, col)""" return BlockRef(self, row, col) - + def _get_index(self, row, col): """Convert (row, col) to linear index in tiled storage""" tile_row = row // self.mb @@ -164,7 +186,7 @@ def _get_index(self, row, col): tile_start = (tile_row + self.mt * tile_col) * tile_size offset = (row % self.mb) + (col % self.nb) * self.mb return tile_start + offset - + def _get_block_h(self, brow, bcol): """Get a view of the host data for block (brow, bcol)""" # For tiled storage, blocks are stored contiguously @@ -172,7 +194,7 @@ def _get_block_h(self, brow, bcol): end_idx = start_idx + self.mb * self.nb flat_view = self.h_array_np.ravel() return flat_view[start_idx:end_idx].reshape(self.mb, self.nb) - + def fill(self, func): """ Fill the matrix blocks using a function func(row, col) -> value. @@ -189,10 +211,10 @@ def fill(self, func): row = lrow + rowb * self.mb col = lcol + colb * self.nb h_block[lrow, lcol] = func(row, col) - + handle = self.ctx.logical_data(h_block) handle.set_symbol(f"{self.symbol}_{rowb}_{colb}") - + self.handles[(rowb, colb)] = handle @@ -200,6 +222,7 @@ def fill(self, func): # Block-level operations (BLAS/LAPACK) # ============================================================================ + def DPOTRF(ctx, a): """Cholesky factorization of a diagonal block: A = L*L^T (lower triangular)""" with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t: @@ -208,13 +231,15 @@ def DPOTRF(ctx, a): d_block[:] = cp.linalg.cholesky(d_block) -def DTRSM(ctx, a, b, side='L', uplo='L', transa='N', diag='N', alpha=1.0): +def DTRSM(ctx, a, b, side="L", uplo="L", transa="N", diag="N", alpha=1.0): """Triangular solve: B = alpha * op(A)^(-1) * B""" - with ctx.task(stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw()) as t: + with ctx.task( + stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw() + ) as t: d_a, d_b = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): - lower = (uplo == 'L') - trans = (transa != 'N') + lower = uplo == "L" + trans = transa != "N" result = cp_linalg.solve_triangular(d_a, d_b, lower=lower, trans=trans) if alpha != 1.0: d_b[:] = alpha * result @@ -222,27 +247,34 @@ def DTRSM(ctx, a, b, side='L', uplo='L', transa='N', diag='N', alpha=1.0): d_b[:] = result -def DTRTRI(ctx, a, uplo='L', diag='N'): +def DTRTRI(ctx, a, uplo="L", diag="N"): """Triangular matrix inversion: A = A^(-1)""" with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t: d_block = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): - lower = (uplo == 'L') - unit_diagonal = (diag == 'U') + lower = uplo == "L" + unit_diagonal = diag == "U" # CuPy doesn't have trtri directly, use solve with identity n = d_block.shape[0] identity = cp.eye(n, dtype=d_block.dtype) - d_block[:] = cp_linalg.solve_triangular(d_block, identity, lower=lower, unit_diagonal=unit_diagonal) + 
d_block[:] = cp_linalg.solve_triangular( + d_block, identity, lower=lower, unit_diagonal=unit_diagonal + ) -def DGEMM(ctx, a, b, c, transa='N', transb='N', alpha=1.0, beta=1.0): +def DGEMM(ctx, a, b, c, transa="N", transb="N", alpha=1.0, beta=1.0): """General matrix multiplication: C = alpha * op(A) * op(B) + beta * C""" - with ctx.task(stf.exec_place.device(c.devid()), a.handle().read(), b.handle().read(), c.handle().rw()) as t: + with ctx.task( + stf.exec_place.device(c.devid()), + a.handle().read(), + b.handle().read(), + c.handle().rw(), + ) as t: d_a, d_b, d_c = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): - op_a = d_a.T if transa != 'N' else d_a - op_b = d_b.T if transb != 'N' else d_b - + op_a = d_a.T if transa != "N" else d_a + op_b = d_b.T if transb != "N" else d_b + if beta == 0.0: d_c[:] = alpha * (op_a @ op_b) elif beta == 1.0: @@ -251,13 +283,15 @@ def DGEMM(ctx, a, b, c, transa='N', transb='N', alpha=1.0, beta=1.0): d_c[:] = alpha * (op_a @ op_b) + beta * d_c -def DSYRK(ctx, a, c, uplo='L', trans='N', alpha=1.0, beta=1.0): +def DSYRK(ctx, a, c, uplo="L", trans="N", alpha=1.0, beta=1.0): """Symmetric rank-k update: C = alpha * op(A) @ op(A).T + beta * C""" - with ctx.task(stf.exec_place.device(c.devid()), a.handle().read(), c.handle().rw()) as t: + with ctx.task( + stf.exec_place.device(c.devid()), a.handle().read(), c.handle().rw() + ) as t: d_a, d_c = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): - op_a = d_a.T if trans != 'N' else d_a - + op_a = d_a.T if trans != "N" else d_a + if beta == 0.0: d_c[:] = alpha * (op_a @ op_a.T) elif beta == 1.0: @@ -266,48 +300,55 @@ def DSYRK(ctx, a, c, uplo='L', trans='N', alpha=1.0, beta=1.0): d_c[:] = alpha * (op_a @ op_a.T) + beta * d_c -def DTRMM(ctx, a, b, side='L', uplo='L', transa='N', diag='N', alpha=1.0): +def DTRMM(ctx, a, b, side="L", uplo="L", transa="N", diag="N", alpha=1.0): """Triangular matrix multiplication: B = alpha * op(A) * B (side='L') or B = alpha * B * op(A) (side='R')""" - with ctx.task(stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw()) as t: + with ctx.task( + stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw() + ) as t: d_a, d_b = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): - lower = (uplo == 'L') - trans = (transa != 'N') - + lower = uplo == "L" + trans = transa != "N" + # Extract triangle from A if lower: tri_a = cp.tril(d_a) else: tri_a = cp.triu(d_a) - + if trans: tri_a = tri_a.T - - if side == 'L': + + if side == "L": d_b[:] = alpha * (tri_a @ d_b) else: # side == 'R' d_b[:] = alpha * (d_b @ tri_a) -def DSYMM(ctx, a, b, c, side='L', uplo='L', alpha=1.0, beta=1.0): +def DSYMM(ctx, a, b, c, side="L", uplo="L", alpha=1.0, beta=1.0): """Symmetric matrix multiplication: C = alpha * A * B + beta * C (side='L') or C = alpha * B * A + beta * C (side='R') where A is symmetric.""" - with ctx.task(stf.exec_place.device(c.devid()), a.handle().read(), b.handle().read(), c.handle().rw()) as t: + with ctx.task( + stf.exec_place.device(c.devid()), + a.handle().read(), + b.handle().read(), + c.handle().rw(), + ) as t: d_a, d_b, d_c = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): # Reconstruct full symmetric matrix from lower/upper triangle - if uplo == 'L': + if uplo == "L": # Lower triangle is stored sym_a = cp.tril(d_a) + cp.tril(d_a, -1).T else: # Upper triangle is stored sym_a = cp.triu(d_a) + cp.triu(d_a, 1).T - - if side == 'L': + + if side == "L": result = alpha * (sym_a @ d_b) else: # side == 'R' result = 
alpha * (d_b @ sym_a) - + if beta == 0.0: d_c[:] = result elif beta == 1.0: @@ -320,61 +361,115 @@ def DSYMM(ctx, a, b, c, side='L', uplo='L', alpha=1.0, beta=1.0): # Tiled operations # ============================================================================ -def PDPOTRF(ctx, A, uplo='L'): + +def PDPOTRF(ctx, A, uplo="L"): """Parallel tiled Cholesky factorization""" - print(f"\n[PDPOTRF] Starting Cholesky factorization...") - assert uplo == 'L', "Only lower triangular factorization supported" - + print("\n[PDPOTRF] Starting Cholesky factorization...") + assert uplo == "L", "Only lower triangular factorization supported" + for k in range(A.nt): # Factorize diagonal block DPOTRF(ctx, A.block(k, k)) - + # Update column below diagonal for m in range(k + 1, A.mt): - DTRSM(ctx, A.block(k, k), A.block(m, k), side='R', uplo='L', transa='T', diag='N', alpha=1.0) - + DTRSM( + ctx, + A.block(k, k), + A.block(m, k), + side="R", + uplo="L", + transa="T", + diag="N", + alpha=1.0, + ) + # Update trailing submatrix for n in range(k + 1, A.nt): - DSYRK(ctx, A.block(n, k), A.block(n, n), uplo='L', trans='N', alpha=-1.0, beta=1.0) - + DSYRK( + ctx, + A.block(n, k), + A.block(n, n), + uplo="L", + trans="N", + alpha=-1.0, + beta=1.0, + ) + for m in range(n + 1, A.mt): - DGEMM(ctx, A.block(m, k), A.block(n, k), A.block(m, n), transa='N', transb='T', alpha=-1.0, beta=1.0) - - print(f"[PDPOTRF] Completed") + DGEMM( + ctx, + A.block(m, k), + A.block(n, k), + A.block(m, n), + transa="N", + transb="T", + alpha=-1.0, + beta=1.0, + ) + + print("[PDPOTRF] Completed") -def PDTRTRI(ctx, A, uplo='L', diag='N'): +def PDTRTRI(ctx, A, uplo="L", diag="N"): """Parallel tiled triangular matrix inversion""" - print(f"\n[PDTRTRI] Starting triangular inversion...") - assert uplo == 'L', "Only lower triangular inversion supported" - + print("\n[PDTRTRI] Starting triangular inversion...") + assert uplo == "L", "Only lower triangular inversion supported" + for k in range(A.nt): # Step 1: Update A[m,k] for m > k for m in range(k + 1, A.mt): - DTRSM(ctx, A.block(k, k), A.block(m, k), side='R', uplo='L', transa='N', diag=diag, alpha=-1.0) - + DTRSM( + ctx, + A.block(k, k), + A.block(m, k), + side="R", + uplo="L", + transa="N", + diag=diag, + alpha=-1.0, + ) + # Step 2: Update A[m,n] for m > k, n < k for m in range(k + 1, A.mt): for n in range(k): - DGEMM(ctx, A.block(m, k), A.block(k, n), A.block(m, n), transa='N', transb='N', alpha=1.0, beta=1.0) - + DGEMM( + ctx, + A.block(m, k), + A.block(k, n), + A.block(m, n), + transa="N", + transb="N", + alpha=1.0, + beta=1.0, + ) + # Step 3: Update A[k,n] for n < k for n in range(k): - DTRSM(ctx, A.block(k, k), A.block(k, n), side='L', uplo='L', transa='N', diag=diag, alpha=1.0) - + DTRSM( + ctx, + A.block(k, k), + A.block(k, n), + side="L", + uplo="L", + transa="N", + diag=diag, + alpha=1.0, + ) + # Step 4: Invert diagonal block A[k,k] DTRTRI(ctx, A.block(k, k), uplo=uplo, diag=diag) - - print(f"[PDTRTRI] Completed") + + print("[PDTRTRI] Completed") -def DLAAUM(ctx, a, uplo='L'): +def DLAAUM(ctx, a, uplo="L"): """Compute A^T * A for a triangular block (lauum operation)""" with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t: d_block = get_cupy_arrays(t) with cp.cuda.ExternalStream(t.stream_ptr()): # lauum: compute L * L^T for lower triangular L - if uplo == 'L': + if uplo == "L": L = cp.tril(d_block) d_block[:] = L @ L.T else: @@ -382,248 +477,356 @@ def DLAAUM(ctx, a, uplo='L'): d_block[:] = U.T @ U -def PDLAUUM(ctx, A, uplo='L'): +def PDLAUUM(ctx, A, uplo="L"): 
"""Parallel tiled computation of A^T * A for lower triangular A""" - print(f"\n[PDLAUUM] Starting LAUUM (A^T * A)...") - assert uplo == 'L', "Only lower triangular LAUUM supported" - + print("\n[PDLAUUM] Starting LAUUM (A^T * A)...") + assert uplo == "L", "Only lower triangular LAUUM supported" + for k in range(A.mt): # Step 1: Update off-diagonal blocks for n in range(k): # Update A[n,n] with A[k,n]^T * A[k,n] - DSYRK(ctx, A.block(k, n), A.block(n, n), uplo='L', trans='T', alpha=1.0, beta=1.0) - + DSYRK( + ctx, + A.block(k, n), + A.block(n, n), + uplo="L", + trans="T", + alpha=1.0, + beta=1.0, + ) + # Update A[m,n] with A[k,m]^T * A[k,n] for m in range(n + 1, k): - DGEMM(ctx, A.block(k, m), A.block(k, n), A.block(m, n), transa='T', transb='N', alpha=1.0, beta=1.0) - + DGEMM( + ctx, + A.block(k, m), + A.block(k, n), + A.block(m, n), + transa="T", + transb="N", + alpha=1.0, + beta=1.0, + ) + # Step 2: Update A[k,n] = A[k,k]^T * A[k,n] for n in range(k): - DTRMM(ctx, A.block(k, k), A.block(k, n), side='L', uplo='L', transa='T', diag='N', alpha=1.0) - + DTRMM( + ctx, + A.block(k, k), + A.block(k, n), + side="L", + uplo="L", + transa="T", + diag="N", + alpha=1.0, + ) + # Step 3: Update diagonal block A[k,k] = A[k,k]^T * A[k,k] DLAAUM(ctx, A.block(k, k), uplo=uplo) - - print(f"[PDLAUUM] Completed") + + print("[PDLAUUM] Completed") -def PDGEMM(ctx, A, B, C, transa='N', transb='N', alpha=1.0, beta=1.0): +def PDGEMM(ctx, A, B, C, transa="N", transb="N", alpha=1.0, beta=1.0): """Parallel tiled matrix multiplication""" - print(f"\n[PDGEMM] Starting matrix multiplication...") - + print("\n[PDGEMM] Starting matrix multiplication...") + for m in range(C.mt): for n in range(C.nt): - inner_k = A.nt if transa == 'N' else A.mt - + inner_k = A.nt if transa == "N" else A.mt + if alpha == 0.0 or inner_k == 0: # Just scale C - DGEMM(ctx, A.block(0, 0), B.block(0, 0), C.block(m, n), transa=transa, transb=transb, alpha=0.0, beta=beta) - elif transa == 'N': - if transb == 'N': + DGEMM( + ctx, + A.block(0, 0), + B.block(0, 0), + C.block(m, n), + transa=transa, + transb=transb, + alpha=0.0, + beta=beta, + ) + elif transa == "N": + if transb == "N": for k in range(A.nt): zbeta = beta if k == 0 else 1.0 - DGEMM(ctx, A.block(m, k), B.block(k, n), C.block(m, n), transa='N', transb='N', alpha=alpha, beta=zbeta) + DGEMM( + ctx, + A.block(m, k), + B.block(k, n), + C.block(m, n), + transa="N", + transb="N", + alpha=alpha, + beta=zbeta, + ) else: for k in range(A.nt): zbeta = beta if k == 0 else 1.0 - DGEMM(ctx, A.block(m, k), B.block(n, k), C.block(m, n), transa='N', transb='T', alpha=alpha, beta=zbeta) + DGEMM( + ctx, + A.block(m, k), + B.block(n, k), + C.block(m, n), + transa="N", + transb="T", + alpha=alpha, + beta=zbeta, + ) else: # transa in ['T', 'C'] - if transb == 'N': + if transb == "N": for k in range(A.mt): zbeta = beta if k == 0 else 1.0 - DGEMM(ctx, A.block(k, m), B.block(k, n), C.block(m, n), transa='T', transb='N', alpha=alpha, beta=zbeta) + DGEMM( + ctx, + A.block(k, m), + B.block(k, n), + C.block(m, n), + transa="T", + transb="N", + alpha=alpha, + beta=zbeta, + ) else: for k in range(A.mt): zbeta = beta if k == 0 else 1.0 - DGEMM(ctx, A.block(k, m), B.block(n, k), C.block(m, n), transa='T', transb='T', alpha=alpha, beta=zbeta) - - print(f"[PDGEMM] Completed") + DGEMM( + ctx, + A.block(k, m), + B.block(n, k), + C.block(m, n), + transa="T", + transb="T", + alpha=alpha, + beta=zbeta, + ) + print("[PDGEMM] Completed") -def PDSYMM(ctx, A, B, C, side='L', uplo='L', alpha=1.0, beta=1.0): + +def PDSYMM(ctx, A, B, 
C, side="L", uplo="L", alpha=1.0, beta=1.0): """Parallel tiled symmetric matrix multiplication""" - print(f"\n[PDSYMM] Starting symmetric matrix multiplication...") - + print("\n[PDSYMM] Starting symmetric matrix multiplication...") + for m in range(C.mt): for n in range(C.nt): - if side == 'L': - if uplo == 'L': + if side == "L": + if uplo == "L": for k in range(C.mt): zbeta = beta if k == 0 else 1.0 if k < m: - DGEMM(ctx, A.block(m, k), B.block(k, n), C.block(m, n), transa='N', transb='N', alpha=alpha, beta=zbeta) + DGEMM( + ctx, + A.block(m, k), + B.block(k, n), + C.block(m, n), + transa="N", + transb="N", + alpha=alpha, + beta=zbeta, + ) else: if k == m: - DSYMM(ctx, A.block(k, k), B.block(k, n), C.block(m, n), side=side, uplo=uplo, alpha=alpha, beta=zbeta) + DSYMM( + ctx, + A.block(k, k), + B.block(k, n), + C.block(m, n), + side=side, + uplo=uplo, + alpha=alpha, + beta=zbeta, + ) else: - DGEMM(ctx, A.block(k, m), B.block(k, n), C.block(m, n), transa='T', transb='N', alpha=alpha, beta=zbeta) + DGEMM( + ctx, + A.block(k, m), + B.block(k, n), + C.block(m, n), + transa="T", + transb="N", + alpha=alpha, + beta=zbeta, + ) else: # side == 'R' # Similar logic for right multiplication pass - - print(f"[PDSYMM] Completed") + + print("[PDSYMM] Completed") def compute_norm(ctx, matrix): """Compute Frobenius norm of matrix using host tasks""" norm_sq = 0.0 - + for colb in range(matrix.nt): low_rowb = colb if matrix.sym_matrix else 0 for rowb in range(low_rowb, matrix.mt): handle = matrix.handle(rowb, colb) - + # Host task to read the block and compute norm def compute_block_norm(h_block): nonlocal norm_sq norm_sq += np.sum(h_block * h_block) - + with ctx.task(stf.exec_place.host(), handle.read()) as t: # Synchronize the stream before reading data cp.cuda.runtime.streamSynchronize(t.stream_ptr()) - + h_block = cai_to_numpy(t.get_arg_cai(0)) compute_block_norm(h_block) - + return np.sqrt(norm_sq) def main(): import argparse - - parser = argparse.ArgumentParser(description='Tiled POTRI (matrix inversion via Cholesky) with CUDA STF') - parser.add_argument('N', type=int, nargs='?', default=512, help='Matrix size (default: 512)') - parser.add_argument('NB', type=int, nargs='?', default=128, help='Block size (default: 128)') - parser.add_argument('--check', action='store_true', help='Check result (slower)') + + parser = argparse.ArgumentParser( + description="Tiled POTRI (matrix inversion via Cholesky) with CUDA STF" + ) + parser.add_argument( + "N", type=int, nargs="?", default=512, help="Matrix size (default: 512)" + ) + parser.add_argument( + "NB", type=int, nargs="?", default=128, help="Block size (default: 128)" + ) + parser.add_argument("--check", action="store_true", help="Check result (slower)") args = parser.parse_args() - + N = args.N NB = args.NB check_result = args.check - + assert N % NB == 0, f"Matrix size {N} must be divisible by block size {NB}" - - print("="*60) + + print("=" * 60) print("Tiled POTRI (Matrix Inversion) with CUDA STF + CuPy") - print("="*60) + print("=" * 60) print(f"Matrix size: {N}x{N}") print(f"Block size: {NB}x{NB}") - print(f"Number of blocks: {N//NB}x{N//NB}") + print(f"Number of blocks: {N // NB}x{N // NB}") print(f"Check result: {check_result}") - print("="*60) - + print("=" * 60) + # Create STF context ctx = stf.context() - + # Create matrices A = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=True, symbol="A") - + if check_result: Aref = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=False, symbol="Aref") - - print("\n" + "="*60) + + print("\n" + "=" * 60) 
print("Initializing matrices...") - print("="*60) - + print("=" * 60) + # Hilbert matrix + diagonal dominance for numerical stability def hilbert(row, col): return 1.0 / (col + row + 1.0) + 2.0 * N * (col == row) - + A.fill(hilbert) if check_result: Aref.fill(hilbert) - + # Measure performance import time + start_time = time.time() - - print("\n" + "="*60) + + print("\n" + "=" * 60) print("Performing POTRI (inversion via Cholesky)...") - print("="*60) - + print("=" * 60) + # Step 1: Cholesky factorization A = L*L^T - PDPOTRF(ctx, A, uplo='L') - + PDPOTRF(ctx, A, uplo="L") + # Step 2: Triangular inversion L^(-1) - PDTRTRI(ctx, A, uplo='L', diag='N') - + PDTRTRI(ctx, A, uplo="L", diag="N") + # Step 3: Compute A^(-1) = L^(-T) * L^(-1) - PDLAUUM(ctx, A, uplo='L') - + PDLAUUM(ctx, A, uplo="L") + if check_result: - print("\n" + "="*60) + print("\n" + "=" * 60) print("Verifying result...") - print("="*60) - + print("=" * 60) + # Create test vector B B_potri = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="B_potri") - Bref_potri = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="Bref_potri") - + Bref_potri = TiledMatrix( + ctx, N, 1, NB, 1, is_symmetric=False, symbol="Bref_potri" + ) + def rhs_vals(row, col): return 1.0 * (row + 1) - + B_potri.fill(rhs_vals) Bref_potri.fill(rhs_vals) - + # Compute norm of B b_norm = compute_norm(ctx, Bref_potri) - + # Create temporary matrix for result B_tmp = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="B_tmp") - + def zero_vals(row, col): return 0.0 - + B_tmp.fill(zero_vals) - + # Compute B_tmp = A^(-1) * B - PDSYMM(ctx, A, B_potri, B_tmp, side='L', uplo='L', alpha=1.0, beta=0.0) - + PDSYMM(ctx, A, B_potri, B_tmp, side="L", uplo="L", alpha=1.0, beta=0.0) + # Compute residual: Bref = Aref * B_tmp - Bref - PDGEMM(ctx, Aref, B_tmp, Bref_potri, transa='N', transb='N', alpha=1.0, beta=-1.0) - + PDGEMM( + ctx, Aref, B_tmp, Bref_potri, transa="N", transb="N", alpha=1.0, beta=-1.0 + ) + # Compute residual norm res_norm = compute_norm(ctx, Bref_potri) - - print("\n" + "="*60) + + print("\n" + "=" * 60) print("Finalizing STF context...") - print("="*60) + print("=" * 60) ctx.finalize() - + end_time = time.time() elapsed_ms = (end_time - start_time) * 1000.0 - + # Compute FLOPS for POTRI # POTRF: (1/3) * N^3 - # TRTRI: (1/3) * N^3 + # TRTRI: (1/3) * N^3 # LAUUM: (1/3) * N^3 # Total: N^3 flops = float(N) ** 3 gflops = flops / (elapsed_ms / 1000.0) / 1e9 - - print("\n" + "="*60) + + print("\n" + "=" * 60) print("Results") - print("="*60) + print("=" * 60) print(f"[POTRI] Elapsed time: {elapsed_ms:.2f} ms") print(f"[POTRI] Performance: {gflops:.2f} GFLOPS") - + if check_result: residual = res_norm / b_norm print(f"\n[POTRI] ||A * (A^(-1) * B) - B||: {res_norm:.6e}") print(f"[POTRI] ||B||: {b_norm:.6e}") print(f"[POTRI] Residual (||A * (A^(-1) * B) - B||/||B||): {residual:.6e}") - + if residual < 0.01: print("\n✅ Algorithm converged successfully!") return 0 else: print(f"\n❌ Algorithm did not converge (residual {residual:.6e} >= 0.01)") return 1 - - print("="*60) + + print("=" * 60) return 0 if __name__ == "__main__": sys.exit(main()) - From 9a5c265e80c09b518539a74e67c4a028525a5024 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 24 Nov 2025 21:22:32 +0100 Subject: [PATCH 201/221] no need for numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 anymore --- python/cuda_cccl/cuda/stf/decorator.py | 3 --- python/cuda_cccl/tests/stf/test_decorator.py | 1 - python/cuda_cccl/tests/stf/test_fhe.py | 1 - 
python/cuda_cccl/tests/stf/test_fhe_decorator.py | 1 - python/cuda_cccl/tests/stf/test_numba.py | 1 - python/cuda_cccl/tests/stf/test_pytorch.py | 1 - python/cuda_cccl/tests/stf/test_stencil_decorator.py | 1 - python/cuda_cccl/tests/stf/test_token.py | 1 - 8 files changed, 10 deletions(-) diff --git a/python/cuda_cccl/cuda/stf/decorator.py b/python/cuda_cccl/cuda/stf/decorator.py index 65af9734f44..41bf71c6316 100644 --- a/python/cuda_cccl/cuda/stf/decorator.py +++ b/python/cuda_cccl/cuda/stf/decorator.py @@ -1,10 +1,7 @@ -import numba from numba import cuda from cuda.stf import context, dep, exec_place -numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 - class stf_kernel_decorator: def __init__(self, pyfunc, jit_args, jit_kwargs): diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 16bc2539538..338c76f28a7 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -5,7 +5,6 @@ import cuda.stf as stf -numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index d0bbdd3d596..94aaa7210da 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -9,7 +9,6 @@ import cuda.stf as stf -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index 571ff8013ea..79560dd25cf 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -9,7 +9,6 @@ import cuda.stf as cudastf -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index d15ae639bda..a2d30a7eb38 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -10,7 +10,6 @@ import cuda.stf as stf -numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index 001a7002d08..c4b337e801d 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -9,7 +9,6 @@ torch = pytest.importorskip("torch") -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 from cuda.stf._stf_bindings import ( # noqa: E402 diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index b4155c8b46b..07fa270ee17 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -4,7 +4,6 @@ import cuda.stf as cudastf -numba.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 diff --git a/python/cuda_cccl/tests/stf/test_token.py b/python/cuda_cccl/tests/stf/test_token.py index acef5e34f3e..04ceb920b6a 100644 --- a/python/cuda_cccl/tests/stf/test_token.py +++ b/python/cuda_cccl/tests/stf/test_token.py @@ -8,7 +8,6 @@ import cuda.stf as stf -numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 From e7e2adb2ace465f86db3b883a2da45d89b782995 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Mon, 24 Nov 
2025 21:44:53 +0100 Subject: [PATCH 202/221] Our numba-cuda fix is part of 0.21.0 --- python/cuda_cccl/pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index c5600f0da11..1ef761e0924 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -31,8 +31,7 @@ dependencies = [ "numpy", "cuda-pathfinder>=1.2.3", "cuda-core", - # FIXME - "numba-cuda @ git+https://github.com/NVIDIA/numba-cuda.git@main", + "numba-cuda>=0.21.0", "typing_extensions", ] From 39040a9e9e939f0e551d7092908feb94da36832d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 11:19:58 +0100 Subject: [PATCH 203/221] Minor doc fix --- c/experimental/stf/include/cccl/c/experimental/stf/stf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 49ae71098af..848f0f1d5db 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -487,7 +487,7 @@ cudaStream_t stf_fence(stf_ctx_handle ctx); void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz); //! -//! \brief Create logical data handle from address with data place specification [PRIMARY API] +//! \brief Create logical data handle from address with data place specification //! //! Creates logical data handle from existing memory buffer, explicitly specifying where //! the memory is located (host, device, managed, etc.). This is the primary and recommended From 8f27fa2a278762007356b0f0f2feb9b1cc920fee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 11:42:09 +0100 Subject: [PATCH 204/221] Ensure matplotlib is only used if available --- .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 21 ++++++++++++++++--- .../tests/stf/test_fdtd_pytorch_simplified.py | 21 ++++++++++++++++--- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index a64845055ce..bfb2d7b3a56 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -1,7 +1,6 @@ import math from typing import Literal, Optional, Tuple -import matplotlib.pyplot as plt import numpy as np import torch import torch.cuda as tc @@ -10,10 +9,21 @@ context, ) +try: + import matplotlib.pyplot as plt + + has_matplotlib = True +except ImportError: + has_matplotlib = False + Plane = Literal["xy", "xz", "yz"] def show_slice(t3d, plane="xy", index=None): + """Display a 2D slice of a 3D tensor (requires matplotlib).""" + if not has_matplotlib: + return + # grab a 2D view if plane == "xy": idx = t3d.shape[2] // 2 if index is None else index @@ -201,7 +211,8 @@ def source(t: float, x: float, y: float, z: float) -> float: ): ez = t.tensor_arguments() print(f"{n}\t{ez[cx, cy, cz].item():.6e}") - show_slice(ez, plane="xy") + if has_matplotlib: + show_slice(ez, plane="xy") pass ctx.finalize() @@ -210,4 +221,8 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # Run FDTD simulation print("Running FDTD 3D PyTorch example...") - test_fdtd_3d_pytorch(timesteps=1000, output_freq=5) + output_freq = 5 if has_matplotlib else 0 + if not has_matplotlib and output_freq > 0: + print("Warning: matplotlib not available, 
running without visualization") + output_freq = 0 + test_fdtd_3d_pytorch(timesteps=1000, output_freq=output_freq) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py index 24af8361162..5910d0978cd 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py @@ -1,7 +1,6 @@ import math from typing import Literal, Optional, Tuple -import matplotlib.pyplot as plt import numpy as np import torch @@ -9,10 +8,21 @@ context, ) +try: + import matplotlib.pyplot as plt + + has_matplotlib = True +except ImportError: + has_matplotlib = False + Plane = Literal["xy", "xz", "yz"] def show_slice(t3d, plane="xy", index=None): + """Display a 2D slice of a 3D tensor (requires matplotlib).""" + if not has_matplotlib: + return + # grab a 2D view if plane == "xy": idx = t3d.shape[2] // 2 if index is None else index @@ -202,7 +212,8 @@ def source(t: float, x: float, y: float, z: float) -> float: if output_freq > 0 and (n % output_freq) == 0: with ctx.pytorch_task(lez.read()) as (ez,): print(f"{n}\t{ez[cx, cy, cz].item():.6e}") - show_slice(ez, plane="xy") + if has_matplotlib: + show_slice(ez, plane="xy") ctx.finalize() @@ -210,4 +221,8 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # Run simplified FDTD simulation using pytorch_task print("Running FDTD simulation with pytorch_task syntax...") - test_fdtd_3d_pytorch_simplified(timesteps=1000, output_freq=5) + output_freq = 5 if has_matplotlib else 0 + if not has_matplotlib and output_freq > 0: + print("Warning: matplotlib not available, running without visualization") + output_freq = 0 + test_fdtd_3d_pytorch_simplified(timesteps=1000, output_freq=output_freq) From 73ac963a7f58b97ae5a777edca1c3d6893002962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 12:08:49 +0100 Subject: [PATCH 205/221] Cleanup examples --- python/cuda_cccl/pyproject.toml | 2 +- python/cuda_cccl/tests/stf/test_context.py | 1 - python/cuda_cccl/tests/stf/test_decorator.py | 1 - .../cuda_cccl/tests/stf/test_fdtd_pytorch.py | 5 +- .../tests/stf/test_fdtd_pytorch_simplified.py | 5 +- python/cuda_cccl/tests/stf/test_fhe.py | 1 - .../cuda_cccl/tests/stf/test_fhe_decorator.py | 1 - python/cuda_cccl/tests/stf/test_numba.py | 49 ++------------ python/cuda_cccl/tests/stf/test_pytorch.py | 67 ++----------------- .../tests/stf/test_stencil_decorator.py | 4 +- python/cuda_cccl/tests/stf/test_token.py | 4 -- 11 files changed, 19 insertions(+), 121 deletions(-) diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index 1ef761e0924..4c90e1a3e99 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "numpy", "cuda-pathfinder>=1.2.3", "cuda-core", - "numba-cuda>=0.21.0", + "numba-cuda @ git+https://github.com/NVIDIA/numba-cuda.git@v0.21.0", "typing_extensions", ] diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py index f4a583de351..451c44aadb8 100644 --- a/python/cuda_cccl/tests/stf/test_context.py +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -72,5 +72,4 @@ def test_ctx3(): if __name__ == "__main__": - print("Running CUDASTF examples...") test_ctx3() diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 338c76f28a7..ce8fad1d69b 100644 --- 
a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -42,5 +42,4 @@ def test_decorator(use_graph): if __name__ == "__main__": - print("Running CUDASTF examples...") test_decorator(False) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py index bfb2d7b3a56..d550caba060 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -2,7 +2,9 @@ from typing import Literal, Optional, Tuple import numpy as np -import torch +import pytest + +torch = pytest.importorskip("torch") import torch.cuda as tc from cuda.stf._stf_bindings import ( @@ -220,7 +222,6 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # Run FDTD simulation - print("Running FDTD 3D PyTorch example...") output_freq = 5 if has_matplotlib else 0 if not has_matplotlib and output_freq > 0: print("Warning: matplotlib not available, running without visualization") diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py index 5910d0978cd..b786552b6b3 100644 --- a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py @@ -2,7 +2,9 @@ from typing import Literal, Optional, Tuple import numpy as np -import torch +import pytest + +torch = pytest.importorskip("torch") from cuda.stf._stf_bindings import ( context, @@ -220,7 +222,6 @@ def source(t: float, x: float, y: float, z: float) -> float: if __name__ == "__main__": # Run simplified FDTD simulation using pytorch_task - print("Running FDTD simulation with pytorch_task syntax...") output_freq = 5 if has_matplotlib else 0 if not has_matplotlib and output_freq > 0: print("Warning: matplotlib not available, running without visualization") diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index 94aaa7210da..e613d37ea76 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -164,5 +164,4 @@ def test_fhe(): if __name__ == "__main__": - print("Running CUDASTF FHE example...") test_fhe() diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index 79560dd25cf..969cbe08668 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -146,5 +146,4 @@ def test_fhe_decorator(): if __name__ == "__main__": - print("Running CUDASTF FHE decorator example...") test_fhe_decorator() diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index a2d30a7eb38..c0ca54375e7 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -41,10 +41,7 @@ def test_numba_graph(): # Verify results after finalize (data written back to host) # Expected: scale(2.0, 1.0) = 2.0 - if np.allclose(X, 2.0): - print("✅ Graph test: X values correct: all 2.0") - else: - print(f"❌ Graph test: X values incorrect: expected 2.0, got {X[:5]}...") + assert np.allclose(X, 2.0) def test_numba(): @@ -66,7 +63,6 @@ def test_numba(): with ctx.task(lX.read(), lY.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - print(nb_stream) dX = t.get_arg_numba(0) dY = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dY) @@ -84,44 +80,13 @@ def test_numba(): ctx.finalize() # Verify results after finalize (data 
written back to host) - print("Verifying results after finalize:") - # Expected values: # X: scale(2.0, 1.0) = 2.0 # Y: axpy(2.0, X=2.0, Y=1.0) = 2.0*2.0 + 1.0 = 5.0 # Z: axpy(2.0, X=2.0, Z=1.0) = 5.0, then axpy(2.0, Y=5.0, Z=5.0) = 15.0 - expected_X = 2.0 - expected_Y = 5.0 - expected_Z = 15.0 - - # Check X values - if np.allclose(X, expected_X, rtol=1e-6, atol=1e-6): - print(f"✅ X values correct: all {expected_X}") - else: - actual_x = X[0] if len(X) > 0 else "N/A" - print( - f"❌ X values incorrect: expected {expected_X}, got {actual_x} (diff: {abs(actual_x - expected_X):.2e})" - ) - - # Check Y values - if np.allclose(Y, expected_Y, rtol=1e-6, atol=1e-6): - print(f"✅ Y values correct: all {expected_Y}") - else: - actual_y = Y[0] if len(Y) > 0 else "N/A" - print( - f"❌ Y values incorrect: expected {expected_Y}, got {actual_y} (diff: {abs(actual_y - expected_Y):.2e})" - ) - - # Check Z values - if np.allclose(Z, expected_Z, rtol=1e-6, atol=1e-6): - print(f"✅ Z values correct: all {expected_Z}") - else: - actual_z = Z[0] if len(Z) > 0 else "N/A" - print( - f"❌ Z values incorrect: expected {expected_Z}, got {actual_z} (diff: {abs(actual_z - expected_Z):.2e})" - ) - - print(f"Sample values: X[0]={X[0]}, Y[0]={Y[0]}, Z[0]={Z[0]}") + assert np.allclose(X, 2.0) + assert np.allclose(Y, 5.0) + assert np.allclose(Z, 15.0) @cuda.jit @@ -196,8 +161,7 @@ def test_numba2d(): u_out_ref[:, -1] = u[:, -1] # compare with the GPU result - max_abs_diff = np.abs(u_out - u_out_ref).max() - print(f"max(|gpu - ref|) = {max_abs_diff:.3e}") + assert np.allclose(u_out, u_out_ref, rtol=1e-6, atol=1e-6) def test_numba_exec_place(): @@ -218,7 +182,6 @@ def test_numba_exec_place(): with ctx.task(stf.exec_place.device(0), lX.read(), lY.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - print(nb_stream) dX = t.get_arg_numba(0) dY = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dY) @@ -261,7 +224,6 @@ def test_numba_places(): with ctx.task(lX.read(), lY.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - print(nb_stream) dX = t.get_arg_numba(0) dY = t.get_arg_numba(1) axpy[32, 64, nb_stream](2.0, dX, dY) @@ -280,6 +242,5 @@ def test_numba_places(): if __name__ == "__main__": - print("Running CUDASTF examples...") test_numba_graph() # test_numba() diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py index c4b337e801d..02a7bc1c1b3 100644 --- a/python/cuda_cccl/tests/stf/test_pytorch.py +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -58,40 +58,13 @@ def test_pytorch(): ctx.finalize() # Verify results on host after finalize - print("Verifying results...") - # Expected values: # X: 1.0 -> 2.0 (multiplied by 2) # Y: 1.0 -> 4.0 (X * 2 = 2.0 * 2 = 4.0) # Z: 1.0 -> 9.0 (X * 4 + 1 = 2.0 * 4 + 1 = 9.0) -> 5.0 (Y * 2 - 3 = 4.0 * 2 - 3 = 5.0) - - expected_X = 2.0 - expected_Y = 4.0 - expected_Z = 5.0 - - # Check a few values to verify correctness - assert np.allclose(X[:10], expected_X), ( - f"X mismatch: got {X[:10]}, expected {expected_X}" - ) - assert np.allclose(Y[:10], expected_Y), ( - f"Y mismatch: got {Y[:10]}, expected {expected_Y}" - ) - assert np.allclose(Z[:10], expected_Z), ( - f"Z mismatch: got {Z[:10]}, expected {expected_Z}" - ) - - # Check entire arrays - assert np.all(X == expected_X), ( - f"X array not uniform: min={X.min()}, max={X.max()}, expected={expected_X}" - ) - assert np.all(Y == expected_Y), ( - f"Y array not uniform: min={Y.min()}, max={Y.max()}, expected={expected_Y}" - ) - assert np.all(Z == expected_Z), ( - f"Z array not 
uniform: min={Z.min()}, max={Z.max()}, expected={expected_Z}" - ) - - print(f"✅ All checks passed! X={X[0]}, Y={Y[0]}, Z={Z[0]}") + assert np.allclose(X, 2.0) + assert np.allclose(Y, 4.0) + assert np.allclose(Z, 5.0) def test_pytorch_task(): @@ -132,42 +105,14 @@ def test_pytorch_task(): ctx.finalize() # Verify results on host after finalize (same as original test) - print("Verifying pytorch_task results...") - # Expected values: # X: 1.0 -> 2.0 (multiplied by 2) # Y: 1.0 -> 4.0 (X * 2 = 2.0 * 2 = 4.0) # Z: 1.0 -> 9.0 (X * 4 + 1 = 2.0 * 4 + 1 = 9.0) -> 5.0 (Y * 2 - 3 = 4.0 * 2 - 3 = 5.0) - - expected_X = 2.0 - expected_Y = 4.0 - expected_Z = 5.0 - - # Check a few values to verify correctness - assert np.allclose(X[:10], expected_X), ( - f"X mismatch: got {X[:10]}, expected {expected_X}" - ) - assert np.allclose(Y[:10], expected_Y), ( - f"Y mismatch: got {Y[:10]}, expected {expected_Y}" - ) - assert np.allclose(Z[:10], expected_Z), ( - f"Z mismatch: got {Z[:10]}, expected {expected_Z}" - ) - - # Check entire arrays - assert np.all(X == expected_X), ( - f"X array not uniform: min={X.min()}, max={X.max()}, expected={expected_X}" - ) - assert np.all(Y == expected_Y), ( - f"Y array not uniform: min={Y.min()}, max={Y.max()}, expected={expected_Y}" - ) - assert np.all(Z == expected_Z), ( - f"Z array not uniform: min={Z.min()}, max={Z.max()}, expected={expected_Z}" - ) - - print(f"✅ All pytorch_task checks passed! X={X[0]}, Y={Y[0]}, Z={Z[0]}") + assert np.allclose(X, 2.0) + assert np.allclose(Y, 4.0) + assert np.allclose(Z, 5.0) if __name__ == "__main__": - print("Running CUDASTF examples...") test_pytorch() diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py index 07fa270ee17..e8571edeae3 100644 --- a/python/cuda_cccl/tests/stf/test_stencil_decorator.py +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -76,10 +76,8 @@ def test_numba2d(): u_out_ref[:, -1] = u[:, -1] # compare with the GPU result - max_abs_diff = np.abs(u_out - u_out_ref).max() - print(f"max(|gpu - ref|) = {max_abs_diff:.3e}") + assert np.allclose(u_out, u_out_ref, rtol=1e-6, atol=1e-6) if __name__ == "__main__": - print("Running CUDASTF stencil decorator example...") test_numba2d() diff --git a/python/cuda_cccl/tests/stf/test_token.py b/python/cuda_cccl/tests/stf/test_token.py index 04ceb920b6a..abadab8305d 100644 --- a/python/cuda_cccl/tests/stf/test_token.py +++ b/python/cuda_cccl/tests/stf/test_token.py @@ -56,7 +56,6 @@ def test_numba_token(): with ctx.task(lX.read(), lY.rw(), token.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - print(nb_stream) dX = t.get_arg_numba(0) dY = t.get_arg_numba(1) axpy[blocks, threads_per_block, nb_stream](2.0, dX, dY) @@ -76,10 +75,7 @@ def test_numba_token(): assert np.allclose(Y, 5.0), ( f"Y should be 5.0 after two axpy operations, but got {Y[0]}" ) - print(f"✓ X = {X[0]} (expected 1.0)") - print(f"✓ Y = {Y[0]} (expected 5.0)") if __name__ == "__main__": - print("Running CUDASTF examples...") test_token() From d90ed649bde1b1377e951c1cc1c884f195e2c6ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 13:41:18 +0100 Subject: [PATCH 206/221] cmake fix --- python/cuda_cccl/CMakeLists.txt | 6 ------ 1 file changed, 6 deletions(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 0e4625b9f25..ae855cf60d4 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -18,12 +18,6 @@ message( # Build 
cccl.c.parallel and add CCCL's install rules set(_cccl_root ../..) -include(${_cccl_root}/cmake/AppendOptionIfAvailable.cmake) -include(${_cccl_root}/cmake/CCCLConfigureTarget.cmake) -include(${_cccl_root}/cmake/CCCLBuildCompilerTargets.cmake) -include(${_cccl_root}/cmake/CCCLGetDependencies.cmake) -cccl_build_compiler_targets() - # Build and install C++ library first set(CCCL_TOPLEVEL_PROJECT ON) # Enable the developer builds set(CCCL_ENABLE_C_PARALLEL ON) From eb77519bcfa33cbb966d9cad8686034b7e7ac031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 14:13:37 +0100 Subject: [PATCH 207/221] Cmake fixes (need extra cleanup) --- python/cuda_cccl/CMakeLists.txt | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index ae855cf60d4..ebabc475330 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -22,9 +22,19 @@ set(_cccl_root ../..) set(CCCL_TOPLEVEL_PROJECT ON) # Enable the developer builds set(CCCL_ENABLE_C_PARALLEL ON) set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) # Enable C experimental STF library (triggers c/ directory) -set(CCCL_C_PARALLEL_ENABLE_TESTING OFF) # Testing belongs in CI, not Python build -set(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING OFF) # Testing belongs in CI, not Python build set(CCCL_ENABLE_UNSTABLE ON) # Enable unstable features + +# Disable all testing, examples, and benchmarks - we only want the libraries +set(CCCL_ENABLE_TESTING OFF) +set(CCCL_ENABLE_EXAMPLES OFF) +set(CCCL_ENABLE_BENCHMARKS OFF) +set(CCCL_C_PARALLEL_ENABLE_TESTING OFF) +set(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING OFF) +# Note: CCCL_ENABLE_CUDAX must be ON because STF depends on it (via CCCL_ENABLE_UNSTABLE) +# But disable cudax tests, examples, and header testing +set(cudax_ENABLE_TESTING OFF) +set(cudax_ENABLE_EXAMPLES OFF) +set(cudax_ENABLE_HEADER_TESTING OFF) set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) @@ -42,6 +52,11 @@ add_subdirectory(${_cccl_root} _parent_cccl) set(CMAKE_INSTALL_LIBDIR "${old_libdir}") # pop set(CMAKE_INSTALL_INCLUDEDIR "${old_includedir}") # pop +# Create CCCL::cudax alias for STF (normally created by cccl-config.cmake) +if (TARGET cudax::cudax AND NOT TARGET CCCL::cudax) + add_library(CCCL::cudax ALIAS cudax::cudax) +endif() + # ensure the destination directory exists file(MAKE_DIRECTORY "cuda/stf/${CUDA_VERSION_DIR}/cccl") file(MAKE_DIRECTORY "cuda/compute/${CUDA_VERSION_DIR}/cccl") From b38ff802e4cc6cf15ac79dfa0518e42f8b8ef69d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 15:58:20 +0100 Subject: [PATCH 208/221] Work-around for lazy resource init during graph capture in cuda core --- python/cuda_cccl/tests/stf/test_decorator.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index ce8fad1d69b..2b4552e386b 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -4,6 +4,7 @@ from numba import cuda import cuda.stf as stf +from cuda.core.experimental import Device numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 @@ -26,6 +27,10 @@ def scale(a, x): def test_decorator(use_graph): X, Y, Z = (np.ones(16, np.float32) for _ in range(3)) + # XXX Work-around to force the initialization of CUDA devices in cuda.core and + # 
avoid lazy resource init during graph capture. + Device().set_current() + ctx = stf.context(use_graph=use_graph) lX = ctx.logical_data(X) lY = ctx.logical_data(Y) From 0a3e6671bc52837ce2b0a1e367d403e0ae4166d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 16:41:04 +0100 Subject: [PATCH 209/221] Use a relaxed capture mode --- cudax/include/cuda/experimental/__stf/graph/graph_task.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh index b3a24b8fbdd..a070fc446ff 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh @@ -104,7 +104,7 @@ public: { // Select a stream from the pool capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; - cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal)); + cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeRelaxed)); } auto& dot = *ctx.get_dot(); @@ -365,7 +365,7 @@ public: capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; cudaGraph_t childGraph = nullptr; - cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal)); + cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeRelaxed)); // Launch the user provided function f(capture_stream); @@ -625,7 +625,7 @@ public: cudaStream_t capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; cudaGraph_t childGraph = nullptr; - cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal)); + cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeRelaxed)); // Launch the user provided function if constexpr (fun_invocable_stream_deps) From 8642fdd92efdaa4d63eac3042f3f910a253d0f08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 16:46:50 +0100 Subject: [PATCH 210/221] This work-around is not needed anymore with a relaxed capture mode --- python/cuda_cccl/tests/stf/test_decorator.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py index 2b4552e386b..ce8fad1d69b 100644 --- a/python/cuda_cccl/tests/stf/test_decorator.py +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -4,7 +4,6 @@ from numba import cuda import cuda.stf as stf -from cuda.core.experimental import Device numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 @@ -27,10 +26,6 @@ def scale(a, x): def test_decorator(use_graph): X, Y, Z = (np.ones(16, np.float32) for _ in range(3)) - # XXX Work-around to force the initialization of CUDA devices in cuda.core and - # avoid lazy resource init during graph capture. 
- Device().set_current() - ctx = stf.context(use_graph=use_graph) lX = ctx.logical_data(X) lY = ctx.logical_data(Y) From 0f9865d99866561db59aa5aadf3d7f24bec3b6f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 17:27:51 +0100 Subject: [PATCH 211/221] cleanup warp example --- .../cuda_cccl/tests/stf/example_fluid_warp.py | 86 +------------------ 1 file changed, 2 insertions(+), 84 deletions(-) diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py index c1d903b9be7..ab3fd406864 100644 --- a/python/cuda_cccl/tests/stf/example_fluid_warp.py +++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py @@ -29,53 +29,13 @@ import cuda.stf as cudastf +# Add a stf-specific decorator to the wp. namespace def stf_kernel(pyfunc): # let warp decorate normally kernel = wp.kernel(pyfunc) # attach an STF-aware call operator def _stf_call(*args, dim=None, stream=None, **kwargs): - print(f"[STF TRACE] {pyfunc.__name__}") - print(f" dim={dim}, stream={stream}") - - # Enhanced arg display with logical data detection - if args: - print(" args=[") - for i, arg in enumerate(args): - # Detect if argument is or contains STF logical data - is_logical_data = False - symbol = None - - # Check if arg is directly STF logical data - if hasattr(arg, "__class__") and "logical_data" in str(type(arg)): - is_logical_data = True - if hasattr(arg, "symbol") and arg.symbol: - symbol = arg.symbol - # Check if arg has attached STF logical data (Warp array) - elif hasattr(arg, "_stf_ld"): - is_logical_data = True - if hasattr(arg._stf_ld, "symbol") and arg._stf_ld.symbol: - symbol = arg._stf_ld.symbol - # Fallback to _name for Warp arrays - elif hasattr(arg, "_name") and arg._name: - symbol = arg._name - - if is_logical_data: - if symbol: - print(f" [{i}]: '{symbol}' [logical_data]") - else: - print(f" [{i}]: logical_data") - else: - # Regular arguments (scalars, etc.) - if hasattr(arg, "shape"): # Array-like but not logical data - print(f" [{i}]: {type(arg).__name__}") - else: # Scalar value - print(f" [{i}]: {arg}") - print(" ]") - else: - print(f" args={args}") - - print(f" kwargs={kwargs}") return wp.stf.launch(kernel, dim=dim, inputs=args, stream=stream, **kwargs) # monkey-patch a method onto the kernel object @@ -85,49 +45,6 @@ def _stf_call(*args, dim=None, stream=None, **kwargs): def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): - print(f"[STF TRACE] launching kernel: {getattr(kernel, '__name__', kernel)}") - print(f" dim = {dim}") - print(f" stream = {stream}") - - # Enhanced input display with logical data detection - if inputs: - print(" inputs = [") - for i, inp in enumerate(inputs): - # Detect if input is or contains STF logical data - is_logical_data = False - symbol = None - - # Check if inp is directly STF logical data - if hasattr(inp, "__class__") and "logical_data" in str(type(inp)): - is_logical_data = True - if hasattr(inp, "symbol") and inp.symbol: - symbol = inp.symbol - # Check if inp has attached STF logical data (Warp array) - elif hasattr(inp, "_stf_ld"): - is_logical_data = True - if hasattr(inp._stf_ld, "symbol") and inp._stf_ld.symbol: - symbol = inp._stf_ld.symbol - # Fallback to _name for Warp arrays - elif hasattr(inp, "_name") and inp._name: - symbol = inp._name - - if is_logical_data: - if symbol: - print(f" [{i}]: '{symbol}' [logical_data]") - else: - print(f" [{i}]: logical_data") - else: - # Regular arguments (scalars, etc.) 
- if hasattr(inp, "shape"): # Array-like but not logical data - print(f" [{i}]: {type(inp).__name__}") - else: # Scalar value - print(f" [{i}]: {inp}") - print(" ]") - else: - print(f" inputs = {inputs}") - - print(f" kwargs = {kwargs}") - # just forward to warp for now return wp.launch( kernel, @@ -387,6 +304,7 @@ def step(self): self.p0.zero_() self.p1.zero_() + # TODO experiment with explicit capture at Warp level # if self.use_cuda_graph: # wp.capture_launch(self.graph) # else: From 6466347ed68b0e369348029424d82e41018c429f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 25 Nov 2025 17:41:10 +0100 Subject: [PATCH 212/221] Cleanups in the cython code for STF --- python/cuda_cccl/cuda/stf/_stf_bindings.py | 22 ++++++++++++-- .../cuda_cccl/cuda/stf/_stf_bindings_impl.pyx | 29 +++++-------------- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings.py b/python/cuda_cccl/cuda/stf/_stf_bindings.py index c61e908fc8d..169490739ee 100644 --- a/python/cuda_cccl/cuda/stf/_stf_bindings.py +++ b/python/cuda_cccl/cuda/stf/_stf_bindings.py @@ -27,9 +27,26 @@ def _load_cuda_libraries(): - # Load appropriate libraries for the detected CUDA version + """ + Preload CUDA libraries to ensure proper symbol resolution. + + These libraries are indirect dependencies pulled in via cccl.c.parallel. + Preloading ensures reliable symbol resolution regardless of dynamic linker behavior. + """ + import warnings + for libname in ("nvrtc", "nvJitLink"): - load_nvidia_dynamic_lib(libname) + try: + load_nvidia_dynamic_lib(libname) + except Exception as e: + # Log warning but don't fail - the extension might still work + # if the libraries are already loaded or available through other means + warnings.warn( + f"Failed to preload CUDA library '{libname}': {e}. " + f"STF bindings may fail to load if {libname} is not available.", + RuntimeWarning, + stacklevel=2, + ) _load_cuda_libraries() @@ -53,4 +70,5 @@ def _load_cuda_libraries(): except ImportError as e: raise ImportError( f"Failed to import CUDA STF bindings for CUDA {cuda_version}. 
" + f"Ensure cuda-cccl is properly installed with: pip install cuda-cccl[cu{cuda_version}]" ) from e diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx index 00b8dd39ed5..afc2f431192 100644 --- a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx @@ -6,24 +6,19 @@ # Make sure to update PYI with change to Python API to ensure that Python # static type checker tools like mypy green-lights cuda.cccl.parallel -from cpython.buffer cimport Py_buffer, PyObject_GetBuffer, PyBuffer_Release -from cpython.buffer cimport Py_buffer, PyBUF_FORMAT, PyBUF_ND, PyObject_GetBuffer, PyBuffer_Release -from cpython.bytes cimport PyBytes_FromStringAndSize -from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t -from libc.stdint cimport uintptr_t -from libc.string cimport memset, memcpy -import math # for math.prod - -# TODO remove that dependency -import numpy as np - from cpython.buffer cimport ( - Py_buffer, PyBUF_SIMPLE, PyBUF_ANY_CONTIGUOUS, - PyBuffer_Release, PyObject_CheckBuffer, PyObject_GetBuffer + Py_buffer, PyBUF_FORMAT, PyBUF_ND, PyBUF_SIMPLE, PyBUF_ANY_CONTIGUOUS, + PyObject_GetBuffer, PyBuffer_Release, PyObject_CheckBuffer ) +from cpython.bytes cimport PyBytes_FromStringAndSize from cpython.pycapsule cimport ( PyCapsule_CheckExact, PyCapsule_IsValid, PyCapsule_GetPointer ) +from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t +from libc.string cimport memset, memcpy +import math # for math.prod + +import numpy as np import ctypes from enum import IntFlag @@ -38,9 +33,6 @@ cdef extern from "": ctypedef OpaqueCUkernel_st *CUkernel ctypedef OpaqueCUlibrary_st *CUlibrary -#typedef struct CUstream_st* cudaStream_t; - - cdef extern from "cccl/c/experimental/stf/stf.h": # # Contexts @@ -132,7 +124,6 @@ cdef extern from "cccl/c/experimental/stf/stf.h": void stf_task_end(stf_task_handle t) void stf_task_enable_capture(stf_task_handle t) CUstream stf_task_get_custream(stf_task_handle t) - # cudaStream_t stf_task_get_stream(stf_task_handle t) void* stf_task_get(stf_task_handle t, int submitted_index) void stf_task_destroy(stf_task_handle t) @@ -222,8 +213,6 @@ cdef class logical_data: # Unknown vector type - treat as original self._shape = original_shape self._dtype = np.dtype(typestr) - - print(f"STF: Automatically flattened vector type {typestr} -> {self._dtype} with shape {self._shape}") else: # Regular scalar type self._shape = original_shape @@ -472,7 +461,6 @@ cdef class task: def __dealloc__(self): if self._t != NULL: stf_task_destroy(self._t) -# self._lds_args.clear() def start(self): # This is ignored if this is not a graph task @@ -648,7 +636,6 @@ cdef class context: raise RuntimeError("cannot call borrow_from_handle on this context") self._ctx = ctx_handle - # print(f"borrowing ... 
new ctx handle = {ctx_handle} self={self}") def __repr__(self): return f"context(handle={self._ctx}, borrowed={self._borrowed})" From cfb2930ca2118d1ca80e5ead273bbb36423fa269 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 26 Nov 2025 08:28:05 +0100 Subject: [PATCH 213/221] no need for math.prod for such a simple thing --- python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx index afc2f431192..4938dc2b970 100644 --- a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx @@ -16,7 +16,6 @@ from cpython.pycapsule cimport ( ) from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t from libc.string cimport memset, memcpy -import math # for math.prod import numpy as np @@ -333,7 +332,10 @@ cdef class logical_data: out._dtype = np.dtype(dtype) out._shape = shape out._ndim = len(shape) - out._len = math.prod(shape) * out._dtype.itemsize + cdef size_t total_items = 1 + for dim in shape: + total_items *= dim + out._len = total_items * out._dtype.itemsize out._symbol = None # New object has no symbol initially out._is_token = False stf_logical_data_empty(ctx._ctx, out._len, &out._ld) From 130ee2a5cfec2469be494501d15610d0df6abdbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 26 Nov 2025 08:33:39 +0100 Subject: [PATCH 214/221] Simpler code to handle vector types --- .../cuda_cccl/cuda/stf/_stf_bindings_impl.pyx | 27 +++++++------------ 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx index 4938dc2b970..31c499ecbe4 100644 --- a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx @@ -194,28 +194,21 @@ cdef class logical_data: original_shape = cai['shape'] typestr = cai['typestr'] - # Handle vector types automatically (e.g., wp.vec2, wp.vec3) - # STF treats these as flat scalar arrays with an additional dimension - if typestr.startswith('|V'): # Vector type (e.g., '|V8' for vec2, '|V12' for vec3) - vector_size = int(typestr[2:]) # Extract size from '|V8' -> 8 bytes - - if vector_size == 8: # vec2 (2 * 4 bytes float32) - self._shape = original_shape + (2,) - self._dtype = np.dtype(' Date: Wed, 26 Nov 2025 08:38:23 +0100 Subject: [PATCH 215/221] fix grid dimension --- python/cuda_cccl/tests/stf/test_numba.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py index c0ca54375e7..bd818e13894 100644 --- a/python/cuda_cccl/tests/stf/test_numba.py +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -55,27 +55,29 @@ def test_numba(): lY = ctx.logical_data(Y) lZ = ctx.logical_data(Z) + threads_per_block = 256 + blocks = (n + threads_per_block - 1) // threads_per_block + with ctx.task(lX.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.numba_arguments() - # dX = cuda.from_cuda_array_interface(t.get_arg_cai(0), owner=None, sync=False) - scale[32, 64, nb_stream](2.0, dX) + scale[blocks, threads_per_block, nb_stream](2.0, dX) with ctx.task(lX.read(), lY.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX = t.get_arg_numba(0) dY = t.get_arg_numba(1) - axpy[32, 64, nb_stream](2.0, dX, dY) + axpy[blocks, threads_per_block, 
nb_stream](2.0, dX, dY) with ctx.task(lX.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dX, dZ = t.numba_arguments() - axpy[32, 64, nb_stream](2.0, dX, dZ) + axpy[blocks, threads_per_block, nb_stream](2.0, dX, dZ) with ctx.task(lY.read(), lZ.rw()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) dY, dZ = t.numba_arguments() - axpy[32, 64, nb_stream](2.0, dY, dZ) + axpy[blocks, threads_per_block, nb_stream](2.0, dY, dZ) ctx.finalize() From b8c745ea1db7f842b479ad91263388d9488c7809 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 26 Nov 2025 08:39:59 +0100 Subject: [PATCH 216/221] Use from_dlpack --- python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py b/python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py index 945597fb360..0ec74de3ed7 100644 --- a/python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py +++ b/python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py @@ -18,7 +18,7 @@ def cai_to_torch(cai: dict): from numba import cuda as _cuda dev_array = _cuda.from_cuda_array_interface(cai, owner=None, sync=False) - return torch.utils.dlpack.from_dlpack(dev_array.to_dlpack()) + return torch.from_dlpack(dev_array) except Exception: pass @@ -31,7 +31,7 @@ def __init__(self, d): self.__cuda_array_interface__ = d cp_arr = cp.asarray(_cai_wrapper(cai)) - return torch.utils.dlpack.from_dlpack(cp_arr.toDlpack()) + return torch.from_dlpack(cp_arr) except Exception as e: raise RuntimeError( "Could not convert __cuda_array_interface__ to torch.Tensor. " From fb2a3baad84202a45010f06369b640029a8210b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 26 Nov 2025 10:09:01 +0100 Subject: [PATCH 217/221] Change the mock-up FHE toy example to have operations that are homomorphic, not XOR --- python/cuda_cccl/tests/stf/test_fhe.py | 98 ++++++++----------- .../cuda_cccl/tests/stf/test_fhe_decorator.py | 89 ++++++++--------- 2 files changed, 86 insertions(+), 101 deletions(-) diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py index e613d37ea76..b2bb9961b84 100644 --- a/python/cuda_cccl/tests/stf/test_fhe.py +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# A toy example to illustrate how we can compose logical operations +# Toy Fully Homomorphic Encryption (FHE) example with addition encryption import numba from numba import cuda @@ -13,9 +13,9 @@ class Plaintext: - # Initialize from actual values, or from a logical data - def __init__(self, ctx, values=None, ld=None): + def __init__(self, ctx, values=None, ld=None, key=0x42): self.ctx = ctx + self.key = key if ld is not None: self.l = ld if values is not None: @@ -28,8 +28,8 @@ def set_symbol(self, symbol: str): self.symbol = symbol def encrypt(self) -> "Ciphertext": - encrypted = bytearray([c ^ 0x42 for c in self.values]) # toy XOR - return Ciphertext(self.ctx, values=encrypted) + encrypted = bytearray([(c + self.key) & 0xFF for c in self.values]) + return Ciphertext(self.ctx, values=encrypted, key=self.key) def print_values(self): with ctx.task( @@ -42,36 +42,30 @@ def print_values(self): @cuda.jit -def and_kernel(a, b, out): +def add_kernel(a, b, out): i = cuda.grid(1) if i < out.size: - out[i] = a[i] & b[i] + out[i] = (a[i] + b[i]) & 0xFF @cuda.jit -def or_kernel(a, b, out): +def sub_kernel(a, b, out): i = cuda.grid(1) if i < out.size: - out[i] = 
a[i] | b[i] + out[i] = (a[i] - b[i]) & 0xFF @cuda.jit -def not_kernel(a, out): +def sub_scalar_kernel(a, out, v): i = cuda.grid(1) if i < out.size: - out[i] = ~a[i] - - -@cuda.jit -def xor_kernel(a, out, v): - i = cuda.grid(1) - if i < out.size: - out[i] = a[i] ^ v + out[i] = (a[i] - v) & 0xFF class Ciphertext: - def __init__(self, ctx, values=None, ld=None): + def __init__(self, ctx, values=None, ld=None, key=0x42): self.ctx = ctx + self.key = key if ld is not None: self.l = ld if values is not None: @@ -79,72 +73,52 @@ def __init__(self, ctx, values=None, ld=None): self.l = ctx.logical_data(self.values) self.symbol = None - # ~ operator - def __invert__(self): - result = self.like_empty() - - with ctx.task(self.l.read(), result.l.write()) as t: - nb_stream = cuda.external_stream(t.stream_ptr()) - da, dresult = t.numba_arguments() - not_kernel[32, 16, nb_stream](da, dresult) - - return result - - # | operator - def __or__(self, other): + def __add__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result = self.like_empty() - with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) da, db, dresult = t.numba_arguments() - or_kernel[32, 16, nb_stream](da, db, dresult) - + add_kernel[32, 16, nb_stream](da, db, dresult) return result - # & operator - def __and__(self, other): + def __sub__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result = self.like_empty() - with ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) - nb_stream.synchronize() da, db, dresult = t.numba_arguments() - and_kernel[32, 16, nb_stream](da, db, dresult) - + sub_kernel[32, 16, nb_stream](da, db, dresult) return result def set_symbol(self, symbol: str): self.l.set_symbol(symbol) self.symbol = symbol - def decrypt(self): + def decrypt(self, num_operands=2): + """Decrypt by subtracting num_operands * key""" result = self.like_empty() - + total_key = (num_operands * self.key) & 0xFF with ctx.task(self.l.read(), result.l.write()) as t: nb_stream = cuda.external_stream(t.stream_ptr()) da, dresult = t.numba_arguments() - # reverse the toy XOR "encryption" - xor_kernel[32, 16, nb_stream](da, dresult, 0x42) - - return Plaintext(self.ctx, ld=result.l) + sub_scalar_kernel[32, 16, nb_stream](da, dresult, total_key) + return Plaintext(self.ctx, ld=result.l, key=self.key) def like_empty(self): return Ciphertext(self.ctx, ld=self.l.like_empty()) -def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: - return ~((eA | ~eB) & (~eA | eB)) +def circuit(a, b): + """Circuit: (A + B) + (B - A) = 2*B""" + return (a + b) + (b - a) def test_fhe(): - """Test Fully Homomorphic Encryption (FHE) example with logical operations.""" - global ctx # Make ctx accessible to the classes + """Test FHE using manual task creation with addition encryption.""" + global ctx ctx = stf.context(use_graph=False) vA = [3, 3, 2, 2, 17] @@ -155,13 +129,27 @@ def test_fhe(): pB = Plaintext(ctx, vB) pB.set_symbol("B") + expected = [circuit(a, b) & 0xFF for a, b in zip(vA, vB)] + eA = pA.encrypt() eB = pB.encrypt() - out = circuit(eA, eB) + encrypted_out = circuit(eA, eB) + decrypted_out = encrypted_out.decrypt(num_operands=2) + + with ctx.task( + stf.exec_place.host(), decrypted_out.l.read(stf.data_place.managed()) + ) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + nb_stream.synchronize() + hvalues = t.numba_arguments() + actual = [int(v) for v in hvalues] - 
out.decrypt().print_values() ctx.finalize() + assert actual == expected, ( + f"Decrypted result {actual} doesn't match expected {expected}" + ) + if __name__ == "__main__": test_fhe() diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py index 969cbe08668..980f7735ddc 100644 --- a/python/cuda_cccl/tests/stf/test_fhe_decorator.py +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# A toy example to illustrate how we can compose logical operations +# Toy Fully Homomorphic Encryption (FHE) example with addition encryption import numba from numba import cuda @@ -13,9 +13,9 @@ class Plaintext: - # Initialize from actual values, or from a logical data - def __init__(self, ctx, values=None, ld=None): + def __init__(self, ctx, values=None, ld=None, key=0x42): self.ctx = ctx + self.key = key if ld is not None: self.l = ld if values is not None: @@ -28,8 +28,8 @@ def set_symbol(self, symbol: str): self.symbol = symbol def encrypt(self) -> "Ciphertext": - encrypted = bytearray([c ^ 0x42 for c in self.values]) # toy XOR - return Ciphertext(self.ctx, values=encrypted) + encrypted = bytearray([(c + self.key) & 0xFF for c in self.values]) + return Ciphertext(self.ctx, values=encrypted, key=self.key) def print_values(self): with ctx.task( @@ -42,36 +42,30 @@ def print_values(self): @cudastf.jit -def and_kernel(a, b, out): +def add_kernel(a, b, out): i = cuda.grid(1) if i < out.size: - out[i] = a[i] & b[i] + out[i] = (a[i] + b[i]) & 0xFF @cudastf.jit -def or_kernel(a, b, out): +def sub_kernel(a, b, out): i = cuda.grid(1) if i < out.size: - out[i] = a[i] | b[i] + out[i] = (a[i] - b[i]) & 0xFF @cudastf.jit -def not_kernel(a, out): +def sub_scalar_kernel(a, out, v): i = cuda.grid(1) if i < out.size: - out[i] = ~a[i] - - -@cudastf.jit -def xor_kernel(a, out, v): - i = cuda.grid(1) - if i < out.size: - out[i] = a[i] ^ v + out[i] = (a[i] - v) & 0xFF class Ciphertext: - def __init__(self, ctx, values=None, ld=None): + def __init__(self, ctx, values=None, ld=None, key=0x42): self.ctx = ctx + self.key = key if ld is not None: self.l = ld if values is not None: @@ -79,54 +73,43 @@ def __init__(self, ctx, values=None, ld=None): self.l = ctx.logical_data(self.values) self.symbol = None - # ~ operator - def __invert__(self): - result = self.like_empty() - not_kernel[32, 16](self.l.read(), result.l.write()) - - return result - - # | operator - def __or__(self, other): + def __add__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result = self.like_empty() - or_kernel[32, 16](self.l.read(), other.l.read(), result.l.write()) - + add_kernel[32, 16](self.l.read(), other.l.read(), result.l.write()) return result - # & operator - def __and__(self, other): + def __sub__(self, other): if not isinstance(other, Ciphertext): return NotImplemented - result = self.like_empty() - and_kernel[32, 16](self.l.read(), other.l.read(), result.l.write()) - + sub_kernel[32, 16](self.l.read(), other.l.read(), result.l.write()) return result def set_symbol(self, symbol: str): self.l.set_symbol(symbol) self.symbol = symbol - def decrypt(self): + def decrypt(self, num_operands=2): + """Decrypt by subtracting num_operands * key""" result = self.like_empty() - xor_kernel[32, 16](self.l.read(), result.l.write(), 0x42) - - return Plaintext(self.ctx, ld=result.l) + total_key = (num_operands * self.key) & 0xFF + sub_scalar_kernel[32, 16](self.l.read(), result.l.write(), total_key) + 
return Plaintext(self.ctx, ld=result.l, key=self.key) def like_empty(self): return Ciphertext(self.ctx, ld=self.l.like_empty()) -def circuit(eA: Ciphertext, eB: Ciphertext) -> Ciphertext: - return ~((eA | ~eB) & (~eA | eB)) +def circuit(a, b): + """Circuit: (A + B) + (B - A) = 2*B""" + return (a + b) + (b - a) def test_fhe_decorator(): - """Test Fully Homomorphic Encryption (FHE) example using @cudastf.jit decorators.""" - global ctx # Make ctx accessible to the classes + """Test FHE using @cudastf.jit decorators with addition encryption.""" + global ctx ctx = cudastf.context(use_graph=False) vA = [3, 3, 2, 2, 17] @@ -137,13 +120,27 @@ def test_fhe_decorator(): pB = Plaintext(ctx, vB) pB.set_symbol("B") + expected = [circuit(a, b) & 0xFF for a, b in zip(vA, vB)] + eA = pA.encrypt() eB = pB.encrypt() - out = circuit(eA, eB) + encrypted_out = circuit(eA, eB) + decrypted_out = encrypted_out.decrypt(num_operands=2) + + with ctx.task( + cudastf.exec_place.host(), decrypted_out.l.read(cudastf.data_place.managed()) + ) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + nb_stream.synchronize() + hvalues = t.numba_arguments() + actual = [int(v) for v in hvalues] - out.decrypt().print_values() ctx.finalize() + assert actual == expected, ( + f"Decrypted result {actual} doesn't match expected {expected}" + ) + if __name__ == "__main__": test_fhe_decorator() From da2e1aa7c6b5fd4a3b9ce6296648147df837c685 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 26 Nov 2025 10:16:22 +0100 Subject: [PATCH 218/221] Add some explanation for the use of a relaxed capture mode --- cudax/include/cuda/experimental/__stf/graph/graph_task.cuh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh index a070fc446ff..ee8c5435822 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh @@ -104,6 +104,8 @@ public: { // Select a stream from the pool capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; + // Use relaxed capture mode to allow capturing workloads that lazily initialize + // resources (e.g., set up memory pools) cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeRelaxed)); } @@ -365,6 +367,8 @@ public: capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; cudaGraph_t childGraph = nullptr; + // Use relaxed capture mode to allow capturing workloads that lazily initialize + // resources (e.g., set up memory pools) cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeRelaxed)); // Launch the user provided function @@ -625,6 +629,8 @@ public: cudaStream_t capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream; cudaGraph_t childGraph = nullptr; + // Use relaxed capture mode to allow capturing workloads that lazily initialize + // resources (e.g., set up memory pools) cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeRelaxed)); // Launch the user provided function From 852b4005c9bbc006744a63e24d4687d6c55d4a2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 26 Nov 2025 12:52:34 +0100 Subject: [PATCH 219/221] cleaner pytorch adapter --- .../cuda/stf/_adapters/torch_bridge.py | 33 +++---------------- 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py 
b/python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py index 0ec74de3ed7..0e7686ea363 100644 --- a/python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py +++ b/python/cuda_cccl/cuda/stf/_adapters/torch_bridge.py @@ -6,34 +6,11 @@ def cai_to_torch(cai: dict): Convert a __cuda_array_interface__ dict to a torch.Tensor without making PyTorch a hard dependency of the core extension. - Strategy (in order): - 1) Try Numba -> DLPack -> torch (fast & common). - 2) Try CuPy -> DLPack -> torch (common on CUDA setups). - 3) Otherwise, error with a clear message. + Uses Numba (a required dependency) to create a DeviceNDArray, + which torch.as_tensor can consume directly via __cuda_array_interface__. """ import torch + from numba import cuda as _cuda - # 1) Numba bridge - try: - from numba import cuda as _cuda - - dev_array = _cuda.from_cuda_array_interface(cai, owner=None, sync=False) - return torch.from_dlpack(dev_array) - except Exception: - pass - - # 2) CuPy bridge - try: - import cupy as cp - - class _cai_wrapper: - def __init__(self, d): - self.__cuda_array_interface__ = d - - cp_arr = cp.asarray(_cai_wrapper(cai)) - return torch.from_dlpack(cp_arr) - except Exception as e: - raise RuntimeError( - "Could not convert __cuda_array_interface__ to torch.Tensor. " - "Install numba or cupy (or expose a DLPack capsule natively)." - ) from e + dev_array = _cuda.from_cuda_array_interface(cai, owner=None, sync=False) + return torch.as_tensor(dev_array) From 09913dc3f8f706919d7c0dda8c66fd639ef3b8e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Wed, 26 Nov 2025 13:06:55 +0100 Subject: [PATCH 220/221] Code simplification --- .../cuda_cccl/cuda/stf/_stf_bindings_impl.pyx | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx index 31c499ecbe4..c5b1e5222b1 100644 --- a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx @@ -196,19 +196,15 @@ cdef class logical_data: # Handle vector types (e.g., wp.vec2, wp.vec3) # Use structured dtype from descr if available - if typestr.startswith('|V'): - # Vector/structured type - use descr field if available - if 'descr' in cai: - self._dtype = np.dtype(cai['descr']) - self._shape = original_shape - else: - # No descr field - treat as opaque bytes - self._dtype = np.dtype(typestr) - self._shape = original_shape + if typestr.startswith('|V') and 'descr' in cai: + # Vector/structured type - use descr field + self._dtype = np.dtype(cai['descr']) else: - # Regular scalar type + # Regular scalar type or vector without descr - use typestr self._dtype = np.dtype(typestr) - self._shape = original_shape + + # Shape is always the same regardless of type + self._shape = original_shape self._ndim = len(self._shape) @@ -624,7 +620,7 @@ cdef class context: stf_ctx_create(&self._ctx) cdef borrow_from_handle(self, stf_ctx_handle ctx_handle): - if not self._ctx == NULL: + if self._ctx != NULL: raise RuntimeError("context already initialized") if not self._borrowed: From 237b2c1380aae1e9d3cd25ca8e265f9d2652a2b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= Date: Tue, 16 Dec 2025 15:36:47 +0100 Subject: [PATCH 221/221] minor fixes --- python/cuda_cccl/CMakeLists.txt | 3 +-- python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 
ebabc475330..bcfb3b084f9 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -108,7 +108,6 @@ endif() set(CYTHON_FLAGS "-3 -M -t -w \"${cuda_cccl_SOURCE_DIR}\"") string(REGEX REPLACE " " ";" CYTHON_FLAGS_LIST "${CYTHON_FLAGS}") -# Only building STF bindings - parallel bindings not needed message(STATUS "Using Cython ${CYTHON_VERSION}") set(pyx_source_file "${cuda_cccl_SOURCE_DIR}/cuda/compute/_bindings_impl.pyx") @@ -199,5 +198,5 @@ set_target_properties( PROPERTIES INSTALL_RPATH "$ORIGIN/cccl" ) -install(TARGETS _stf_bindings_impl DESTINATION cuda/stf/${CUDA_VERSION_DIR}) install(TARGETS _bindings_impl DESTINATION cuda/compute/${CUDA_VERSION_DIR}) +install(TARGETS _stf_bindings_impl DESTINATION cuda/stf/${CUDA_VERSION_DIR}) diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx index c5b1e5222b1..12f8fba3114 100644 --- a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx +++ b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx @@ -506,7 +506,7 @@ cdef class task: def get_arg_cai(self, index): ptr = self.get_arg(index) - return stf_arg_cai(ptr, self._lds_args[index].shape, self._lds_args[index].dtype, stream=0).__cuda_array_interface__ + return stf_arg_cai(ptr, self._lds_args[index].shape, self._lds_args[index].dtype, stream=self.stream_ptr()).__cuda_array_interface__ def get_arg_numba(self, index): cai = self.get_arg_cai(index)
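
With get_arg_cai() now tagging the __cuda_array_interface__ dict with the task's own stream, the simplified torch bridge can consume task arguments directly. The sketch below is illustrative only: it assumes torch and numba are installed and that an STF task `t` is already open; `task_arg_to_torch` is a hypothetical helper written for this sketch, while the two conversion calls are the same ones used in torch_bridge.py after this series.

    import torch
    from numba import cuda

    def task_arg_to_torch(t, index):
        # __cuda_array_interface__ dict produced by the Cython binding,
        # now tagged with the task's stream
        cai = t.get_arg_cai(index)
        # Zero-copy view over STF-managed device memory; the resulting
        # tensor should not be used after the task has ended.
        dev_array = cuda.from_cuda_array_interface(cai, owner=None, sync=False)
        return torch.as_tensor(dev_array)

Inside a `with ctx.task(lX.rw()) as t:` block this yields a CUDA tensor aliasing the logical data instance, without copying it back to the host.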
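
The grid-dimension fix in test_numba.py replaces the hard-coded 32 x 64 launch configuration with a grid derived from the problem size. Below is a condensed, self-contained version of that pattern, assuming the cuda.stf bindings and numba are installed as in the tests; the axpy kernel and the context/task/numba_arguments calls mirror the test, while the array size is arbitrary.

    import numpy as np
    from numba import cuda
    import cuda.stf as stf

    @cuda.jit
    def axpy(alpha, x, y):
        i = cuda.grid(1)
        if i < y.size:
            y[i] += alpha * x[i]

    n = 1 << 20
    X = np.ones(n, np.float32)
    Y = np.ones(n, np.float32)

    ctx = stf.context(use_graph=False)
    lX = ctx.logical_data(X)
    lY = ctx.logical_data(Y)

    # Size the grid from n instead of hard-coding the block count
    threads_per_block = 256
    blocks = (n + threads_per_block - 1) // threads_per_block

    with ctx.task(lX.read(), lY.rw()) as t:
        nb_stream = cuda.external_stream(t.stream_ptr())
        dX, dY = t.numba_arguments()
        axpy[blocks, threads_per_block, nb_stream](2.0, dX, dY)

    ctx.finalize()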
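
The reworked FHE toy example relies on additive masking modulo 256 commuting with the circuit's additions and subtractions: encryption adds the key once per operand, (eA + eB) carries two copies of the key while (eB - eA) carries none, so the ciphertext result of (A + B) + (B - A) = 2*B is recovered by subtracting num_operands * key. The host-only sketch below checks that identity with plain Python byte arithmetic; enc, dec and circuit_ct are local names for this sketch, and the actual tests perform the same operations inside numba CUDA kernels.

    KEY = 0x42  # same toy key as Plaintext/Ciphertext in the tests

    def enc(pt, key=KEY):
        return [(c + key) & 0xFF for c in pt]

    def dec(ct, num_operands, key=KEY):
        # The circuit output carries num_operands copies of the key
        return [(c - num_operands * key) & 0xFF for c in ct]

    def circuit_ct(ea, eb):
        # (A + B) + (B - A), element-wise on ciphertext bytes
        add = [(x + y) & 0xFF for x, y in zip(ea, eb)]
        sub = [(y - x) & 0xFF for x, y in zip(ea, eb)]
        return [(x + y) & 0xFF for x, y in zip(add, sub)]

    vA = [3, 3, 2, 2, 17]
    vB = [17, 3, 2, 2, 17]
    expected = [(2 * b) & 0xFF for b in vB]   # (a + b) + (b - a) == 2*b
    assert dec(circuit_ct(enc(vA), enc(vB)), num_operands=2) == expected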