Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
279 commits
Select commit Hold shift + click to select a range
968995b
expose start/end and task type in context
caugonnet Jul 19, 2025
8cc6a3c
Save some WIP
caugonnet Jul 19, 2025
49064ee
Save some WIP
caugonnet Jul 20, 2025
f06f72e
Start to experiment with bindings for cuda_kernel
caugonnet Jul 20, 2025
08a22c9
Save WIP: we cannot directly use the pointer to a global function and…
caugonnet Jul 21, 2025
1482bbc
add missing variable
caugonnet Jul 21, 2025
f45d8da
Add missing finalize() call
caugonnet Jul 21, 2025
8b03c29
axpy example works with cuda_kernel in C
caugonnet Jul 21, 2025
749ca3b
check result
caugonnet Jul 21, 2025
4fb7000
Add ctx_token
caugonnet Jul 21, 2025
94757f8
make cudax usable in the python dir
caugonnet Jul 22, 2025
189e083
Save WIP with python
caugonnet Jul 23, 2025
3ed26ac
fix a typo
caugonnet Jul 23, 2025
1a5038a
fixed in python for stf
caugonnet Jul 23, 2025
0c1a2ae
Add a minimalistic Ctx class
caugonnet Jul 23, 2025
2f519e8
Fix installation paths
caugonnet Jul 23, 2025
469ff33
Add a dummy STF test
caugonnet Jul 23, 2025
9acc75b
Merge branch 'main' into stf_c_api
caugonnet Jul 23, 2025
c2a8fde
logical_data bindings
caugonnet Jul 24, 2025
3a39aa5
deps
caugonnet Jul 24, 2025
4491ff3
better task api
caugonnet Jul 24, 2025
74b430c
test with context managers
caugonnet Jul 24, 2025
479c24b
context task get_stream
caugonnet Jul 24, 2025
d428651
Fix python examples
caugonnet Jul 24, 2025
d3e2059
Merge branch 'main' into stf_c_api
caugonnet Jul 24, 2025
f7c7462
fix unused var
caugonnet Jul 24, 2025
17a31b9
Add const qualifiers
caugonnet Jul 24, 2025
3830634
Merge branch 'main' into stf_c_api
caugonnet Jul 24, 2025
aaf503f
NUMBA interop
caugonnet Jul 24, 2025
f2f7dfb
pre-commit
caugonnet Jul 24, 2025
97c5f3a
pre-commit
caugonnet Jul 24, 2025
a5d669d
pre-commit and stencil test
caugonnet Jul 24, 2025
686b988
make it possible to create a graph_ctx
caugonnet Jul 25, 2025
b4688fd
implement set_exec_place for cuda_kernel and unified tasks
caugonnet Aug 2, 2025
bd474d6
Define some execution places in the C API
caugonnet Aug 2, 2025
0b6e93a
WIP: start to support execution places
caugonnet Aug 2, 2025
ff9d70a
set_exec_place should also set the data place
caugonnet Aug 2, 2025
c610c42
rename ExecPlace to exec_place
caugonnet Aug 2, 2025
f65702b
Save WIP: start to implement data places (not compiling yet)
caugonnet Aug 3, 2025
21c94a6
fix data places
caugonnet Aug 3, 2025
f863ecd
Add data places in deps
caugonnet Aug 3, 2025
c419e1d
Merge branch 'main' into stf_c_api
caugonnet Aug 3, 2025
11b6673
test with places
caugonnet Aug 3, 2025
a1bff48
Merge branch 'main' into stf_c_api
caugonnet Aug 4, 2025
e422712
fix previous merge
caugonnet Aug 4, 2025
1bb8b43
typo fix
caugonnet Aug 4, 2025
fc8d5eb
Save WIP: try to implement a new decorator for STF
Aug 4, 2025
167f6c5
fix typo
Aug 4, 2025
95104ef
Defer compilation until we know types
caugonnet Aug 5, 2025
829f1c7
Merge branch 'main' into stf_c_api
caugonnet Aug 5, 2025
920f335
Add numba-cuda as a dependency
shwina Aug 6, 2025
587f33b
Replace use of pynvjitlink patch
shwina Aug 6, 2025
9db83a2
Update pyproject.toml
shwina Aug 6, 2025
865d337
better class name
caugonnet Aug 7, 2025
09dd965
Merge branch 'stf_c_api' of github.com:caugonnet/cccl into stf_c_api
caugonnet Aug 7, 2025
6899f3e
Merge branch 'main' into stf_c_api
caugonnet Aug 7, 2025
d051732
Merge remote-tracking branch 'shwina/add-numba-cuda-dep' into stf_c_api
caugonnet Aug 7, 2025
d223960
fixes to make cudastf.jit decorator work
caugonnet Aug 7, 2025
3eeda4b
Merge branch 'main' into stf_c_api
caugonnet Aug 7, 2025
15c2db0
revert some changes
caugonnet Aug 7, 2025
011e291
support tuple configs
caugonnet Aug 7, 2025
91e9d46
new test
caugonnet Aug 7, 2025
0ce9e68
Merge branch 'main' into stf_c_api
caugonnet Aug 8, 2025
faa4cb0
Merge branch 'main' into stf_c_api
caugonnet Aug 9, 2025
8be7401
Add a new test for places (C interface)
caugonnet Aug 9, 2025
a7da255
clang-format
caugonnet Aug 9, 2025
59c791b
Merge branch 'main' into stf_c_api
caugonnet Aug 10, 2025
086318c
Merge branch 'main' into stf_c_api
caugonnet Aug 11, 2025
1207ecd
Merge branch 'main' into stf_c_api
caugonnet Aug 13, 2025
6b86916
Merge branch 'main' into stf_c_api
caugonnet Aug 23, 2025
537b3b9
Skit test if we have less than 2 devices
Aug 25, 2025
d804d1b
Save WIP for like_empty (broken)
Aug 25, 2025
ad83a63
test with and witjout graphs
Aug 25, 2025
54a2181
Merge branch 'main' into stf_c_api
caugonnet Aug 25, 2025
f74c1d4
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Aug 26, 2025
95c88a3
remove unit test
caugonnet Aug 26, 2025
bc94c74
remove stf_logical_data_like_empty which is not designed properly yet
caugonnet Aug 26, 2025
765a519
Merge branch 'main' into stf_c_api
caugonnet Aug 27, 2025
3e47648
Add a missing header
caugonnet Aug 27, 2025
f208979
Install in a place that depends on cuda version
caugonnet Aug 27, 2025
2ca0e3d
fix pytest example
caugonnet Aug 27, 2025
7cff926
Try to use an intermediate "shim" module to import cu12 or cu13 versions
caugonnet Aug 27, 2025
c9c9672
Merge branch 'main' into stf_c_api
caugonnet Aug 27, 2025
b8d89ed
Fix tests (do not use graphs ...)
caugonnet Aug 27, 2025
740dc86
Introduce an API to enable graph capture with a low level graph_ctx task
caugonnet Aug 28, 2025
9687cbb
Enable graph capture when launching a numba kernel in the graph_ctx b…
caugonnet Aug 28, 2025
5246b65
Use a forked version of numba-cuda with work-arounds for CUDA graphs
caugonnet Aug 28, 2025
936bc60
fix formatting issues
caugonnet Aug 28, 2025
7689834
Do return a stream even in the graph_ctx when we are capturing
caugonnet Aug 28, 2025
dde406d
test with graphs
caugonnet Aug 28, 2025
7563014
parametrized tests
caugonnet Aug 28, 2025
ba4e9c3
Merge branch 'main' into stf_c_api
caugonnet Aug 28, 2025
b094c27
test that we get a stream in graph_task when capturing
caugonnet Aug 28, 2025
222c216
Save WIP: add a mockup of FHE example, which needs a like_empty method
caugonnet Aug 28, 2025
b04cebf
Implement like_empty
caugonnet Aug 28, 2025
9ed5ace
More comprehensive FHE test
caugonnet Aug 28, 2025
e27ef5b
test fhe with stf decorator
caugonnet Aug 28, 2025
d0f915e
Merge branch 'main' into stf_c_api
caugonnet Aug 28, 2025
6963ec0
fix merge error
caugonnet Aug 28, 2025
06fab11
Appropriate checks
caugonnet Aug 29, 2025
2fc802e
Add missing ;
caugonnet Aug 29, 2025
a43db62
- Make it possible to create a borrowed context from a handle
caugonnet Aug 29, 2025
9c07679
invert ctx and exec place in the decorator
caugonnet Aug 29, 2025
947bbcc
fix decorator api
caugonnet Aug 29, 2025
22b2d19
Add ciphertext.like_empty()
caugonnet Aug 29, 2025
66bcde3
Removing prints
caugonnet Aug 29, 2025
84534c8
do not import specific methods
caugonnet Aug 29, 2025
acf0cce
fix decorator api
caugonnet Aug 29, 2025
6a6e84f
Add a pytorch experiment
Aug 29, 2025
297a69b
more pytorch test
Aug 29, 2025
533ca5a
better interop with pytorch
Aug 29, 2025
9aa749f
remove useless pass
Aug 29, 2025
b11aa4b
tensor_arguments
Aug 29, 2025
0af151f
simpler code
Aug 29, 2025
746d308
pre-commit hooks
caugonnet Aug 29, 2025
d9195f5
try to remove dependency on torch and have adapters (WIP)
caugonnet Aug 31, 2025
f5ac828
remove unused code
caugonnet Aug 31, 2025
454a5da
cleanups
caugonnet Aug 31, 2025
ccfbb6b
fix numba adapter
caugonnet Aug 31, 2025
c6e7c07
skip torch test if torch is not available
caugonnet Aug 31, 2025
842a651
add dot vertex even in the low level api
caugonnet Aug 31, 2025
00c649c
fix types
caugonnet Aug 31, 2025
b0fc18d
pre-commit hooks
caugonnet Aug 31, 2025
3b257df
Merge branch 'main' into stf_c_api
caugonnet Aug 31, 2025
04cc07a
dot add_vertex is done in start() now
caugonnet Aug 31, 2025
bce25b8
Start to implement the FDTD example in pytorch
caugonnet Sep 1, 2025
d9c5f11
Start to port in STF version of pytorch
caugonnet Sep 1, 2025
70fa5d8
Adapt the FDTD example to use STF constructs and add methods to initi…
caugonnet Sep 1, 2025
5587a8d
format issue
caugonnet Sep 1, 2025
5ea5243
charset issue
caugonnet Sep 1, 2025
f7fbd34
rank agnostic method to init
caugonnet Sep 1, 2025
aec2d71
use .zero_() to blank fields
caugonnet Sep 1, 2025
eb71880
print values
caugonnet Sep 1, 2025
aaf6ec6
Experiment to display output as an image
caugonnet Sep 1, 2025
ae4c6d6
Use non blocking API
caugonnet Sep 2, 2025
9029fda
remove dead code
caugonnet Sep 2, 2025
ce7a33b
remove dead code
caugonnet Sep 2, 2025
cbde742
minor cleanup
caugonnet Sep 2, 2025
1936db6
Merge branch 'main' into stf_c_api
caugonnet Sep 2, 2025
c91e814
clang-format
caugonnet Sep 2, 2025
3fe6178
Add a C library for CUDASTF (to be used in the python bindings)
caugonnet Sep 2, 2025
666bd07
Merge branch 'main' into stf_c_lib
caugonnet Sep 2, 2025
522b630
remove dead code
caugonnet Sep 2, 2025
4315314
do define and use CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING
caugonnet Sep 2, 2025
48627aa
Add CUDASTF C lib to tests
caugonnet Sep 2, 2025
410aadd
Merge branch 'main' into stf_c_lib
caugonnet Sep 2, 2025
c87cdaa
Add missing headers
caugonnet Sep 2, 2025
02a9eb6
use snake_case
caugonnet Sep 2, 2025
232133b
Do define CCCL_C_EXPERIMENTAL=1
caugonnet Sep 2, 2025
b60eb6b
Do not do redundant tests
caugonnet Sep 2, 2025
c4c99f0
Add a project to ci/inspect_changes.sh
caugonnet Sep 2, 2025
2f5925b
missing changes in previous commit
caugonnet Sep 2, 2025
3417075
add presets
caugonnet Sep 2, 2025
8c05034
Add override matrix
alliepiper Sep 2, 2025
20faa8f
Properly define structs with a typedef and remove superfluous struct …
caugonnet Sep 3, 2025
d378f5a
Merge branch 'main' into stf_c_lib
caugonnet Sep 3, 2025
8c5e760
fix previous merge
caugonnet Sep 3, 2025
78dc197
Change tensor_arguments to return an element instead of a tuple of on…
caugonnet Sep 3, 2025
2eb2ace
Remove intermediate structures and use opaque pointers instead
caugonnet Sep 3, 2025
6557067
Automatically generated documentation
caugonnet Sep 3, 2025
60266ff
Better implementation of the help to convert C places to the C++ API,…
caugonnet Sep 3, 2025
59f1983
Tell where to find cudax, and remove unnecessary libs
caugonnet Sep 3, 2025
c7fa9e6
Merge branch 'main' into stf_c_lib
caugonnet Sep 3, 2025
97dd6f7
CCCL_ENABLE_C enables c/parallel, CCCL_ENABLE_C_EXPERIMENTAL_STF enab…
caugonnet Sep 3, 2025
1610f0b
Remove unnecessary definitions
caugonnet Sep 3, 2025
4383eaf
Merge branch 'main' into stf_c_lib
caugonnet Sep 3, 2025
101fd0b
Merge branch 'main' into stf_c_lib
caugonnet Sep 4, 2025
4db210b
Merge branch 'main' into stf_c_lib
caugonnet Sep 5, 2025
90a8d20
use more consistent option names
caugonnet Sep 5, 2025
f2d7528
Merge branch 'main' into stf_c_lib
caugonnet Sep 9, 2025
ac667ca
Do not use [[maybe_unused]] for the C lib header because this is only…
caugonnet Sep 9, 2025
5bf62b3
Return an error code in stf_cuda_kernel_add_desc rather than use asse…
caugonnet Sep 9, 2025
c0a54f1
clang-format
caugonnet Sep 9, 2025
4573f9f
Merge branch 'main' into stf_c_lib
caugonnet Sep 9, 2025
abc58d8
Merge branch 'main' into stf_c_api
caugonnet Sep 9, 2025
af43da5
Merge stf_c_lib: Update c/ directory with complete C library implemen…
caugonnet Sep 9, 2025
c00c915
Revert Python linting changes
caugonnet Sep 9, 2025
cdd0d85
Fix Python CMakeLists.txt: Update C library feature flags
caugonnet Sep 9, 2025
afda29f
Fix Python build: Add missing CCCL_ENABLE_C master flag
caugonnet Sep 9, 2025
4f1f079
Complete STF C library configuration: Enable all C library features a…
caugonnet Sep 9, 2025
ccfc41d
Remove obsolete CCCL_ENABLE_C flag
caugonnet Sep 9, 2025
e4b8277
Update CMake configuration to match stf_c_lib structure
caugonnet Sep 9, 2025
6931fa8
Optimize Python build: Remove unnecessary C parallel library
caugonnet Sep 9, 2025
a1a1139
clang-format
caugonnet Sep 9, 2025
a3071f7
Merge branch 'stf_c_lib' into stf_c_api
caugonnet Sep 9, 2025
ecd9f4e
fix pytorch example
caugonnet Sep 9, 2025
4b2ae75
use ascii symbols
caugonnet Sep 9, 2025
5881081
Merge branch 'main' into stf_c_api
caugonnet Sep 9, 2025
4eef870
Merge branch 'main' into stf_c_api
caugonnet Sep 10, 2025
dcb3d39
Cleanup some changes in the infra from a previous merge
caugonnet Sep 10, 2025
1284eb2
Implement logical_data_empty logical_data_zeros, and logical_data_full
caugonnet Sep 10, 2025
0514f29
short names for torch.cuda
caugonnet Sep 10, 2025
5e9b4d5
Introduce pytorch_task
caugonnet Sep 10, 2025
53a4542
clang-format and some minor comment
caugonnet Sep 10, 2025
989f58b
Merge branch 'main' into stf_c_api
caugonnet Sep 17, 2025
93055c0
Merge branch 'main' into stf_c_api
caugonnet Sep 23, 2025
218fda2
make sure stf python tests are wrapped into functions so that pytest …
caugonnet Sep 25, 2025
1f97482
fix the return values of pytests
caugonnet Sep 25, 2025
1e482a4
Merge branch 'main' into stf_c_api
caugonnet Sep 25, 2025
7a58d68
Start to experiment with Warp
caugonnet Sep 25, 2025
9fb1c26
logical_data in python are now initialized with a data place, and the…
caugonnet Sep 25, 2025
5c1d50e
Save WIP: add access modes
caugonnet Sep 25, 2025
9f31b1e
cleanups
caugonnet Sep 25, 2025
c0bb070
Save WIP
caugonnet Sep 25, 2025
7094dd5
Merge branch 'main' into stf_c_api
caugonnet Oct 7, 2025
76d78b4
Adopt to new python hierarchy
caugonnet Oct 8, 2025
e03b062
Merge branch 'main' into stf_c_api
caugonnet Oct 8, 2025
0c11b6a
fix errors in a previous merge
caugonnet Oct 8, 2025
f6c50e1
cuda.cccl.experimental.stf => cuda.stf
caugonnet Oct 8, 2025
efea184
Misc stf python tests improvements
caugonnet Oct 8, 2025
c0d3592
Save WIP on this warp example
caugonnet Oct 8, 2025
eba61eb
Add sanity checks to test the is_void_interface() API
caugonnet Oct 8, 2025
e17c261
support tokens in python
caugonnet Oct 8, 2025
ec9c955
remove debug print
caugonnet Oct 8, 2025
52f4823
python cholesky with cupy
caugonnet Oct 8, 2025
5a32881
improve cholesky example
caugonnet Oct 8, 2025
abd5778
POTRI and Cholesky
caugonnet Oct 9, 2025
80e1085
clang-format
caugonnet Oct 9, 2025
865cf7b
Merge branch 'main' into stf_c_api
caugonnet Oct 9, 2025
4c1551a
how changes to numba-cuda have been merged
caugonnet Oct 9, 2025
77d6af1
Merge branch 'main' into stf_c_api
caugonnet Nov 14, 2025
acc8f49
Merge branch 'main' into stf_c_api
andralex Nov 14, 2025
de333b2
Fix CI precommit
andralex Nov 14, 2025
3834c8f
Merge branch 'main' into stf_c_api
andralex Nov 15, 2025
9a5c265
no need for numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 anymore
caugonnet Nov 24, 2025
9932a24
Merge origin/main into stf_c_api
caugonnet Nov 24, 2025
e7e2adb
Our numba-cuda fix is part of 0.21.0
caugonnet Nov 24, 2025
39040a9
Minor doc fix
caugonnet Nov 25, 2025
8f27fa2
Ensure matplotlib is only used if available
caugonnet Nov 25, 2025
73ac963
Cleanup examples
caugonnet Nov 25, 2025
d90ed64
cmake fix
caugonnet Nov 25, 2025
eb77519
Cmake fixes (need extra cleanup)
caugonnet Nov 25, 2025
b38ff80
Work-around for lazy resource init during graph capture in cuda core
caugonnet Nov 25, 2025
0a3e667
Use a relaxed capture mode
caugonnet Nov 25, 2025
8642fdd
This work-around is not needed anymore with a relaxed capture mode
caugonnet Nov 25, 2025
2a75766
Merge branch 'main' into stf_c_api
caugonnet Nov 25, 2025
0f9865d
cleanup warp example
caugonnet Nov 25, 2025
6466347
Cleanups in the cython code for STF
caugonnet Nov 25, 2025
cfb2930
no need for math.prod for such a simple thing
caugonnet Nov 26, 2025
130ee2a
Simpler code to handle vector types
caugonnet Nov 26, 2025
4bb4d23
fix grid dimension
caugonnet Nov 26, 2025
b8c745e
Use from_dlpack
caugonnet Nov 26, 2025
fb2a3ba
Change the mock-up FHE toy example to have operations that are homomo…
caugonnet Nov 26, 2025
6c2f850
Merge branch 'main' into stf_c_api
caugonnet Nov 26, 2025
da2e1aa
Add some explanation for the use of a relaxed capture mode
caugonnet Nov 26, 2025
852b400
cleaner pytorch adapter
caugonnet Nov 26, 2025
9308af5
Merge branch 'main' into stf_c_api
caugonnet Nov 27, 2025
09913dc
Code simplification
caugonnet Nov 26, 2025
237b2c1
minor fixes
caugonnet Dec 16, 2025
dd6cc26
Merge branch 'main' into stf_c_api
caugonnet Feb 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 59 additions & 8 deletions c/experimental/stf/include/cccl/c/experimental/stf/stf.h
Original file line number Diff line number Diff line change
Expand Up @@ -456,36 +456,87 @@ cudaStream_t stf_fence(stf_ctx_handle ctx);
//!
//! \brief Create logical data from existing memory buffer
//!
//! Creates logical data handle from an existing host memory buffer.
//! STF takes ownership of data management during task execution.
//! Creates logical data handle from existing memory buffer, assuming host data place.
//! This is a convenience wrapper around stf_logical_data_with_place() with host placement.
//!
//! \param ctx Context handle
//! \param[out] ld Pointer to receive logical data handle
//! \param addr Pointer to existing data buffer
//! \param addr Pointer to existing data buffer (assumed to be host memory)
//! \param sz Size of data in bytes
//!
//! \pre ctx must be valid context handle
//! \pre ld must not be NULL
//! \pre addr must not be NULL
//! \pre addr must not be NULL and point to host-accessible memory
//! \pre sz must be greater than 0
//! \post *ld contains valid logical data handle
//!
//! \note Original data pointer should not be accessed during task execution
//! \note Data will be written back when logical data is destroyed or context finalized
//! \note This function assumes host memory. For device/managed memory, use stf_logical_data_with_place()
//! \note Equivalent to: stf_logical_data_with_place(ctx, ld, addr, sz, make_host_data_place())
//!
//! \par Example:
//! \code
//! float data[1024];
//! stf_logical_data_handle ld;
//! stf_logical_data(ctx, &ld, data, sizeof(data));
//! stf_logical_data(ctx, &ld, data, sizeof(data)); // Assumes host memory
//! // ... use in tasks ...
//! stf_logical_data_destroy(ld);
//! \endcode
//!
//! \see stf_logical_data_empty(), stf_logical_data_destroy()
//! \see stf_logical_data_with_place(), stf_logical_data_empty(), stf_logical_data_destroy()

void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz);

//!
//! \brief Create logical data handle from address with data place specification
//!
//! Creates logical data handle from existing memory buffer, explicitly specifying where
//! the memory is located (host, device, managed, etc.). This is the primary and recommended
//! logical data creation function as it provides STF with essential memory location information
//! for optimal data movement and placement strategies.
//!
//! \param ctx Context handle
//! \param[out] ld Pointer to receive logical data handle
//! \param addr Pointer to existing memory buffer
//! \param sz Size of buffer in bytes
//! \param dplace Data place specifying memory location
//!
//! \pre ctx must be valid context handle
//! \pre ld must be valid pointer to logical data handle pointer
//! \pre addr must point to valid memory of at least sz bytes
//! \pre sz must be greater than 0
//! \pre dplace must be valid data place (not invalid)
//!
//! \post *ld contains valid logical data handle on success
//! \post Caller owns returned handle (must call stf_logical_data_destroy())
//!
//! \par Examples:
//! \code
//! // GPU device memory (recommended for CUDA arrays)
//! float* device_ptr;
//! cudaMalloc(&device_ptr, 1000 * sizeof(float));
//! stf_data_place dplace = make_device_data_place(0);
//! stf_logical_data_handle ld;
//! stf_logical_data_with_place(ctx, &ld, device_ptr, 1000 * sizeof(float), dplace);
//!
//! // Host memory
//! float* host_data = new float[1000];
//! stf_data_place host_place = make_host_data_place();
//! stf_logical_data_handle ld_host;
//! stf_logical_data_with_place(ctx, &ld_host, host_data, 1000 * sizeof(float), host_place);
//!
//! // Managed memory
//! float* managed_ptr;
//! cudaMallocManaged(&managed_ptr, 1000 * sizeof(float));
//! stf_data_place managed_place = make_managed_data_place();
//! stf_logical_data_handle ld_managed;
//! stf_logical_data_with_place(ctx, &ld_managed, managed_ptr, 1000 * sizeof(float), managed_place);
//! \endcode
//!
//! \see make_device_data_place(), make_host_data_place(), make_managed_data_place()

void stf_logical_data_with_place(
stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz, stf_data_place dplace);

//!
//! \brief Set symbolic name for logical data
//!
Expand Down
34 changes: 33 additions & 1 deletion c/experimental/stf/src/stf.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,44 @@ cudaStream_t stf_fence(stf_ctx_handle ctx)
}

void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz)
{
// Convenience wrapper: assume host memory
stf_logical_data_with_place(ctx, ld, addr, sz, make_host_data_place());
}

void stf_logical_data_with_place(
stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz, stf_data_place dplace)
{
assert(ctx);
assert(ld);

auto* context_ptr = static_cast<context*>(ctx);
auto ld_typed = context_ptr->logical_data(make_slice((char*) addr, sz));

// Convert C data_place to C++ data_place
cuda::experimental::stf::data_place cpp_dplace;
switch (dplace.kind)
{
case STF_DATA_PLACE_HOST:
cpp_dplace = cuda::experimental::stf::data_place::host();
break;
case STF_DATA_PLACE_DEVICE:
cpp_dplace = cuda::experimental::stf::data_place::device(dplace.u.device.dev_id);
break;
case STF_DATA_PLACE_MANAGED:
cpp_dplace = cuda::experimental::stf::data_place::managed();
break;
case STF_DATA_PLACE_AFFINE:
cpp_dplace = cuda::experimental::stf::data_place::affine();
break;
default:
// Invalid data place - this should not happen with valid input
assert(false && "Invalid data_place kind");
cpp_dplace = cuda::experimental::stf::data_place::host(); // fallback
break;
}

// Create logical data with the specified data place
auto ld_typed = context_ptr->logical_data(make_slice((char*) addr, sz), cpp_dplace);

// Store the logical_data_untyped directly as opaque pointer
*ld = new logical_data_untyped{ld_typed};
Expand Down
4 changes: 4 additions & 0 deletions cudax/examples/stf/void_data_interface.cu
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,9 @@ int main()
return cuda_kernel_desc{dummy_kernel, 16, 128, 0};
};

EXPECT(token.is_void_interface());
EXPECT(token2.is_void_interface());
EXPECT(token3.is_void_interface());

ctx.finalize();
}
12 changes: 9 additions & 3 deletions cudax/include/cuda/experimental/__stf/graph/graph_task.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,9 @@ public:
{
// Select a stream from the pool
capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream;
cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal));
// Use relaxed capture mode to allow capturing workloads that lazily initialize
// resources (e.g., set up memory pools)
cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeRelaxed));
}

auto& dot = *ctx.get_dot();
Expand Down Expand Up @@ -365,7 +367,9 @@ public:
capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream;

cudaGraph_t childGraph = nullptr;
cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal));
// Use relaxed capture mode to allow capturing workloads that lazily initialize
// resources (e.g., set up memory pools)
cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeRelaxed));

// Launch the user provided function
f(capture_stream);
Expand Down Expand Up @@ -625,7 +629,9 @@ public:
cudaStream_t capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream;

cudaGraph_t childGraph = nullptr;
cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal));
// Use relaxed capture mode to allow capturing workloads that lazily initialize
// resources (e.g., set up memory pools)
cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeRelaxed));

// Launch the user provided function
if constexpr (fun_invocable_stream_deps)
Expand Down
3 changes: 2 additions & 1 deletion cudax/test/stf/cpp/task_get_stream.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ void test_stream()
context ctx;

auto token = ctx.token();
auto t = ctx.task(token.write());
EXPECT(token.is_void_interface());
auto t = ctx.task(token.write());
t.start();
cudaStream_t s = t.get_stream();
EXPECT(s != nullptr);
Expand Down
76 changes: 75 additions & 1 deletion python/cuda_cccl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,27 @@ message(

# Build cccl.c.parallel and add CCCL's install rules
set(_cccl_root ../..)

# Build and install C++ library first
set(CCCL_TOPLEVEL_PROJECT ON) # Enable the developer builds
set(CCCL_ENABLE_C_PARALLEL ON) # Build the cccl.c.parallel library
set(CCCL_ENABLE_C_PARALLEL ON)
set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) # Enable C experimental STF library (triggers c/ directory)
set(CCCL_ENABLE_UNSTABLE ON) # Enable unstable features
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should only be for STF ?


# Disable all testing, examples, and benchmarks - we only want the libraries
set(CCCL_ENABLE_TESTING OFF)
set(CCCL_ENABLE_EXAMPLES OFF)
set(CCCL_ENABLE_BENCHMARKS OFF)
set(CCCL_C_PARALLEL_ENABLE_TESTING OFF)
set(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING OFF)
# Note: CCCL_ENABLE_CUDAX must be ON because STF depends on it (via CCCL_ENABLE_UNSTABLE)
# But disable cudax tests, examples, and header testing
set(cudax_ENABLE_TESTING OFF)
set(cudax_ENABLE_EXAMPLES OFF)
set(cudax_ENABLE_HEADER_TESTING OFF)
set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME})
set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME})

# Just install the rest:
set(libcudacxx_ENABLE_INSTALL_RULES ON)
set(CUB_ENABLE_INSTALL_RULES ON)
Expand All @@ -34,10 +52,21 @@ add_subdirectory(${_cccl_root} _parent_cccl)
set(CMAKE_INSTALL_LIBDIR "${old_libdir}") # pop
set(CMAKE_INSTALL_INCLUDEDIR "${old_includedir}") # pop

# Create CCCL::cudax alias for STF (normally created by cccl-config.cmake)
if (TARGET cudax::cudax AND NOT TARGET CCCL::cudax)
add_library(CCCL::cudax ALIAS cudax::cudax)
endif()

# ensure the destination directory exists
file(MAKE_DIRECTORY "cuda/stf/${CUDA_VERSION_DIR}/cccl")
file(MAKE_DIRECTORY "cuda/compute/${CUDA_VERSION_DIR}/cccl")

# Install version-specific binaries
install(
TARGETS cccl.c.experimental.stf
DESTINATION cuda/stf/${CUDA_VERSION_DIR}/cccl
)

install(
TARGETS cccl.c.parallel
DESTINATION cuda/compute/${CUDA_VERSION_DIR}/cccl
Expand Down Expand Up @@ -110,12 +139,40 @@ add_custom_target(
DEPENDS "${_generated_extension_src}"
)

message(STATUS "STF Using Cython ${CYTHON_VERSION}")
set(
stf_pyx_source_file
"${cuda_cccl_SOURCE_DIR}/cuda/stf/_stf_bindings_impl.pyx"
)
set(_stf_generated_extension_src "${cuda_cccl_BINARY_DIR}/_stf_bindings_impl.c")
set(_stf_depfile "${cuda_cccl_BINARY_DIR}/_stf_bindings_impl.c.dep")
add_custom_command(
OUTPUT "${_stf_generated_extension_src}"
COMMAND "${Python3_EXECUTABLE}" -m cython
ARGS
${CYTHON_FLAGS_LIST} "${stf_pyx_source_file}" --output-file
${_stf_generated_extension_src}
DEPENDS "${stf_pyx_source_file}"
DEPFILE "${_stf_depfile}"
COMMENT "Cythonizing ${pyx_source_file} for CUDA ${CUDA_VERSION_MAJOR}"
)
set_source_files_properties(
"${_stf_generated_extension_src}"
PROPERTIES GENERATED TRUE
)
add_custom_target(
cythonize_stf_bindings_impl
ALL
DEPENDS "${_stf_generated_extension_src}"
)

python3_add_library(
_bindings_impl
MODULE
WITH_SOABI
"${_generated_extension_src}"
)

add_dependencies(_bindings_impl cythonize_bindings_impl)
target_link_libraries(
_bindings_impl
Expand All @@ -125,4 +182,21 @@ target_link_libraries(
)
set_target_properties(_bindings_impl PROPERTIES INSTALL_RPATH "$ORIGIN/cccl")

python3_add_library(
_stf_bindings_impl
MODULE
WITH_SOABI
"${_stf_generated_extension_src}"
)
add_dependencies(_stf_bindings_impl cythonize_stf_bindings_impl)
target_link_libraries(
_stf_bindings_impl
PRIVATE cccl.c.experimental.stf CUDA::cuda_driver
)
set_target_properties(
_stf_bindings_impl
PROPERTIES INSTALL_RPATH "$ORIGIN/cccl"
)

install(TARGETS _bindings_impl DESTINATION cuda/compute/${CUDA_VERSION_DIR})
install(TARGETS _stf_bindings_impl DESTINATION cuda/stf/${CUDA_VERSION_DIR})
27 changes: 27 additions & 0 deletions python/cuda_cccl/cuda/stf/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from ._stf_bindings import (
context,
data_place,
dep,
exec_place,
)
from .decorator import jit # Python-side kernel launcher

__all__ = [
"context",
"dep",
"exec_place",
"data_place",
"jit",
]


def has_torch() -> bool:
import importlib.util

return importlib.util.find_spec("torch") is not None


def has_numba() -> bool:
import importlib.util

return importlib.util.find_spec("numba") is not None
4 changes: 4 additions & 0 deletions python/cuda_cccl/cuda/stf/_adapters/numba_bridge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
def cai_to_numba(cai: dict):
from numba import cuda

return cuda.from_cuda_array_interface(cai, owner=None, sync=False)
Loading
Loading