diff --git a/conda/recipes/cudf/recipe.yaml b/conda/recipes/cudf/recipe.yaml index 3c1e874238a..98e82c95bce 100644 --- a/conda/recipes/cudf/recipe.yaml +++ b/conda/recipes/cudf/recipe.yaml @@ -55,6 +55,7 @@ requirements: - rapids-build-backend >=0.3.0,<0.4.0.dev0 - scikit-build-core >=0.10.0 - dlpack >=0.8,<1.0 + - numba-cuda >=0.14.0,<0.15.0a0 - libcudf =${{ version }} - pylibcudf =${{ version }} - rmm =${{ minor_version }} diff --git a/dependencies.yaml b/dependencies.yaml index 35a5fbf810c..8ae57628852 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -157,6 +157,7 @@ files: includes: - build_base - build_python_common + - build_python_cudf - depends_on_pylibcudf - depends_on_libcudf - depends_on_librmm @@ -500,6 +501,11 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cython>=3.0.3 + build_python_cudf: + common: + - output_types: [conda, requirements, pyproject] + packages: + - &numba-cuda-dep numba-cuda>=0.14.0,<0.15.0a0 pyarrow_run: common: - output_types: [conda] @@ -673,7 +679,7 @@ dependencies: - typing_extensions>=4.0.0 - output_types: [conda] packages: - - &numba-cuda-dep numba-cuda>=0.14.0,<0.15.0a0 + - *numba-cuda-dep - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file diff --git a/docs/cudf/source/developer_guide/index.md b/docs/cudf/source/developer_guide/index.md index a1cc1c9d586..de6a5678329 100644 --- a/docs/cudf/source/developer_guide/index.md +++ b/docs/cudf/source/developer_guide/index.md @@ -27,4 +27,5 @@ testing benchmarking options cudf_pandas +udf_memory_management ``` diff --git a/docs/cudf/source/developer_guide/udf_memory_management.md b/docs/cudf/source/developer_guide/udf_memory_management.md new file mode 100644 index 00000000000..264c36ee02b --- /dev/null +++ b/docs/cudf/source/developer_guide/udf_memory_management.md @@ -0,0 +1,402 @@ +# String UDF memory management + +Inside UDFs, some string methods like ``concat()`` and ``replace()`` produce new +strings. For a CUDA thread to create a new string, it must dynamically allocate +memory on the device to hold the string's data. The cleanup of this memory by the +thread later on must preserve Python's semantics, for example when the variable +corresponding to the new string goes out of scope. To accomplish this in cuDF, UDF +memory management (allocation and freeing of the underlying data) is handled +transparently for the user, via a reference counting mechanism. This reference +counting implementation is distinct from the one in python and has its own interface +and requirements, + +Along with the code generated from the functions and operations within the passed UDF, +numba-cuda will automatically weave the necessary reference counting operations into +the final device function that each thread will ultimately run. This allows the +programmer to pass a UDF that may utilize memory allocating types such strings +generally as one would in python: + +```python +def udf(string): + if len(string) > 2: + result = string.upper() # new allocation + else: + result = string + string # new allocation + return result + 'abc' +``` + + +## Numba memory management and the Numba Runtime (NRT) + +The API functions used to update the reference count associated with a variable +derive from [Numba's memory management for nopython mode +code](https://numba.readthedocs.io/en/stable/developer/numba-runtime.html#memory-management). +This runtime library (NRT or Numba Runtime) provides implementations for operators +that increase and decrease a variable's reference count (INCREF/DECREF), and numba +analyzes the passed UDF to determine where the calls targeting these implementations +should go and what objects they should operate on. Below are some examples of situations +where numba-cuda would detect a reference counting operation needs to be applied to +an object: + +- **The creation of a new object**: During object creation, memory is allocated +and a structure to track the memory is created and initialized. +- **When new references are created**: For example during assignments, the +reference count of the assigned-from object is incremented. +- **When references are destroyed**: For example when an object goes out of +scope, or when an object holding a reference is destroyed. During these +events, the reference count of the tracked object is decremented. If the +reference count of an object falls to zero, the Numba Runtime will invoke its destructor. +- **When an intermediate variable is no longer needed**: For example when creating +a new variable for inspection then disposing of it, as in `string.upper() == 'A'` + + +Numba does not reference count every variable, as only variables with an associated +heap memory allocation need to be tracked. Numba determines if this is true for a +variable during compilation by querying the properties of the datamodel underlying +the variable's type. We provide a string type ``ManagedUDFString`` that implements +the required properties and backs any new string that is created on the device. Its +datamodel is defined under the data structures section below and is registered to +the extension type as shown. + + +## Data structures +The core concept is a ``ManagedUDFString`` numba extension type that fulfills the +requirements to be reference counted by NRT. It is composed of a `cudf::udf_string` +that owns the string data and a pointer to a ``MemInfo`` object, which the NRT API +uses for reference counting. + +```python + +from cudf.core.udf.strings_typing import ManagedUDFString +from numba.cuda.descriptor import cuda_target + +@register_model(ManagedUDFString) +class managed_udf_string_model(models.StructModel): + _members = (("meminfo", types.voidptr), ("udf_string", udf_string)) + + def __init__(self, dmm, fe_type): + super().__init__(dmm, fe_type, self._members) + + def has_nrt_meminfo(self): + return True + + def get_nrt_meminfo(self, builder, value): + # effectively returns self.meminfo in IR form + udf_str_and_meminfo = numba.core.cgutils.create_struct_proxy(ManagedUDFString())( + cuda_target.target_context, builder, value=value + ) + return udf_str_and_meminfo.meminfo +``` + +The actual NRT APIs for adjusting the reference count of an object expect to operate +on this ``MemInfo`` object itself rather than the instance: + +```c++ +extern "C" +struct MemInfo { +cuda::atomic refct; +NRT_dtor_function dtor; +void* dtor_info; +void* data; +size_t size; +}; +typedef struct MemInfo NRT_MemInfo; +``` + +Every instance of a reference counted type within the scope of a CUDA thread executing +the UDF is associated with a separate instance of this ``MemInfo`` struct. An INCREF or +DECREF on the instance in numba's intermediate representation formed during compilation +will resolve to an increase or decrease of the `refct` of the ``MemInfo`` associated +with that instance. The NRT_decref implementation calls the ``dtor`` on the ``data`` if +the ``refct`` is found to be zero: + + +```c++ +extern "C" __device__ void NRT_decref(NRT_MemInfo* mi) +{ + if (mi != NULL) { + mi->refct--; + if (mi->refct == 0) { NRT_MemInfo_call_dtor(mi); } + } +} +``` + +## NRT Requirements + +For a type to participate in Numba's reference counting correctly, the following must be +true: + +1. The datamodel for the type needs to report that it has a meminfo. This is done by + returning `True` from `has_nrt_meminfo`. +2. The datamodel must expose the location of the meminfo for that instance to numba's + lowering phase. This means implementing `get_nrt_meminfo()` such that it returns the + meminfo in a predictable location in heap memory. +3. Operators or functions that return the type must initialize the meminfo and place it + at the location numba will report it exists at through (2). This is done in the lowering + for the operations we support, such as `concat`. + +``ManagedUDFString`` fulfills (2) by tying the MemInfo and the string instance that it owns +together into a parent struct. This allows (2) to be implemented by just returning its own +`.meminfo` member, effectively relating the meminfo location to `self` via an offset. +Lowering for operations like `concat` populate this member before returning. + + +### cuDF string data structures + +On the C++ side, libcudf permits storing entire columns of strings. The +``cudf::string_view`` class is a non-owning view of a string --- usually a +single row in a libcudf column --- that provides a convenient abstraction +over working with individual strings in device code, for example in custom +kernels. cuDF Python introduces the ``cudf::strings::udf::udf_string`` class, +an owning container around a single string. This class is used by the numba UDF +code to create new strings in device code. All libcudf string functions are made +available in cuDF Python UDFs by constructing ``cudf::string_view`` instances +that view the strings owned by ``udf_string`` instances. + +The cuDF extensions to Numba generate code to manipulate instances of these +classes, so we outline the members of these classes to aid in understanding +them. These classes also have various methods; consult the [cuDF C++ Developer +Documentation for further details of these structures.](https://docs.rapids.ai/api/libcudf/stable/developer_guide) + +```c++ +class string_view { + // A pointer to the underlying string data + char const* p{}; + // The length of the underlying string data in bytes + size_type bytes{}; + // The offset into the underlying string data in characters + size_type char_pos{}; + // The offset into the underlying string data in bytes + size_type byte_pos{}; +}; +``` + +```c++ +class udf_string { + // A pointer to the underlying string data + char* m_data{}; + // The length of the string data in bytes + cudf::size_type m_bytes{}; + // The size of the underlying allocation in bytes + cudf::size_type m_capacity{}; +}; +``` + +```{note} +A ``udf_string`` has a destructor that frees the underlying string data. This is +important, because the C++ destructor is invoked during destruction of a +Python-side Managed UDF String object. +``` + + +## Implementation + +The cuDF implementations for Managed UDF Strings is required to provide: + +- Typing and lowering for Managed UDF String operations. The typing has no special properties; it is similar to any other typing implementation in a Numba extension. The lowering is required to ensure that ``NRT_MemInfo`` objects for each managed object are created and initialized correctly. +- C++ implementations of string functions, some of which use libcudf's C++ string functionality. Other functions are provided by the ``strings_udf`` C++ library in cuDF Python. These help with the allocation of data and implement the required destructors. +- Numba shim functions to adapt calls to C++ code for use in Numba code and + Numba extensions are also required. +- Conversion from String UDF data to and from `cudf::column`. + +Use of C++ code for string functionality is not a hard requirement for +implementing string support in a Numba extension - it is instead a pragmatic +choice so that the Python and C++ sides of cuDF can share a single implementation +for string operations instead of trying to keep two separately-maintained +implementations in sync. + + +The majority of the complexity in the implementation comes from two areas: + +- Combining the requirement to use C++ implementations, with the need to provide +correct initialization of `NRT_MemInfo` object, and +- Conversion of Managed UDF String objects back into cuDF columns when a UDF +returns strings. + + +## String Lifecycle Details + +Let's trace the complete lifecycle of a string created by `result = str1 + str2` in a UDF: + +### Phase 1: Compilation + +**1.1 Numba Analysis** +```python +# User UDF +def my_udf(str1, str2): + result = str1 + str2 + return result +``` + +- Typing phase identifies `str1 + str2` as returning a `ManagedUDFString` +- Lowering phase begins for the `+` operator + +**1.2 Stack Allocation** +```python +managed_ptr = builder.alloca( + context.data_model_manager[managed_udf_string].get_value_type() +) +``` +- Allocates stack space for the complete `ManagedUDFString` instance +- At this point, both fields are uninitialized + +**1.3 Member Pointer Extraction** +```python +udf_str_ptr = builder.gep(managed_ptr, [ir.IntType(32)(0), ir.IntType(32)(1)]) +``` +- Gets pointer to the `udf_string` member within the allocated struct + +### Phase 2: String Creation via Shim Function + +**2.1 Shim Function Call** +```python +meminfo = context.compile_internal( + builder, call_concat_string_view, + types.voidptr(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR), + (udf_str_ptr, lhs_ptr, rhs_ptr) +) +``` + +**2.2 Inside the Shim Function** +```c++ +extern "C" __device__ int concat_shim(void** out_meminfo, + void* output_udf_str, + void* const* lhs, + void* const* rhs) { + auto lhs_sv = reinterpret_cast(lhs); + auto rhs_sv = reinterpret_cast(rhs); + + // Perform actual concat- allocates GPU memory for result + auto result_str = cudf::strings::udf::concat(*lhs_sv, *rhs_sv); + + // Place result into pre-allocated stack space using placement new + auto udf_str_ptr = new (output_udf_str) udf_string(std::move(result_str)); + + // Create and return the meminfo + *out_meminfo = make_meminfo_for_new_udf_string(udf_str_ptr); + + return 0; +} +``` + +In the above, critically the final string is constructed through placement +new which relieves the compiler of the responsibility for cleaning up the +`cudf::udf_string` created there. + +**2.3 MemInfo Creation Details** +```c++ +__device__ NRT_MemInfo* make_meminfo_for_new_udf_string(udf_string* udf_str) { + struct mi_str_allocation { + NRT_MemInfo mi; + udf_string st; + }; + + // Single heap allocation for both structures + mi_str_allocation* heap_allocation = (mi_str_allocation*)NRT_Allocate(sizeof(mi_str_allocation)); + + NRT_MemInfo* mi_ptr = &(heap_allocation->mi); + udf_string* heap_str_ptr = &(heap_allocation->st); + + // Initialize MemInfo pointing to co-allocated string + NRT_MemInfo_init(mi_ptr, heap_str_ptr, 0, udf_str_dtor, NULL); + + // Copy string data to heap location + memcpy(heap_str_ptr, udf_str, sizeof(udf_string)); + + return mi_ptr; +} +``` + +`mi_str_allocation` is similar in structure to `ManagedUDFString` but has +a `MemInfo` struct value as its first member rather than a pointer. + +### Phase 3: Object Assembly and Return + +**3.1 Final Assembly** +```python +managed = cgutils.create_struct_proxy(managed_udf_string)(context, builder) +managed.meminfo = meminfo # Points to heap MemInfo +return managed._getvalue() +``` + +**3.2 Current Memory State** +- **Stack**: `ManagedUDFString` struct with valid `meminfo` pointer and `udf_string` data +- **Heap**: Co-allocated MemInfo and udf_string structures +- **GPU Memory**: String data owned by heap-allocated udf_string +- **Reference Count**: 1 (object just created) + +### Phase 4: Runtime Usage and Reference Management + +**4.1 Assignment Operations** + +Within the broader kernel being launched, the result of the overall UDF is +assigned: + +```python +result = my_udf(input_string) +``` + +At this point, `result` is a fully initialized `ManagedUDFString`: + +- Numba detects assignment of reference counted return value +- Automatically inserts `NRT_incref(managed.meminfo)` +- `heap_allocation->mi.refct` becomes 2 +- passed_udf exits, causing an `NRT_decref(managed.meminfo)`. + +**4.2 Setitem into the final array** + +The final line of the containing kernel sets the result into the output +array: + +``` +output_string_ary[tid] = result +``` + +- Adds an incref, bumping the refcount back up to 2. + + +### Phase 5: Destruction Sequence + +**5.1 Final Reference Release** + +The kernel being launched is ultimately overall a `void` function. Any +variables contained locally therein will be decref'd at function's exit, +like any other function. + +- `result` variable decref'd, but still referred to by the output array +- `heap_allocation->mi.refct` becomes 1 + + +**5.2 Destructor Execution** + +The function `column_from_managed_udf_string_array` creates a `cudf::column` +from the output buffer containing the strings. cuDF launches a freeing kernel +that decrefs all the result strings one last time: + +```python +def free_managed_udf_string_array(ary, size): + gid = cuda.grid(1) + if gid < size: + NRT_decref(ary[gid]) +``` + +- `NRT_MemInfo_call_dtor` invokes the destructor for the object + +```c++ +__device__ void udf_str_dtor(void* udf_str, size_t size, void* dtor_info) { + auto ptr = reinterpret_cast(udf_str); + ptr->~udf_string(); +} +``` + +- A `MemInfo` dies after invoking its destructor - the NRT API ensures that +once this is done, the originally `NRT_Allocat`ed pointer is freed. This has +the effect of freeing the entire `mi_str_allocation`. + + +**5.3 Final Memory State** +- **GPU String Memory**: Freed +- **Heap MemInfo Block**: Freed +- **Stack**: Original `ManagedUDFString` becomes invalid/out-of-scope +- **Reference Count**: N/A (object destroyed) +- **cuDF** A `cudf::column` of string type containing the result of the UDF diff --git a/python/cudf/cudf/_lib/strings_udf.pyx b/python/cudf/cudf/_lib/strings_udf.pyx index 6010969192c..431930664de 100644 --- a/python/cudf/cudf/_lib/strings_udf.pyx +++ b/python/cudf/cudf/_lib/strings_udf.pyx @@ -7,13 +7,12 @@ from libcpp.utility cimport move from pylibcudf cimport Column as plc_Column from pylibcudf.libcudf.column.column cimport column, column_view from pylibcudf.libcudf.strings_udf cimport ( - column_from_udf_string_array as cpp_column_from_udf_string_array, - free_udf_string_array as cpp_free_udf_string_array, + column_from_managed_udf_string_array as cpp_column_from_managed_udf_string_array, get_character_cases_table as cpp_get_character_cases_table, get_character_flags_table as cpp_get_character_flags_table, get_special_case_mapping_table as cpp_get_special_case_mapping_table, to_string_view_array as cpp_to_string_view_array, - udf_string, + managed_udf_string, ) from rmm.librmm.device_buffer cimport device_buffer from rmm.pylibrmm.device_buffer cimport DeviceBuffer @@ -30,14 +29,13 @@ def column_to_string_view_array(plc_Column strings_col): return DeviceBuffer.c_from_unique_ptr(move(c_buffer)) -def column_from_udf_string_array(DeviceBuffer d_buffer): - cdef size_t size = int(d_buffer.c_size() / sizeof(udf_string)) - cdef udf_string* data = d_buffer.c_data() +def column_from_managed_udf_string_array(DeviceBuffer d_buffer): + cdef size_t size = int(d_buffer.c_size() / sizeof(managed_udf_string)) + cdef managed_udf_string* data = d_buffer.c_data() cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_column_from_udf_string_array(data, size)) - cpp_free_udf_string_array(data, size) + c_result = move(cpp_column_from_managed_udf_string_array(data, size)) return plc_Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 4178cecefc9..ad6c23b2988 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -58,6 +58,7 @@ from cudf.core.resample import _Resampler from cudf.core.udf.utils import ( _get_input_args_from_frame, + _make_free_string_kernel, _return_arr_from_dtype, ) from cudf.core.window import ExponentialMovingWindow, Rolling @@ -3448,8 +3449,11 @@ def _apply(self, func, kernel_class, *args, **kwargs): if retty == CUDF_STRING_DTYPE: col = ColumnBase.from_pylibcudf( - strings_udf.column_from_udf_string_array(ans_col) + strings_udf.column_from_managed_udf_string_array(ans_col) ) + free_kernel = _make_free_string_kernel() + with _CUDFNumbaConfig(): + free_kernel.forall(len(col))(ans_col, len(col)) else: col = as_column(ans_col, retty) diff --git a/python/cudf/cudf/core/udf/masked_lowering.py b/python/cudf/cudf/core/udf/masked_lowering.py index ae09294e3f9..fb561cdc306 100644 --- a/python/cudf/cudf/core/udf/masked_lowering.py +++ b/python/cudf/cudf/core/udf/masked_lowering.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. import operator @@ -23,6 +23,7 @@ NAType, _supported_masked_types, ) +from cudf.core.udf.strings_typing import managed_udf_string @cuda_lowering_registry.lower_constant(NAType) @@ -277,6 +278,13 @@ def masked_scalar_is_null_impl(context, builder, sig, args): # else packs it up into a new one that is valid from the get go @cuda_lower(api.pack_return, MaskedType) def pack_return_masked_impl(context, builder, sig, args): + # Must incref any managed object we return from + # a handwritten lowering function + if sig.args[0].value_type is managed_udf_string: + struct = cgutils.create_struct_proxy(MaskedType(managed_udf_string))( + context, builder, value=args[0] + ) + context.nrt.incref(builder, managed_udf_string, struct.value) return args[0] @@ -285,6 +293,13 @@ def pack_return_masked_impl(context, builder, sig, args): @cuda_lower(api.pack_return, types.NPDatetime) @cuda_lower(api.pack_return, types.NPTimedelta) def pack_return_scalar_impl(context, builder, sig, args): + # Must incref any managed object we return from + # a handwritten lowering function + if sig.args[0] is managed_udf_string: + string = cgutils.create_struct_proxy(MaskedType(managed_udf_string))( + context, builder, value=args[0] + ) + context.nrt.incref(builder, managed_udf_string, string) outdata = cgutils.create_struct_proxy(sig.return_type)(context, builder) outdata.value = args[0] outdata.valid = context.get_constant(types.boolean, 1) diff --git a/python/cudf/cudf/core/udf/masked_typing.py b/python/cudf/cudf/core/udf/masked_typing.py index 98d5af90ad9..220fea7b04f 100644 --- a/python/cudf/cudf/core/udf/masked_typing.py +++ b/python/cudf/cudf/core/udf/masked_typing.py @@ -31,16 +31,17 @@ ) from cudf.core.udf.nrt_utils import _current_nrt_context from cudf.core.udf.strings_typing import ( + ManagedUDFString, StringView, UDFString, bool_binary_funcs, id_unary_funcs, int_binary_funcs, + managed_udf_string, size_type, string_return_attrs, string_unary_funcs, string_view, - udf_string, ) from cudf.utils.dtypes import ( DATETIME_TYPES, @@ -63,7 +64,7 @@ | _datetime_cases | _timedelta_cases | {types.boolean} - | {string_view, udf_string} + | {string_view, managed_udf_string} ) @@ -74,6 +75,7 @@ types.NPTimedelta, StringView, UDFString, + ManagedUDFString, ) @@ -479,7 +481,9 @@ def generic(self, args, kws): def _is_valid_string_arg(ty): return ( isinstance(ty, MaskedType) - and isinstance(ty.value_type, (StringView, UDFString)) + and isinstance( + ty.value_type, (StringView, UDFString, ManagedUDFString) + ) ) or isinstance(ty, types.StringLiteral) @@ -514,7 +518,7 @@ def len_typing(self, args, kws): def concat_typing(self, args, kws): if _is_valid_string_arg(args[0]) and _is_valid_string_arg(args[1]): return nb_signature( - MaskedType(udf_string), + MaskedType(managed_udf_string), MaskedType(string_view), MaskedType(string_view), ) @@ -610,7 +614,7 @@ class MaskedStringViewReplace(AbstractTemplate): def generic(self, args, kws): return nb_signature( - MaskedType(udf_string), + MaskedType(managed_udf_string), MaskedType(string_view), MaskedType(string_view), recvr=self.this, @@ -656,7 +660,7 @@ def resolve_valid(self, mod): setattr( MaskedStringViewAttrs, f"resolve_{func}", - create_masked_binary_attr(f"MaskedType.{func}", udf_string), + create_masked_binary_attr(f"MaskedType.{func}", managed_udf_string), ) for func in id_unary_funcs: @@ -670,16 +674,16 @@ def resolve_valid(self, mod): setattr( MaskedStringViewAttrs, f"resolve_{func}", - create_masked_unary_attr(f"MaskedType.{func}", udf_string), + create_masked_unary_attr(f"MaskedType.{func}", managed_udf_string), ) -class MaskedUDFStringAttrs(MaskedStringViewAttrs): - key = MaskedType(udf_string) +class MaskedManagedUDFStringAttrs(MaskedStringViewAttrs): + key = MaskedType(managed_udf_string) def resolve_value(self, mod): - return udf_string + return managed_udf_string cuda_decl_registry.register_attr(MaskedStringViewAttrs) -cuda_decl_registry.register_attr(MaskedUDFStringAttrs) +cuda_decl_registry.register_attr(MaskedManagedUDFStringAttrs) diff --git a/python/cudf/cudf/core/udf/strings_lowering.py b/python/cudf/cudf/core/udf/strings_lowering.py index 3c02ee52b25..4cb755785e8 100644 --- a/python/cudf/cudf/core/udf/strings_lowering.py +++ b/python/cudf/cudf/core/udf/strings_lowering.py @@ -1,11 +1,11 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. import operator from functools import partial +from llvmlite import ir from numba import cuda, types from numba.core import cgutils -from numba.core.datamodel import default_manager from numba.core.typing import signature as nb_signature from numba.cuda.cudaimpl import ( lower as cuda_lower, @@ -18,7 +18,13 @@ get_special_case_mapping_table_ptr, ) from cudf.core.udf.masked_typing import MaskedType -from cudf.core.udf.strings_typing import size_type, string_view, udf_string +from cudf.core.udf.strings_typing import ( + NRT_decref, + managed_udf_string, + size_type, + string_view, + udf_string, +) _STR_VIEW_PTR = types.CPointer(string_view) _UDF_STRING_PTR = types.CPointer(udf_string) @@ -27,13 +33,17 @@ # CUDA function declarations # read-only (input is a string_view, output is a fixed with type) _string_view_len = cuda.declare_device("len", size_type(_STR_VIEW_PTR)) + + _concat_string_view = cuda.declare_device( - "concat", types.void(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR) + "concat", types.voidptr(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR) ) _string_view_replace = cuda.declare_device( "replace", - types.void(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR), + types.voidptr( + _UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR + ), ) @@ -47,7 +57,7 @@ def _declare_binary_func(lhs, rhs, out, name): def _declare_strip_func(name): return cuda.declare_device( - name, size_type(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR) + name, types.voidptr(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR) ) @@ -86,7 +96,7 @@ def _declare_strip_func(name): def _declare_upper_or_lower(func): return cuda.declare_device( func, - types.void( + types.voidptr( _UDF_STRING_PTR, _STR_VIEW_PTR, types.uintp, @@ -136,29 +146,52 @@ def cast_string_literal_to_string_view(context, builder, fromty, toty, val): return sv._getvalue() -@cuda_lowering_registry.lower_cast(string_view, udf_string) -def cast_string_view_to_udf_string(context, builder, fromty, toty, val): - sv_ptr = builder.alloca(default_manager[fromty].get_value_type()) - udf_str_ptr = builder.alloca(default_manager[toty].get_value_type()) +@cuda_lowering_registry.lower_cast(string_view, managed_udf_string) +def cast_string_view_to_managed_udf_string( + context, builder, fromty, toty, val +): + sv_ptr = builder.alloca( + context.data_model_manager[fromty].get_value_type() + ) + + managed_ptr = builder.alloca( + context.data_model_manager[managed_udf_string].get_value_type() + ) + udf_str_ptr = builder.gep( + managed_ptr, [ir.IntType(32)(0), ir.IntType(32)(1)] + ) + builder.store(val, sv_ptr) - _ = context.compile_internal( + meminfo = context.compile_internal( builder, call_create_udf_string_from_string_view, - nb_signature(types.void, _STR_VIEW_PTR, types.CPointer(udf_string)), + nb_signature(types.voidptr, _STR_VIEW_PTR, types.CPointer(udf_string)), (sv_ptr, udf_str_ptr), ) - result = cgutils.create_struct_proxy(udf_string)( - context, builder, value=builder.load(udf_str_ptr) + managed = cgutils.create_struct_proxy(managed_udf_string)( + context, + builder, + value=builder.load(managed_ptr), ) + managed.meminfo = meminfo + + return managed._getvalue() - return result._getvalue() +@cuda_lowering_registry.lower_cast(managed_udf_string, string_view) +def cast_managed_udf_string_to_string_view( + context, builder, fromty, toty, val +): + sv_ptr = builder.alloca(context.data_model_manager[toty].get_value_type()) -@cuda_lowering_registry.lower_cast(udf_string, string_view) -def cast_udf_string_to_string_view(context, builder, fromty, toty, val): - udf_str_ptr = builder.alloca(default_manager[fromty].get_value_type()) - sv_ptr = builder.alloca(default_manager[toty].get_value_type()) - builder.store(val, udf_str_ptr) + managed_ptr = builder.alloca( + context.data_model_manager[fromty].get_value_type() + ) + builder.store(val, managed_ptr) + + udf_str_ptr = builder.gep( + managed_ptr, [ir.IntType(32)(0), ir.IntType(32)(1)] + ) context.compile_internal( builder, @@ -166,15 +199,13 @@ def cast_udf_string_to_string_view(context, builder, fromty, toty, val): nb_signature(types.void, _UDF_STRING_PTR, _STR_VIEW_PTR), (udf_str_ptr, sv_ptr), ) - result = cgutils.create_struct_proxy(string_view)( context, builder, value=builder.load(sv_ptr) ) - return result._getvalue() -# utilities +# Utilities and Casts _create_udf_string_from_string_view = cuda.declare_device( "udf_string_from_string_view", types.void(_STR_VIEW_PTR, _UDF_STRING_PTR), @@ -193,6 +224,23 @@ def call_create_string_view_from_udf_string(udf_str, sv): _create_string_view_from_udf_string(udf_str, sv) +# Return string setitem impl with an extra incref +@cuda_lower( + operator.setitem, + types.CPointer(managed_udf_string), + types.Integer, + types.Any, +) +def setitem_cpointer_managed_udf_string(context, builder, sig, args): + base_ptr, idx, val = args + elem_ptr = builder.gep(base_ptr, [idx]) + builder.store(val, elem_ptr) + # Storing a Managed UDF String in a CPointer array effectively creates a + # new reference; represent this by incrementing the refcount of the source + # of the assignment + context.nrt.incref(builder, managed_udf_string, val) + + # String function implementations def call_len_string_view(st): return _string_view_len(st) @@ -212,6 +260,19 @@ def len_impl(context, builder, sig, args): return result +@cuda_lower(NRT_decref, managed_udf_string) +def decref_managed_udf_string(context, builder, sig, args): + managed_ptr = args[0] + managed = cgutils.create_struct_proxy(managed_udf_string)( + context, builder, value=managed_ptr + ) + fnty = ir.FunctionType(ir.VoidType(), [ir.PointerType(ir.IntType(8))]) + fn = cgutils.get_or_insert_function(builder.module, fnty, "NRT_decref") + builder.call(fn, (managed.meminfo,)) + + return + + def call_concat_string_view(result, lhs, rhs): return _concat_string_view(result, lhs, rhs) @@ -223,18 +284,29 @@ def concat_impl(context, builder, sig, args): builder.store(args[0], lhs_ptr) builder.store(args[1], rhs_ptr) - udf_str_ptr = builder.alloca(default_manager[udf_string].get_value_type()) - _ = context.compile_internal( + managed_ptr = builder.alloca( + context.data_model_manager[managed_udf_string].get_value_type() + ) + + udf_str_ptr = builder.gep( + managed_ptr, [ir.IntType(32)(0), ir.IntType(32)(1)] + ) + + meminfo = context.compile_internal( builder, call_concat_string_view, - types.void(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR), + types.voidptr(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR), (udf_str_ptr, lhs_ptr, rhs_ptr), ) - result = cgutils.create_struct_proxy(udf_string)( - context, builder, value=builder.load(udf_str_ptr) + managed = cgutils.create_struct_proxy(managed_udf_string)( + context, + builder, + value=builder.load(managed_ptr), ) - return result._getvalue() + managed.meminfo = meminfo + + return managed._getvalue() def call_string_view_replace(result, src, to_replace, replacement): @@ -242,7 +314,6 @@ def call_string_view_replace(result, src, to_replace, replacement): @cuda_lower("StringView.replace", string_view, string_view, string_view) -@cuda_lower("UDFString.replace", string_view, string_view, string_view) def replace_impl(context, builder, sig, args): src_ptr = builder.alloca(args[0].type) to_replace_ptr = builder.alloca(args[1].type) @@ -252,21 +323,28 @@ def replace_impl(context, builder, sig, args): builder.store(args[1], to_replace_ptr) builder.store(args[2], replacement_ptr) - udf_str_ptr = builder.alloca(default_manager[udf_string].get_value_type()) + managed_ptr = builder.alloca( + context.data_model_manager[managed_udf_string].get_value_type() + ) + udf_str_ptr = builder.gep( + managed_ptr, [ir.IntType(32)(0), ir.IntType(32)(1)] + ) - _ = context.compile_internal( + meminfo = context.compile_internal( builder, call_string_view_replace, - types.void( + types.voidptr( _UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR ), (udf_str_ptr, src_ptr, to_replace_ptr, replacement_ptr), ) - - result = cgutils.create_struct_proxy(udf_string)( - context, builder, value=builder.load(udf_str_ptr) + managed = cgutils.create_struct_proxy(managed_udf_string)( + context, + builder, + value=builder.load(managed_ptr), ) - return result._getvalue() + managed.meminfo = meminfo + return managed._getvalue() def create_binary_string_func(binary_func, retty): @@ -305,19 +383,34 @@ def binary_func_impl(context, builder, sig, args): # value of compile_internal is therefore discarded (although # this may change in the future if we need to return error # codes, for instance). - udf_str_ptr = builder.alloca( - default_manager[udf_string].get_value_type() + + managed_ptr = builder.alloca( + context.data_model_manager[ + managed_udf_string + ].get_value_type() ) - _ = context.compile_internal( + udf_str_ptr = builder.gep( + managed_ptr, [ir.IntType(32)(0), ir.IntType(32)(1)] + ) + + meminfo = context.compile_internal( builder, cuda_func, - size_type(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR), + types.voidptr( + _UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR + ), (udf_str_ptr, lhs_ptr, rhs_ptr), ) - result = cgutils.create_struct_proxy(udf_string)( - context, builder, value=builder.load(udf_str_ptr) + managed = cgutils.create_struct_proxy(managed_udf_string)( + context, + builder, + value=builder.load( + managed_ptr + ), # {i8*, {i8*, i32, i32}}* -> {i8*, {i8*, i32, i32}} ) - return result._getvalue() + managed.meminfo = meminfo + + return managed._getvalue() # binary_func can be attribute-like: str.binary_func # or operator-like: binary_func(str, other) @@ -326,7 +419,7 @@ def binary_func_impl(context, builder, sig, args): f"StringView.{binary_func}", string_view, string_view )(binary_func_impl) binary_func_impl = cuda_lower( - f"UDFString.{binary_func}", string_view, string_view + f"ManagedUDFString.{binary_func}", string_view, string_view )(binary_func_impl) else: binary_func_impl = cuda_lower( @@ -473,14 +566,17 @@ def id_func_impl(context, builder, sig, args): special_tbl_ptr = context.get_constant( types.uintp, get_special_case_mapping_table_ptr() ) - udf_str_ptr = builder.alloca( - default_manager[udf_string].get_value_type() - ) - _ = context.compile_internal( + managed_ptr = builder.alloca( + context.data_model_manager[managed_udf_string].get_value_type() + ) + udf_str_ptr = builder.gep( + managed_ptr, [ir.IntType(32)(0), ir.IntType(32)(1)] + ) + meminfo = context.compile_internal( builder, cuda_func, - types.void( + types.voidptr( _UDF_STRING_PTR, _STR_VIEW_PTR, types.uintp, @@ -495,11 +591,15 @@ def id_func_impl(context, builder, sig, args): special_tbl_ptr, ), ) - - result = cgutils.create_struct_proxy(udf_string)( - context, builder, value=builder.load(udf_str_ptr) + managed = cgutils.create_struct_proxy(managed_udf_string)( + context, + builder, + value=builder.load( + managed_ptr + ), # {i8*, {i8*, i32, i32}}* -> {i8*, {i8*, i32, i32}} ) - return result._getvalue() + managed.meminfo = meminfo + return managed._getvalue() return id_func_impl diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index a0cbe7ada19..e065c0ad75d 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -1,13 +1,15 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. import operator import numpy as np from numba import types +from numba.core import cgutils from numba.core.extending import models, register_model from numba.core.typing import signature as nb_signature from numba.core.typing.templates import AbstractTemplate, AttributeTemplate from numba.cuda.cudadecl import registry as cuda_decl_registry +from numba.cuda.descriptor import cuda_target import rmm @@ -23,7 +25,18 @@ def __init__(self): super().__init__(name="udf_string") @property - def return_type(self): + def return_as(self): + return self + + +class ManagedUDFString(types.Type): + np_dtype = np.dtype("object") + + def __init__(self): + super().__init__(name="managed_udf_string") + + @property + def return_as(self): return self @@ -34,8 +47,8 @@ def __init__(self): super().__init__(name="string_view") @property - def return_type(self): - return UDFString() + def return_as(self): + return ManagedUDFString() @register_model(StringView) @@ -75,11 +88,31 @@ def __init__(self, dmm, fe_type): super().__init__(dmm, fe_type, self._members) -any_string_ty = (StringView, UDFString, types.StringLiteral) -string_view = StringView() udf_string = UDFString() +@register_model(ManagedUDFString) +class managed_udf_string_model(models.StructModel): + _members = (("meminfo", types.voidptr), ("udf_string", udf_string)) + + def __init__(self, dmm, fe_type): + super().__init__(dmm, fe_type, self._members) + + def has_nrt_meminfo(self): + return True + + def get_nrt_meminfo(self, builder, value): + udf_str_and_meminfo = cgutils.create_struct_proxy(managed_udf_string)( + cuda_target.target_context, builder, value=value + ) + return udf_str_and_meminfo.meminfo + + +managed_udf_string = ManagedUDFString() +any_string_ty = (StringView, UDFString, ManagedUDFString, types.StringLiteral) +string_view = StringView() + + class StrViewArgHandler: """ As part of Numba's preprocessing step, incoming function arguments are @@ -96,7 +129,7 @@ class StrViewArgHandler: def prepare_args(self, ty, val, **kwargs): if isinstance(ty, types.CPointer) and isinstance( - ty.dtype, (StringView, UDFString) + ty.dtype, (StringView, UDFString, ManagedUDFString) ): return types.uint64, val.ptr if isinstance( val, rmm.pylibrmm.device_buffer.DeviceBuffer @@ -124,6 +157,17 @@ def generic(self, args, kws): return nb_signature(size_type, string_view) +def NRT_decref(st): + pass + + +@cuda_decl_registry.register_global(NRT_decref) +class NRT_decref_typing(AbstractTemplate): + def generic(self, args, kws): + if isinstance(args[0], ManagedUDFString): + return nb_signature(types.void, managed_udf_string) + + def register_stringview_binaryop(op, retty): """ Helper function wrapping numba's low level extension API. Provides @@ -191,7 +235,7 @@ class StringViewReplace(AbstractTemplate): def generic(self, args, kws): return nb_signature( - udf_string, string_view, string_view, recvr=self.this + managed_udf_string, string_view, string_view, recvr=self.this ) @@ -232,7 +276,7 @@ def resolve_replace(self, mod): setattr( StringViewAttrs, f"resolve_{func}", - create_binary_attr(func, udf_string), + create_binary_attr(func, managed_udf_string), ) @@ -252,17 +296,17 @@ def resolve_replace(self, mod): setattr( StringViewAttrs, f"resolve_{func}", - create_identifier_attr(func, udf_string), + create_identifier_attr(func, managed_udf_string), ) @cuda_decl_registry.register_attr -class UDFStringAttrs(StringViewAttrs): - key = udf_string +class ManagedUDFStringAttrs(StringViewAttrs): + key = managed_udf_string cuda_decl_registry.register_attr(StringViewAttrs) -cuda_decl_registry.register_attr(UDFStringAttrs) +cuda_decl_registry.register_attr(ManagedUDFStringAttrs) register_stringview_binaryop(operator.eq, types.boolean) register_stringview_binaryop(operator.ne, types.boolean) @@ -275,4 +319,4 @@ class UDFStringAttrs(StringViewAttrs): register_stringview_binaryop(operator.contains, types.boolean) # st + other -register_stringview_binaryop(operator.add, udf_string) +register_stringview_binaryop(operator.add, managed_udf_string) diff --git a/python/cudf/cudf/core/udf/udf_kernel_base.py b/python/cudf/cudf/core/udf/udf_kernel_base.py index 49355ddb7a0..d0c248bfb12 100644 --- a/python/cudf/cudf/core/udf/udf_kernel_base.py +++ b/python/cudf/cudf/core/udf/udf_kernel_base.py @@ -111,7 +111,7 @@ def _get_udf_return_type(self): else: result = numpy_support.from_dtype(np.dtype(output_type)) - result = result if result.is_internal else result.return_type + result = result if result.is_internal else result.return_as # _get_udf_return_type will throw a TypingError if the user tries to use # a field in the row containing an unsupported dtype, except in the diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 0d36ab61d8b..682f5267beb 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -15,16 +15,19 @@ from numba.core.datamodel import default_manager, models from numba.core.extending import register_model from numba.np import numpy_support -from numba.types import CPointer, Record, Tuple +from numba.types import CPointer, Record, Tuple, int64, void import rmm from cudf._lib import strings_udf from cudf.core.buffer import as_buffer from cudf.core.udf.masked_typing import MaskedType +from cudf.core.udf.nrt_utils import nrt_enabled from cudf.core.udf.strings_typing import ( + NRT_decref, + managed_udf_string, + str_view_arg_handler, string_view, - udf_string, ) from cudf.utils.dtypes import ( BOOL_TYPES, @@ -252,10 +255,29 @@ def _get_input_args_from_frame(fr: IndexedFrame) -> list: def _return_arr_from_dtype(dtype, size): if dtype == CUDF_STRING_DTYPE: - return rmm.DeviceBuffer(size=size * _get_extensionty_size(udf_string)) + return rmm.DeviceBuffer( + size=size * _get_extensionty_size(managed_udf_string) + ) return cp.empty(size, dtype=dtype) +@functools.cache +def _make_free_string_kernel(): + with nrt_enabled(): + + @cuda.jit( + void(CPointer(managed_udf_string), int64), + link=[UDF_SHIM_FILE], + extensions=[str_view_arg_handler], + ) + def free_managed_udf_string_array(ary, size): + gid = cuda.grid(1) + if gid < size: + NRT_decref(ary[gid]) + + return free_managed_udf_string_array + + # The only supported data layout in NVVM. # See: https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html?#data-layout _nvvm_data_layout = ( diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index dea3ca5c18c..43a3ae44f06 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -21,8 +21,14 @@ import cudf from cudf.core.column.column import as_column -from cudf.core.udf.strings_lowering import cast_string_view_to_udf_string -from cudf.core.udf.strings_typing import StringView, string_view, udf_string +from cudf.core.udf.strings_lowering import ( + cast_string_view_to_managed_udf_string, +) +from cudf.core.udf.strings_typing import ( + StringView, + managed_udf_string, + string_view, +) from cudf.utils import dtypes as dtypeutils from cudf.utils.temporal import unit_to_nanoseconds_conversion @@ -343,9 +349,9 @@ def expect_warning_if(condition, warning=FutureWarning, *args, **kwargs): yield -def sv_to_udf_str(sv): +def sv_to_managed_udf_str(sv): """ - Cast a string_view object to a udf_string object + Cast a string_view object to a managed_udf_string object This placeholder function never runs in python It exists only for numba to have something to replace @@ -362,16 +368,16 @@ def sv_to_udf_str(sv): pass -@cuda_decl_registry.register_global(sv_to_udf_str) +@cuda_decl_registry.register_global(sv_to_managed_udf_str) class StringViewToUDFStringDecl(AbstractTemplate): def generic(self, args, kws): if isinstance(args[0], StringView) and len(args) == 1: - return nb_signature(udf_string, string_view) + return nb_signature(managed_udf_string, string_view) -@cuda_lower(sv_to_udf_str, string_view) +@cuda_lower(sv_to_managed_udf_str, string_view) def sv_to_udf_str_testing_lowering(context, builder, sig, args): - return cast_string_view_to_udf_string( + return cast_string_view_to_managed_udf_string( context, builder, sig.args[0], sig.return_type, args[0] ) diff --git a/python/cudf/cudf/tests/test_nrt_stats.py b/python/cudf/cudf/tests/test_nrt_stats.py new file mode 100644 index 00000000000..e951374f5a0 --- /dev/null +++ b/python/cudf/cudf/tests/test_nrt_stats.py @@ -0,0 +1,93 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +from numba import config +from numba.cuda.memory_management.nrt import rtsys + +import cudf +from cudf._lib import strings_udf +from cudf.core.column import ColumnBase, as_column +from cudf.core.udf.scalar_function import SeriesApplyKernel +from cudf.core.udf.utils import ( + _get_input_args_from_frame, + _make_free_string_kernel, + _return_arr_from_dtype, +) +from cudf.utils._numba import _CUDFNumbaConfig + + +def test_string_udf_basic(monkeypatch): + monkeypatch.setattr(config, "CUDA_NRT_STATS", True) + + def double(st): + return st + st + + sr = cudf.Series(["a", "b", "c"]) + + sr.apply(double) + + stats = rtsys.get_allocation_stats() + + # one meminfo for each string that is later freed + assert stats.mi_alloc - stats.mi_free == 0 + + # one NRT_Allocate call for each string (string heap copy) + # and later its matching free + assert stats.alloc - stats.free == 0 + + +def test_string_udf_conditional_allocations(monkeypatch): + monkeypatch.setattr(config, "CUDA_NRT_STATS", True) + + # One thread allocates an intermediate string + # but the others do not + def double(st): + if st == "b": + return st + st == "BB" + return st == "a" or st == "c" + + sr = cudf.Series(["a", "b", "c"]) + + before_stats = rtsys.get_allocation_stats() + sr.apply(double) + after_stats = rtsys.get_allocation_stats() + + assert after_stats.mi_alloc - before_stats.mi_free == 1 + assert after_stats.alloc - before_stats.free == 1 + + +def test_string_udf_free_kernel(monkeypatch): + monkeypatch.setattr(config, "CUDA_NRT_STATS", True) + + def double(st): + return st + st + + sr = cudf.Series(["a", "b", "c"]) + + kernel, retty = SeriesApplyKernel(sr, double, ()).get_kernel() + + ans_col = _return_arr_from_dtype(retty, len(sr)) + ans_mask = as_column(True, length=len(sr), dtype="bool") + output_args = [(ans_col, ans_mask), len(sr)] + input_args = _get_input_args_from_frame(sr) + launch_args = output_args + input_args + + with _CUDFNumbaConfig(): + kernel.forall(len(sr))(*launch_args) + col = ColumnBase.from_pylibcudf( + strings_udf.column_from_managed_udf_string_array(ans_col) + ) + + # MemInfos that own the strings should still be alive + # and in turn, so should the heap strings + stats = rtsys.get_allocation_stats() + assert stats.mi_alloc - stats.mi_free == len(sr) + assert stats.alloc - stats.free == len(sr) + + # free kernel should equalize all allocations + free_kernel = _make_free_string_kernel() + with _CUDFNumbaConfig(): + free_kernel.forall(len(col))(ans_col, len(col)) + + stats = rtsys.get_allocation_stats() + + assert stats.mi_alloc - stats.mi_free == 0 + assert stats.alloc - stats.free == 0 diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index 087d10b8295..985766b59c7 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. import math import operator @@ -21,7 +21,7 @@ from cudf.testing._utils import ( _decimal_series, parametrize_numeric_dtypes_pairwise, - sv_to_udf_str, + sv_to_managed_udf_str, ) @@ -96,7 +96,7 @@ def row_wrapper(row): # prior to running the input function def udf_string_wrapper(row): masked_udf_str = Masked( - sv_to_udf_str(row["str_col"].value), row["str_col"].valid + sv_to_managed_udf_str(row["str_col"].value), row["str_col"].valid ) return func(masked_udf_str) @@ -967,6 +967,7 @@ def func(row): run_masked_udf_test(func, str_udf_data, check_dtype=False) + @pytest.mark.xfail(reason="Identity function not supported.") def test_string_udf_return_string(self, str_udf_data): def func(row): return row["str_col"] diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 91dcf1111f3..6cfae5a974a 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -125,6 +125,7 @@ requires = [ "libcudf==25.8.*,>=0.0.0a0", "librmm==25.8.*,>=0.0.0a0", "ninja", + "numba-cuda>=0.14.0,<0.15.0a0", "pylibcudf==25.8.*,>=0.0.0a0", "rmm==25.8.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cudf/udf_cpp/CMakeLists.txt b/python/cudf/udf_cpp/CMakeLists.txt index 568feb84571..0c12a022f22 100644 --- a/python/cudf/udf_cpp/CMakeLists.txt +++ b/python/cudf/udf_cpp/CMakeLists.txt @@ -26,12 +26,28 @@ rapids_find_package( INSTALL_EXPORT_SET udf-exports ) +# The below is inspired by legate dataframe +execute_process( + COMMAND "python" -c + "from numba.cuda.memory_management.nrt import get_include; print(get_include())" + OUTPUT_VARIABLE NRT_INCLUDE_DIR + ERROR_VARIABLE NRT_ERROR + RESULT_VARIABLE NRT_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE +) +if(${NRT_RESULT}) + message( + FATAL_ERROR "Error while trying to obtain numba cuda runtime include directory:\n${NRT_ERROR}" + ) +endif() + include(${rapids-cmake-dir}/cpm/cccl.cmake) rapids_cpm_cccl(BUILD_EXPORT_SET udf-exports INSTALL_EXPORT_SET udf-exports) add_library(cudf_strings_udf SHARED strings/src/strings/udf/udf_apis.cu) target_include_directories( cudf_strings_udf PUBLIC "$" + "${NRT_INCLUDE_DIR}" ) set_target_properties( @@ -80,7 +96,7 @@ set_target_properties( target_compile_options(shim PRIVATE "$<$:${SHIM_CUDA_FLAGS}>") target_link_libraries(shim PUBLIC cudf::cudf) target_include_directories( - shim PUBLIC "$" + shim PUBLIC "$" "${NRT_INCLUDE_DIR}" ) install( diff --git a/python/cudf/udf_cpp/shim.cu b/python/cudf/udf_cpp/shim.cu index 5d0aabc3907..535358bfc9a 100644 --- a/python/cudf/udf_cpp/shim.cu +++ b/python/cudf/udf_cpp/shim.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,11 +26,55 @@ #include #include +#include + #include #include using namespace cudf::strings::udf; +/** + * @brief Destructor for a udf_string object. + * + * NRT API compatible destructor for udf_string objects. + * + * @param udf_str Pointer to the udf_string object to be destructed. + * @param size Size of the udf_string object (not used). + * @param dtor_info Additional information for the destructor (not used). + */ +__device__ void udf_str_dtor(void* udf_str, size_t size, void* dtor_info) +{ + auto ptr = reinterpret_cast(udf_str); + ptr->~udf_string(); +} + +__device__ NRT_MemInfo* make_meminfo_for_new_udf_string(udf_string* udf_str) +{ + // only used in the context of this function + struct mi_str_allocation { + NRT_MemInfo mi; + udf_string st; + }; + + mi_str_allocation* mi_and_str = (mi_str_allocation*)NRT_Allocate(sizeof(mi_str_allocation)); + if (mi_and_str != NULL) { + auto mi_ptr = &(mi_and_str->mi); + udf_string* st_ptr = &(mi_and_str->st); + + // udf_str_dtor can destruct the string without knowing the size + size_t size = 0; + NRT_MemInfo_init(mi_ptr, st_ptr, size, udf_str_dtor, NULL); + + // copy the udf_string to the allocated heap space + udf_string* in_str_ptr = reinterpret_cast(udf_str); + memcpy(st_ptr, in_str_ptr, sizeof(udf_string)); + return mi_ptr; + } else { + __trap(); + return nullptr; + } +} + extern "C" __device__ int len(int* nb_retval, void const* str) { auto sv = reinterpret_cast(str); @@ -227,14 +271,13 @@ extern "C" __device__ int pycount(int* nb_retval, void const* str, void const* s return 0; } -extern "C" __device__ int udf_string_from_string_view(int* nb_retbal, +extern "C" __device__ int udf_string_from_string_view(void** out_meminfo, void const* str, void* udf_str) { auto str_view_ptr = reinterpret_cast(str); - auto udf_str_ptr = new (udf_str) udf_string; - *udf_str_ptr = udf_string(*str_view_ptr); - + auto udf_str_ptr = new (udf_str) udf_string(*str_view_ptr); + *out_meminfo = make_meminfo_for_new_udf_string(udf_str_ptr); return 0; } @@ -243,62 +286,56 @@ extern "C" __device__ int string_view_from_udf_string(int* nb_retval, void* str) { auto udf_str_ptr = reinterpret_cast(udf_str); - auto sv_ptr = new (str) cudf::string_view; - *sv_ptr = cudf::string_view(*udf_str_ptr); - + auto sv_ptr = new (str) cudf::string_view(*udf_str_ptr); return 0; } -extern "C" __device__ int strip(int* nb_retval, +extern "C" __device__ int strip(void** out_meminfo, void* udf_str, void* const* to_strip, void* const* strip_str) { auto to_strip_ptr = reinterpret_cast(to_strip); auto strip_str_ptr = reinterpret_cast(strip_str); - auto udf_str_ptr = new (udf_str) udf_string; - - *udf_str_ptr = strip(*to_strip_ptr, *strip_str_ptr); - + auto udf_str_ptr = new (udf_str) udf_string(strip(*to_strip_ptr, *strip_str_ptr)); + *out_meminfo = make_meminfo_for_new_udf_string(udf_str_ptr); return 0; } -extern "C" __device__ int lstrip(int* nb_retval, +extern "C" __device__ int lstrip(void** out_meminfo, void* udf_str, void* const* to_strip, void* const* strip_str) { auto to_strip_ptr = reinterpret_cast(to_strip); auto strip_str_ptr = reinterpret_cast(strip_str); - auto udf_str_ptr = new (udf_str) udf_string; - - *udf_str_ptr = strip(*to_strip_ptr, *strip_str_ptr, cudf::strings::side_type::LEFT); - + auto udf_str_ptr = + new (udf_str) udf_string(strip(*to_strip_ptr, *strip_str_ptr, cudf::strings::side_type::LEFT)); + *out_meminfo = make_meminfo_for_new_udf_string(udf_str_ptr); return 0; } -extern "C" __device__ int rstrip(int* nb_retval, +extern "C" __device__ int rstrip(void** out_meminfo, void* udf_str, void* const* to_strip, void* const* strip_str) { auto to_strip_ptr = reinterpret_cast(to_strip); auto strip_str_ptr = reinterpret_cast(strip_str); - auto udf_str_ptr = new (udf_str) udf_string; - - *udf_str_ptr = strip(*to_strip_ptr, *strip_str_ptr, cudf::strings::side_type::RIGHT); - + auto udf_str_ptr = + new (udf_str) udf_string(strip(*to_strip_ptr, *strip_str_ptr, cudf::strings::side_type::RIGHT)); + *out_meminfo = make_meminfo_for_new_udf_string(udf_str_ptr); return 0; } -extern "C" __device__ int upper(int* nb_retval, + +extern "C" __device__ int upper(void** out_meminfo, void* udf_str, void const* st, std::uintptr_t flags_table, std::uintptr_t cases_table, std::uintptr_t special_table) { - auto udf_str_ptr = new (udf_str) udf_string; - auto st_ptr = reinterpret_cast(st); + auto st_ptr = reinterpret_cast(st); auto flags_table_ptr = reinterpret_cast(flags_table); @@ -309,20 +346,19 @@ extern "C" __device__ int upper(int* nb_retval, cudf::strings::udf::chars_tables tables{flags_table_ptr, cases_table_ptr, special_table_ptr}; - *udf_str_ptr = to_upper(tables, *st_ptr); - + auto udf_str_ptr = new (udf_str) udf_string(to_upper(tables, *st_ptr)); + *out_meminfo = make_meminfo_for_new_udf_string(udf_str_ptr); return 0; } -extern "C" __device__ int lower(int* nb_retval, +extern "C" __device__ int lower(void** out_meminfo, void* udf_str, void const* st, std::uintptr_t flags_table, std::uintptr_t cases_table, std::uintptr_t special_table) { - auto udf_str_ptr = new (udf_str) udf_string; - auto st_ptr = reinterpret_cast(st); + auto st_ptr = reinterpret_cast(st); auto flags_table_ptr = reinterpret_cast(flags_table); @@ -332,33 +368,39 @@ extern "C" __device__ int lower(int* nb_retval, reinterpret_cast(special_table); cudf::strings::udf::chars_tables tables{flags_table_ptr, cases_table_ptr, special_table_ptr}; - *udf_str_ptr = to_lower(tables, *st_ptr); + + auto udf_str_ptr = new (udf_str) udf_string(to_lower(tables, *st_ptr)); + *out_meminfo = make_meminfo_for_new_udf_string(udf_str_ptr); return 0; } -extern "C" __device__ int concat(int* nb_retval, void* udf_str, void* const* lhs, void* const* rhs) +extern "C" __device__ int concat(void** out_meminfo, + void* udf_str, + void* const* lhs, + void* const* rhs) { auto lhs_ptr = reinterpret_cast(lhs); auto rhs_ptr = reinterpret_cast(rhs); - auto udf_str_ptr = new (udf_str) udf_string; - udf_string result; result.append(*lhs_ptr).append(*rhs_ptr); - *udf_str_ptr = result; + auto udf_str_ptr = new (udf_str) udf_string(std::move(result)); + *out_meminfo = make_meminfo_for_new_udf_string(udf_str_ptr); return 0; } -extern "C" __device__ int replace( - int* nb_retval, void* udf_str, void* const src, void* const to_replace, void* const replacement) +extern "C" __device__ int replace(void** out_meminfo, + void* udf_str, + void* const src, + void* const to_replace, + void* const replacement) { auto src_ptr = reinterpret_cast(src); auto to_replace_ptr = reinterpret_cast(to_replace); auto replacement_ptr = reinterpret_cast(replacement); - auto udf_str_ptr = new (udf_str) udf_string; - *udf_str_ptr = replace(*src_ptr, *to_replace_ptr, *replacement_ptr); - + auto udf_str_ptr = new (udf_str) udf_string(replace(*src_ptr, *to_replace_ptr, *replacement_ptr)); + *out_meminfo = make_meminfo_for_new_udf_string(udf_str_ptr); return 0; } diff --git a/python/cudf/udf_cpp/strings/include/cudf/strings/udf/managed_udf_string.cuh b/python/cudf/udf_cpp/strings/include/cudf/strings/udf/managed_udf_string.cuh new file mode 100644 index 00000000000..fc586045143 --- /dev/null +++ b/python/cudf/udf_cpp/strings/include/cudf/strings/udf/managed_udf_string.cuh @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace cudf::strings::udf { + +/** + * @brief Container for a udf_string and its NRT memory information + * + * `meminfo` is a MemInfo struct from numba-cuda, see: + * https://github.com/NVIDIA/numba-cuda/blob/main/numba_cuda/numba/cuda/memory_management/nrt.cuh + */ +struct managed_udf_string { + void* meminfo; + cudf::strings::udf::udf_string udf_str; +}; + +} // namespace cudf::strings::udf diff --git a/python/cudf/udf_cpp/strings/include/cudf/strings/udf/udf_apis.hpp b/python/cudf/udf_cpp/strings/include/cudf/strings/udf/udf_apis.hpp index 8635b1280de..ba7f05f6d01 100644 --- a/python/cudf/udf_cpp/strings/include/cudf/strings/udf/udf_apis.hpp +++ b/python/cudf/udf_cpp/strings/include/cudf/strings/udf/udf_apis.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include +#include #include @@ -35,7 +36,6 @@ namespace udf { int get_cuda_build_version(); class udf_string; - /** * @brief Return a cudf::string_view array for the given strings column * @@ -50,30 +50,17 @@ class udf_string; std::unique_ptr to_string_view_array(cudf::column_view const input); /** - * @brief Return a STRINGS column given an array of udf_string objects + * @brief Return a strings column given an array of managed_udf_string objects * - * This will make a copy of the strings in d_string in order to build + * This will make a copy of the strings in managed_strings in order to build * the output column. - * The individual udf_strings are also cleared freeing each of their internal - * device memory buffers. - * - * @param d_strings Pointer to device memory of udf_string objects - * @param size The number of elements in the d_strings array - * @return A strings column copy of the udf_string objects - */ -std::unique_ptr column_from_udf_string_array(udf_string* d_strings, - cudf::size_type size); - -/** - * @brief Frees a vector of udf_string objects - * - * The individual udf_strings are cleared freeing each of their internal - * device memory buffers. * - * @param d_strings Pointer to device memory of udf_string objects - * @param size The number of elements in the d_strings array + * @param managed_strings Pointer to device memory of managed_udf_string objects + * @param size The number of elements in the managed_strings array + * @return A strings column copy of the managed_udf_string objects */ -void free_udf_string_array(udf_string* d_strings, cudf::size_type size); +std::unique_ptr column_from_managed_udf_string_array( + managed_udf_string* managed_strings, cudf::size_type size); } // namespace udf } // namespace strings diff --git a/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu b/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu index 6fab2684ce4..710936d2df4 100644 --- a/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu +++ b/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -31,7 +32,9 @@ namespace cudf { namespace strings { namespace udf { + namespace detail { + namespace { /** @@ -39,11 +42,13 @@ namespace { * * No string data is copied. */ -struct udf_string_to_string_view_transform_fn { - __device__ cudf::string_view operator()(cudf::strings::udf::udf_string const& dstr) +struct managed_udf_string_to_string_view_transform_fn { + __device__ cudf::string_view operator()( + cudf::strings::udf::managed_udf_string const& managed_dstr) { - return dstr.data() == nullptr ? cudf::string_view{} - : cudf::string_view{dstr.data(), dstr.size_bytes()}; + return managed_dstr.udf_str.data() == nullptr + ? cudf::string_view{} + : cudf::string_view{managed_dstr.udf_str.data(), managed_dstr.udf_str.size_bytes()}; } }; @@ -64,38 +69,24 @@ std::unique_ptr to_string_view_array(cudf::column_view const } /** - * @copydoc column_from_udf_string_array + * @copydoc column_from_managed_udf_string_array * * @param stream CUDA stream used for allocating/copying device memory and launching kernels */ -std::unique_ptr column_from_udf_string_array(udf_string* d_strings, - cudf::size_type size, - rmm::cuda_stream_view stream) +std::unique_ptr column_from_managed_udf_string_array( + managed_udf_string* managed_strings, cudf::size_type size, rmm::cuda_stream_view stream) { // create string_views of the udf_strings auto indices = rmm::device_uvector(size, stream); thrust::transform(rmm::exec_policy(stream), - d_strings, - d_strings + size, + managed_strings, + managed_strings + size, indices.data(), - udf_string_to_string_view_transform_fn{}); - - return cudf::make_strings_column(indices, cudf::string_view(nullptr, 0), stream); -} + managed_udf_string_to_string_view_transform_fn{}); -/** - * @copydoc free_udf_string_array - * - * @param stream CUDA stream used for allocating/copying device memory and launching kernels - */ -void free_udf_string_array(cudf::strings::udf::udf_string* d_strings, - cudf::size_type size, - rmm::cuda_stream_view stream) -{ - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - size, - [d_strings] __device__(auto idx) { d_strings[idx].clear(); }); + auto result = cudf::make_strings_column(indices, cudf::string_view(nullptr, 0), stream); + stream.synchronize(); + return result; } } // namespace detail @@ -109,15 +100,11 @@ std::unique_ptr to_string_view_array(cudf::column_view const return detail::to_string_view_array(input, cudf::get_default_stream()); } -std::unique_ptr column_from_udf_string_array(udf_string* d_strings, - cudf::size_type size) -{ - return detail::column_from_udf_string_array(d_strings, size, cudf::get_default_stream()); -} - -void free_udf_string_array(udf_string* d_strings, cudf::size_type size) +std::unique_ptr column_from_managed_udf_string_array( + managed_udf_string* managed_strings, cudf::size_type size) { - detail::free_udf_string_array(d_strings, size, cudf::get_default_stream()); + return detail::column_from_managed_udf_string_array( + managed_strings, size, cudf::get_default_stream()); } } // namespace udf diff --git a/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd b/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd index a2654eaab16..996ce3d54ad 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. from libc.stdint cimport uint8_t, uint16_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -10,22 +10,19 @@ from pylibcudf.libcudf.types cimport size_type from rmm.librmm.device_buffer cimport device_buffer - -cdef extern from "cudf/strings/udf/udf_string.hpp" namespace \ +cdef extern from "cudf/strings/udf/managed_udf_string.cuh" namespace \ "cudf::strings::udf" nogil: - cdef cppclass udf_string + cdef cppclass managed_udf_string cdef extern from "cudf/strings/udf/udf_apis.hpp" namespace \ "cudf::strings::udf" nogil: + cdef int get_cuda_build_version() except +libcudf_exception_handler cdef unique_ptr[device_buffer] to_string_view_array( column_view ) except +libcudf_exception_handler - cdef unique_ptr[column] column_from_udf_string_array( - udf_string* strings, size_type size, - ) except +libcudf_exception_handler - cdef void free_udf_string_array( - udf_string* strings, size_type size + cdef unique_ptr[column] column_from_managed_udf_string_array( + managed_udf_string* strings, size_type size, ) except +libcudf_exception_handler cdef extern from "cudf/strings/detail/char_tables.hpp" namespace \