Add interop support from arrow StringView to libcudf strings column #18107

Open · wants to merge 18 commits into base: branch-25.04
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -467,6 +467,7 @@ add_library(
src/interop/to_arrow_host.cu
src/interop/from_arrow_device.cu
src/interop/from_arrow_host.cu
src/interop/from_arrow_host_strings.cu
src/interop/from_arrow_stream.cu
src/interop/to_arrow_schema.cpp
src/io/avro/avro.cpp
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
@@ -289,7 +289,7 @@ ConfigureNVBench(HASHING_NVBENCH hashing/hash.cpp)

# ##################################################################################################
# * interop benchmark ------------------------------------------------------------------------------
ConfigureNVBench(INTEROP_NVBENCH interop/interop.cpp)
ConfigureNVBench(INTEROP_NVBENCH interop/interop.cpp interop/interop_stringview.cpp)
target_link_libraries(INTEROP_NVBENCH PRIVATE nanoarrow)

# ##################################################################################################
71 changes: 71 additions & 0 deletions cpp/benchmarks/interop/interop_stringview.cpp
@@ -0,0 +1,71 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf_test/nanoarrow_utils.hpp>

#include <cudf/interop.hpp>

#include <nvbench/nvbench.cuh>

#include <algorithm>
#include <random>
#include <string>
#include <vector>

void BM_from_arrow_host_stringview(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));

auto stream = cudf::get_default_stream();

std::string characters(max_width, 'x');  // actual data is not important
std::random_device rd;
std::mt19937 generator(rd());
std::uniform_int_distribution<> distribution(min_width, max_width);

ArrowArray input;
NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(&input, NANOARROW_TYPE_STRING_VIEW));
NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(&input));
auto total_size = 0L;
for (auto i = 0; i < num_rows; ++i) {
auto const size = distribution(generator);
auto const ptr = characters.data();
total_size += size;
ArrowArrayAppendString(&input, {ptr, size});
}
NANOARROW_THROW_NOT_OK(
ArrowArrayFinishBuilding(&input, NANOARROW_VALIDATION_LEVEL_NONE, nullptr));

state.add_element_count(num_rows, "num_rows");
state.add_global_memory_reads(total_size);
state.add_global_memory_writes(total_size);
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));

ArrowSchema schema;
ArrowSchemaInitFromType(&schema, NANOARROW_TYPE_STRING_VIEW);

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = cudf::from_arrow_column(&schema, &input);
});
}

NVBENCH_BENCH(BM_from_arrow_host_stringview)
.set_name("from_arrow_host_stringview")
.add_int64_axis("num_rows", {10'000, 100'000, 1'000'000})
.add_int64_axis("min_width", {1})
.add_int64_axis("max_width", {10, 100, 1000});
1 change: 1 addition & 0 deletions cpp/src/interop/arrow_utilities.cpp
@@ -41,6 +41,7 @@ data_type arrow_to_cudf_type(ArrowSchemaView const* arrow_view)
case NANOARROW_TYPE_DOUBLE: return data_type(type_id::FLOAT64);
case NANOARROW_TYPE_DATE32: return data_type(type_id::TIMESTAMP_DAYS);
case NANOARROW_TYPE_STRING:
case NANOARROW_TYPE_STRING_VIEW:
case NANOARROW_TYPE_LARGE_STRING: return data_type(type_id::STRING);
case NANOARROW_TYPE_LIST: return data_type(type_id::LIST);
case NANOARROW_TYPE_DICTIONARY: return data_type(type_id::DICTIONARY32);
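
The effect of the added case can be checked with a small sketch. This is not part of the PR; it assumes it is built inside cpp/src/interop (the header is internal) and that `arrow_to_cudf_type` lives in `cudf::detail` like the surrounding interop sources:

```cpp
// Minimal sketch, not part of this PR: a string-view schema now resolves to the
// same cudf STRING type id as string and large_string schemas.
#include "arrow_utilities.hpp"

#include <nanoarrow/nanoarrow.hpp>

#include <cassert>

void check_string_view_type_mapping()
{
  nanoarrow::UniqueSchema schema;
  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema.get(), NANOARROW_TYPE_STRING_VIEW));

  ArrowSchemaView view;
  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema.get(), nullptr));

  assert(cudf::detail::arrow_to_cudf_type(&view) == cudf::data_type{cudf::type_id::STRING});
}
```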
4 changes: 1 addition & 3 deletions cpp/src/interop/arrow_utilities.hpp
@@ -16,11 +16,9 @@

#pragma once

#include <cudf/column/column_view.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <nanoarrow/nanoarrow.h>

namespace cudf {
110 changes: 14 additions & 96 deletions cpp/src/interop/from_arrow_host.cu
@@ -15,6 +15,7 @@
*/

#include "arrow_utilities.hpp"
#include "from_arrow_host.hpp"

#include <cudf/column/column_factories.hpp>
#include <cudf/column/column_view.hpp>
@@ -119,22 +120,6 @@ struct dispatch_copy_from_arrow_host {
}
};

// forward declaration is needed because `type_dispatch` instantiates the
// dispatch_copy_from_arrow_host struct causing a recursive situation for struct,
// dictionary and list_view types.
//
// This function is simply a convenience wrapper around the dispatch functor with
// some extra handling to avoid having to reproduce it for all of the nested types.
// It also allows us to centralize the location where the recursive calls happen
// so that we only need to forward declare this one function, rather than multiple
// functions which handle the overloads for nested types (list, struct, etc.)
std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
ArrowArray const* input,
data_type type,
bool skip_mask,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

template <>
std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<bool>(ArrowSchemaView* schema,
ArrowArray const* input,
@@ -175,84 +160,7 @@ std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<cudf::string_v
ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask)
{
if (input->length == 0) { return make_empty_column(type_id::STRING); }

// offsets column should contain no nulls so we can put nullptr for the bitmask
// nulls are tracked in the parent string column itself, not in the offsets
void const* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]};
ArrowArray offsets_array = {
.length = input->offset + input->length + 1,
.null_count = 0,
.offset = 0,
.n_buffers = 2,
.n_children = 0,
.buffers = offset_buffers,
};

// chars_column does not contain any nulls, they are tracked by the parent string column
// itself instead. So we pass nullptr for the validity bitmask.
int64_t const char_data_length = [&]() {
if (schema->type == NANOARROW_TYPE_LARGE_STRING) {
return reinterpret_cast<int64_t const*>(offset_buffers[1])[input->length + input->offset];
} else if (schema->type == NANOARROW_TYPE_STRING) {
return static_cast<int64_t>(
reinterpret_cast<int32_t const*>(offset_buffers[1])[input->length + input->offset]);
} else {
CUDF_FAIL("Unsupported string type", cudf::data_type_error);
}
}();
void const* char_buffers[2] = {nullptr, input->buffers[2]};
ArrowArray char_array = {
.length = char_data_length,
.null_count = 0,
.offset = 0,
.n_buffers = 2,
.n_children = 0,
.buffers = char_buffers,
};

nanoarrow::UniqueSchema offset_schema;
NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(offset_schema.get(), NANOARROW_TYPE_INT32));

nanoarrow::UniqueSchema char_data_schema;
NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(char_data_schema.get(), NANOARROW_TYPE_INT8));

// leverage the dispatch overloads for int32 and char(int8) to generate the child
// offset and char data columns for us.
ArrowSchemaView view;
NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, offset_schema.get(), nullptr));
auto offsets_column = [&]() {
if (schema->type == NANOARROW_TYPE_LARGE_STRING) {
return this->operator()<int64_t>(&view, &offsets_array, data_type(type_id::INT64), true);
} else if (schema->type == NANOARROW_TYPE_STRING) {
return this->operator()<int32_t>(&view, &offsets_array, data_type(type_id::INT32), true);
} else {
CUDF_FAIL("Unsupported string type", cudf::data_type_error);
}
}();
NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, char_data_schema.get(), nullptr));

rmm::device_buffer chars(char_data_length, stream, mr);
CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(),
reinterpret_cast<uint8_t const*>(char_array.buffers[1]),
chars.size(),
cudaMemcpyDefault,
stream.value()));
auto const num_rows = offsets_column->size() - 1;
auto out_col = make_strings_column(num_rows,
std::move(offsets_column),
std::move(chars),
input->null_count,
std::move(*get_mask_buffer(input)));

return input->offset == 0
? std::move(out_col)
: std::make_unique<column>(
cudf::detail::slice(out_col->view(),
static_cast<size_type>(input->offset),
static_cast<size_type>(input->offset + input->length),
stream),
stream,
mr);
return string_column_from_arrow_host(schema, input, get_mask_buffer(input), stream, mr);
}

template <>
@@ -378,6 +286,8 @@ std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<cudf::list_vie
mr);
}

} // namespace

std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
ArrowArray const* input,
data_type type,
@@ -400,8 +310,6 @@ std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
input->length);
}

} // namespace

std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
ArrowDeviceArray const* input,
rmm::cuda_stream_view stream,
@@ -457,6 +365,16 @@ std::unique_ptr<column> from_arrow_host_column(ArrowSchema const* schema,
return get_column_copy(&view, &input->array, type, false, stream, mr);
}

std::unique_ptr<column> get_column_from_host_copy(ArrowSchemaView* schema,
ArrowArray const* input,
data_type type,
bool skip_mask,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
return get_column_copy(schema, input, type, skip_mask, stream, mr);
}

} // namespace detail

std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
68 changes: 68 additions & 0 deletions cpp/src/interop/from_arrow_host.hpp
@@ -0,0 +1,68 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/resource_ref.hpp>

#include <nanoarrow/nanoarrow.hpp>

namespace cudf {
namespace detail {

/**
* @brief Utility to build a strings column from a host Arrow array of STRING, LARGE_STRING,
* or STRING_VIEW type
*
* @param schema Arrow schema view that includes the column type
* @param input Arrow array containing the column data, nulls, and offset
* @param mask Null mask to apply to the output column
* @param stream CUDA stream used for device memory operations
* @param mr Device memory resource to use for all device memory allocations
* @return New strings column
*/
std::unique_ptr<column> string_column_from_arrow_host(ArrowSchemaView* schema,
ArrowArray const* input,
std::unique_ptr<rmm::device_buffer>&& mask,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

/**
* @brief Utility to convert an ArrowArray to a cudf column
*
* This function is a convenience wrapper around the dispatch functor, with some
* extra handling to avoid reproducing it for all of the nested types. It also
* centralizes where the recursive calls happen, so only this one function needs
* to be declared here rather than separate functions handling the overloads for
* the nested types (list, struct, etc.)
*
* @param schema Arrow schema view that includes the column type
* @param input Arrow array containing the column data, nulls, and offset
* @param type The cudf column type to map input to
* @param skip_mask True if the mask is handled by the caller
* @param stream CUDA stream used for device memory operations
* @param mr Device memory resource to use for all device memory allocations
* @return New cudf column
*/
std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
ArrowArray const* input,
data_type type,
bool skip_mask,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

} // namespace detail
} // namespace cudf