Add interop support from arrow StringView to libcudf strings column #18107

Open · wants to merge 18 commits into base: branch-25.04
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -467,6 +467,7 @@ add_library(
src/interop/to_arrow_host.cu
src/interop/from_arrow_device.cu
src/interop/from_arrow_host.cu
src/interop/from_arrow_host_strings.cu
src/interop/from_arrow_stream.cu
src/interop/to_arrow_schema.cpp
src/io/avro/avro.cpp
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
@@ -289,7 +289,7 @@ ConfigureNVBench(HASHING_NVBENCH hashing/hash.cpp)

# ##################################################################################################
# * interop benchmark ------------------------------------------------------------------------------
ConfigureNVBench(INTEROP_NVBENCH interop/interop.cpp)
ConfigureNVBench(INTEROP_NVBENCH interop/interop.cpp interop/interop_stringview.cpp)
target_link_libraries(INTEROP_NVBENCH PRIVATE nanoarrow)

# ##################################################################################################
71 changes: 71 additions & 0 deletions cpp/benchmarks/interop/interop_stringview.cpp
@@ -0,0 +1,71 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf_test/nanoarrow_utils.hpp>

#include <cudf/interop.hpp>

#include <nvbench/nvbench.cuh>

#include <algorithm>
#include <random>
#include <string>
#include <vector>

void BM_from_arrow_host_stringview(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));

auto stream = cudf::get_default_stream();

std::string characters(max_width, 'x');  // actual data is not important
std::random_device rd;
std::mt19937 generator(rd());
std::uniform_int_distribution<> distribution(min_width, max_width);

ArrowArray input;
NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(&input, NANOARROW_TYPE_STRING_VIEW));
NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(&input));
auto total_size = 0L;
for (auto i = 0; i < num_rows; ++i) {
auto const size = distribution(generator);
auto const ptr = characters.data();
total_size += size;
ArrowArrayAppendString(&input, {ptr, size});
}
NANOARROW_THROW_NOT_OK(
ArrowArrayFinishBuilding(&input, NANOARROW_VALIDATION_LEVEL_NONE, nullptr));

state.add_element_count(num_rows, "num_rows");
state.add_global_memory_reads(total_size);
state.add_global_memory_writes(total_size);
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));

ArrowSchema schema;
ArrowSchemaInitFromType(&schema, NANOARROW_TYPE_STRING_VIEW);

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = cudf::from_arrow_column(&schema, &input);
});
}

NVBENCH_BENCH(BM_from_arrow_host_stringview)
.set_name("from_arrow_host_stringview")
.add_int64_axis("num_rows", {10'000, 100'000, 1'000'000})
.add_int64_axis("min_width", {1})
.add_int64_axis("max_width", {10, 100, 1000});
1 change: 1 addition & 0 deletions cpp/src/interop/arrow_utilities.cpp
@@ -41,6 +41,7 @@ data_type arrow_to_cudf_type(ArrowSchemaView const* arrow_view)
case NANOARROW_TYPE_DOUBLE: return data_type(type_id::FLOAT64);
case NANOARROW_TYPE_DATE32: return data_type(type_id::TIMESTAMP_DAYS);
case NANOARROW_TYPE_STRING:
case NANOARROW_TYPE_STRING_VIEW:
case NANOARROW_TYPE_LARGE_STRING: return data_type(type_id::STRING);
case NANOARROW_TYPE_LIST: return data_type(type_id::LIST);
case NANOARROW_TYPE_DICTIONARY: return data_type(type_id::DICTIONARY32);
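
The effect of the added case can be checked with a small sketch. This is not part of the PR; it assumes it is built inside cpp/src/interop (the header is internal) and that `arrow_to_cudf_type` lives in `cudf::detail` like the surrounding interop sources:

```cpp
// Minimal sketch, not part of this PR: a string-view schema now resolves to the
// same cudf STRING type id as string and large_string schemas.
#include "arrow_utilities.hpp"

#include <nanoarrow/nanoarrow.hpp>

#include <cassert>

void check_string_view_type_mapping()
{
  nanoarrow::UniqueSchema schema;
  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema.get(), NANOARROW_TYPE_STRING_VIEW));

  ArrowSchemaView view;
  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema.get(), nullptr));

  assert(cudf::detail::arrow_to_cudf_type(&view) == cudf::data_type{cudf::type_id::STRING});
}
```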
4 changes: 1 addition & 3 deletions cpp/src/interop/arrow_utilities.hpp
@@ -16,11 +16,9 @@

#pragma once

#include <cudf/column/column_view.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <nanoarrow/nanoarrow.h>

namespace cudf {
110 changes: 14 additions & 96 deletions cpp/src/interop/from_arrow_host.cu
@@ -15,6 +15,7 @@
*/

#include "arrow_utilities.hpp"
#include "from_arrow_host.hpp"

#include <cudf/column/column_factories.hpp>
#include <cudf/column/column_view.hpp>
@@ -119,22 +120,6 @@ struct dispatch_copy_from_arrow_host {
}
};

// forward declaration is needed because `type_dispatch` instantiates the
// dispatch_copy_from_arrow_host struct causing a recursive situation for struct,
// dictionary and list_view types.
//
// This function is simply a convenience wrapper around the dispatch functor with
// some extra handling to avoid having to reproduce it for all of the nested types.
// It also allows us to centralize the location where the recursive calls happen
// so that we only need to forward declare this one function, rather than multiple
// functions which handle the overloads for nested types (list, struct, etc.)
std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
ArrowArray const* input,
data_type type,
bool skip_mask,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

template <>
std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<bool>(ArrowSchemaView* schema,
ArrowArray const* input,
@@ -175,84 +160,7 @@ std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<cudf::string_v
ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask)
{
if (input->length == 0) { return make_empty_column(type_id::STRING); }

// offsets column should contain no nulls so we can put nullptr for the bitmask
// nulls are tracked in the parent string column itself, not in the offsets
void const* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]};
ArrowArray offsets_array = {
.length = input->offset + input->length + 1,
.null_count = 0,
.offset = 0,
.n_buffers = 2,
.n_children = 0,
.buffers = offset_buffers,
};

// chars_column does not contain any nulls, they are tracked by the parent string column
// itself instead. So we pass nullptr for the validity bitmask.
int64_t const char_data_length = [&]() {
if (schema->type == NANOARROW_TYPE_LARGE_STRING) {
return reinterpret_cast<int64_t const*>(offset_buffers[1])[input->length + input->offset];
} else if (schema->type == NANOARROW_TYPE_STRING) {
return static_cast<int64_t>(
reinterpret_cast<int32_t const*>(offset_buffers[1])[input->length + input->offset]);
} else {
CUDF_FAIL("Unsupported string type", cudf::data_type_error);
}
}();
void const* char_buffers[2] = {nullptr, input->buffers[2]};
ArrowArray char_array = {
.length = char_data_length,
.null_count = 0,
.offset = 0,
.n_buffers = 2,
.n_children = 0,
.buffers = char_buffers,
};

nanoarrow::UniqueSchema offset_schema;
NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(offset_schema.get(), NANOARROW_TYPE_INT32));

nanoarrow::UniqueSchema char_data_schema;
NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(char_data_schema.get(), NANOARROW_TYPE_INT8));

// leverage the dispatch overloads for int32 and char(int8) to generate the child
// offset and char data columns for us.
ArrowSchemaView view;
NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, offset_schema.get(), nullptr));
auto offsets_column = [&]() {
if (schema->type == NANOARROW_TYPE_LARGE_STRING) {
return this->operator()<int64_t>(&view, &offsets_array, data_type(type_id::INT64), true);
} else if (schema->type == NANOARROW_TYPE_STRING) {
return this->operator()<int32_t>(&view, &offsets_array, data_type(type_id::INT32), true);
} else {
CUDF_FAIL("Unsupported string type", cudf::data_type_error);
}
}();
NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, char_data_schema.get(), nullptr));

rmm::device_buffer chars(char_data_length, stream, mr);
CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(),
reinterpret_cast<uint8_t const*>(char_array.buffers[1]),
chars.size(),
cudaMemcpyDefault,
stream.value()));
auto const num_rows = offsets_column->size() - 1;
auto out_col = make_strings_column(num_rows,
std::move(offsets_column),
std::move(chars),
input->null_count,
std::move(*get_mask_buffer(input)));

return input->offset == 0
? std::move(out_col)
: std::make_unique<column>(
cudf::detail::slice(out_col->view(),
static_cast<size_type>(input->offset),
static_cast<size_type>(input->offset + input->length),
stream),
stream,
mr);
return string_column_from_arrow_host(schema, input, get_mask_buffer(input), stream, mr);
}

template <>
@@ -378,6 +286,8 @@ std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<cudf::list_vie
mr);
}

} // namespace

std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
ArrowArray const* input,
data_type type,
@@ -400,8 +310,6 @@ std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
input->length);
}

} // namespace

std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
ArrowDeviceArray const* input,
rmm::cuda_stream_view stream,
@@ -457,6 +365,16 @@ std::unique_ptr<column> from_arrow_host_column(ArrowSchema const* schema,
return get_column_copy(&view, &input->array, type, false, stream, mr);
}

std::unique_ptr<column> get_column_from_host_copy(ArrowSchemaView* schema,
ArrowArray const* input,
data_type type,
bool skip_mask,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
return get_column_copy(schema, input, type, skip_mask, stream, mr);
}

} // namespace detail

std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
68 changes: 68 additions & 0 deletions cpp/src/interop/from_arrow_host.hpp
@@ -0,0 +1,68 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/resource_ref.hpp>

#include <nanoarrow/nanoarrow.hpp>

namespace cudf {
namespace detail {

/**
* @brief Utility to build a strings column from a host Arrow array of STRING, LARGE_STRING,
* or STRING_VIEW type
*
* @param schema Arrow schema view that includes the column type
* @param input Arrow array containing the column data, nulls, and offset
* @param mask Null mask to apply to the output column
* @param stream CUDA stream used for device memory operations
* @param mr Device memory resource to use for all device memory allocations
* @return New strings column
*/
std::unique_ptr<column> string_column_from_arrow_host(ArrowSchemaView* schema,
ArrowArray const* input,
std::unique_ptr<rmm::device_buffer>&& mask,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

/**
* @brief Utility to convert an ArrowArray to a cudf column
*
* This function is a convenience wrapper around the dispatch functor, with some
* extra handling to avoid reproducing it for all of the nested types. It also
* centralizes where the recursive calls happen, so only this one function needs
* to be declared here rather than separate functions handling the overloads for
* the nested types (list, struct, etc.)
*
* @param schema Arrow schema view that includes the column type
* @param input Arrow array containing the column data, nulls, and offset
* @param type The cudf column type to map input to
* @param skip_mask True if the mask is handled by the caller
* @param stream CUDA stream used for device memory operations
* @param mr Device memory resource to use for all device memory allocations
* @return New cudf column
*/
std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
ArrowArray const* input,
data_type type,
bool skip_mask,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

} // namespace detail
} // namespace cudf