From cf55f951a06e7c9b181021bd2dc2e6d2b3f954f0 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Tue, 11 Mar 2025 14:58:08 +0000
Subject: [PATCH 01/37] Add minimal code that demonstrates cudf integration

---
 CMake/resolve_dependency_modules/cudf.cmake   |  48 ++
 CMakeLists.txt                                |  18 +-
 Makefile                                      |   6 +-
 velox/CMakeLists.txt                          |   3 +
 velox/experimental/cudf/CMakeLists.txt        |  21 +
 velox/experimental/cudf/exec/CMakeLists.txt   |  34 ++
 .../experimental/cudf/exec/CudfConversion.cpp | 199 +++++++
 velox/experimental/cudf/exec/CudfConversion.h |  97 ++++
 velox/experimental/cudf/exec/CudfOrderBy.cpp  | 128 +++++
 velox/experimental/cudf/exec/CudfOrderBy.h    |  69 +++
 velox/experimental/cudf/exec/NvtxHelper.h     |  66 +++
 velox/experimental/cudf/exec/ToCudf.cpp       | 360 +++++++++++++
 velox/experimental/cudf/exec/ToCudf.h         |  52 ++
 velox/experimental/cudf/exec/Utilities.cpp    | 148 +++++
 velox/experimental/cudf/exec/Utilities.h      |  59 ++
 .../cudf/exec/VeloxCudfInterop.cpp            | 508 ++++++++++++++++++
 .../experimental/cudf/exec/VeloxCudfInterop.h |  64 +++
 velox/experimental/cudf/tests/CMakeLists.txt  |  33 ++
 velox/experimental/cudf/tests/OrderByTest.cpp | 417 ++++++++++++++
 velox/experimental/cudf/vector/CMakeLists.txt |  26 +
 velox/experimental/cudf/vector/CudfVector.cpp |  21 +
 velox/experimental/cudf/vector/CudfVector.h   |  73 +++
 22 files changed, 2443 insertions(+), 7 deletions(-)
 create mode 100644 CMake/resolve_dependency_modules/cudf.cmake
 create mode 100644 velox/experimental/cudf/CMakeLists.txt
 create mode 100644 velox/experimental/cudf/exec/CMakeLists.txt
 create mode 100644 velox/experimental/cudf/exec/CudfConversion.cpp
 create mode 100644 velox/experimental/cudf/exec/CudfConversion.h
 create mode 100644 velox/experimental/cudf/exec/CudfOrderBy.cpp
 create mode 100644 velox/experimental/cudf/exec/CudfOrderBy.h
 create mode 100644 velox/experimental/cudf/exec/NvtxHelper.h
 create mode 100644 velox/experimental/cudf/exec/ToCudf.cpp
 create mode 100644 velox/experimental/cudf/exec/ToCudf.h
 create mode 100644 velox/experimental/cudf/exec/Utilities.cpp
 create mode 100644 velox/experimental/cudf/exec/Utilities.h
 create mode 100644 velox/experimental/cudf/exec/VeloxCudfInterop.cpp
 create mode 100644 velox/experimental/cudf/exec/VeloxCudfInterop.h
 create mode 100644 velox/experimental/cudf/tests/CMakeLists.txt
 create mode 100644 velox/experimental/cudf/tests/OrderByTest.cpp
 create mode 100644 velox/experimental/cudf/vector/CMakeLists.txt
 create mode 100644 velox/experimental/cudf/vector/CudfVector.cpp
 create mode 100644 velox/experimental/cudf/vector/CudfVector.h

diff --git a/CMake/resolve_dependency_modules/cudf.cmake b/CMake/resolve_dependency_modules/cudf.cmake
new file mode 100644
index 00000000000..a6abdb61254
--- /dev/null
+++ b/CMake/resolve_dependency_modules/cudf.cmake
@@ -0,0 +1,48 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include_guard(GLOBAL)
+
+set(VELOX_cudf_VERSION 25.04)
+set(VELOX_cudf_BUILD_SHA256_CHECKSUM
+    e5a1900dfaf23dab2c5808afa17a2d04fa867d2892ecec1cb37908f3b73715c2)
+set(VELOX_cudf_SOURCE_URL
+    "https://github.com/rapidsai/cudf/archive/4c1c99011da2c23856244e05adda78ba66697105.tar.gz"
+)
+velox_resolve_dependency_url(cudf)
+
+# Use block so we don't leak variables
+block(SCOPE_FOR VARIABLES)
+# Setup libcudf build to not have testing components
+set(BUILD_TESTS OFF)
+set(CUDF_BUILD_TESTUTIL OFF)
+set(BUILD_SHARED_LIBS ON)
+
+# cudf sets all warnings as errors, and therefore fails to compile with velox
+# expanded set of warnings. We selectively disable problematic warnings just for
+# cudf
+string(
+  APPEND CMAKE_CXX_FLAGS
+  " -Wno-non-virtual-dtor -Wno-missing-field-initializers -Wno-deprecated-copy")
+
+FetchContent_Declare(
+  cudf
+  URL ${VELOX_cudf_SOURCE_URL}
+  URL_HASH ${VELOX_cudf_BUILD_SHA256_CHECKSUM}
+  SOURCE_SUBDIR cpp
+  UPDATE_DISCONNECTED 1)
+
+FetchContent_MakeAvailable(cudf)
+unset(BUILD_SHARED_LIBS)
+endblock()
diff --git a/CMakeLists.txt b/CMakeLists.txt
index aae0cfff990..173a40fe1d3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -235,6 +235,7 @@ if(VELOX_ENABLE_CCACHE
     message(STATUS "Using ccache: ${CCACHE_FOUND}")
     set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE_FOUND})
     set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_FOUND})
+    set(CMAKE_CUDA_COMPILER_LAUNCHER ${CCACHE_FOUND})
     # keep comments as they might matter to the compiler
     set(ENV{CCACHE_COMMENTS} "1")
   endif()
@@ -368,6 +369,8 @@ if(ENABLE_ALL_WARNINGS)
        -Wno-unused-parameter \
        -Wno-sign-compare \
        -Wno-ignored-qualifiers \
+       -Wno-missing-field-initializers \
+       -Wno-deprecated-copy \
        ${KNOWN_COMPILER_SPECIFIC_WARNINGS}")
 
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra ${KNOWN_WARNINGS}")
@@ -375,7 +378,12 @@ endif()
 
 message("FINAL CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}")
 
-if(${VELOX_ENABLE_GPU})
+if(NOT TARGET fmt::fmt)
+  velox_set_source(fmt)
+  velox_resolve_dependency(fmt 9.0.0)
+endif()
+
+if(VELOX_ENABLE_GPU)
   enable_language(CUDA)
   # Determine CUDA_ARCHITECTURES automatically.
   cmake_policy(SET CMP0104 NEW)
@@ -387,6 +395,11 @@ if(${VELOX_ENABLE_GPU})
     add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:-G>")
   endif()
   find_package(CUDAToolkit REQUIRED)
+  if(VELOX_ENABLE_CUDF)
+    set(VELOX_ENABLE_ARROW ON)
+    velox_set_source(cudf)
+    velox_resolve_dependency(cudf)
+  endif()
 endif()
 
 # Set after the test of the CUDA compiler. Otherwise, the test fails with
@@ -458,9 +471,6 @@ else()
 endif()
 velox_resolve_dependency(glog)
 
-velox_set_source(fmt)
-velox_resolve_dependency(fmt 9.0.0)
-
 if(${VELOX_BUILD_MINIMAL_WITH_DWIO} OR ${VELOX_ENABLE_HIVE_CONNECTOR})
   # DWIO needs all sorts of stream compression libraries.
   #
diff --git a/Makefile b/Makefile
index bb6f9af3d19..7306089253b 100644
--- a/Makefile
+++ b/Makefile
@@ -99,7 +99,7 @@ cmake:					#: Use CMake to create a Makefile build system
 		${EXTRA_CMAKE_FLAGS}
 
 cmake-gpu:
-	$(MAKE) EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON" cmake
+	$(MAKE) EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON -DVELOX_ENABLE_CUDF=ON" cmake
 
 build:					#: Build the software based in BUILD_DIR and BUILD_TYPE variables
 	cmake --build $(BUILD_BASE_DIR)/$(BUILD_DIR) -j $(NUM_THREADS)
@@ -123,11 +123,11 @@ minimal:				 #: Minimal build
 	$(MAKE) build BUILD_DIR=release
 
 gpu:						 #: Build with GPU support
-	$(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON"
+	$(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON -DVELOX_ENABLE_CUDF=ON"
 	$(MAKE) build BUILD_DIR=release
 
 gpu_debug:			 #: Build with debugging symbols and GPU support
-	$(MAKE) cmake BUILD_DIR=debug BUILD_TYPE=debug EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON"
+	$(MAKE) cmake BUILD_DIR=debug BUILD_TYPE=debug EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON -DVELOX_ENABLE_CUDF=ON"
 	$(MAKE) build BUILD_DIR=debug
 
 dwio:						#: Minimal build with dwio enabled.
diff --git a/velox/CMakeLists.txt b/velox/CMakeLists.txt
index b51e2a77073..09a4e7c58cf 100644
--- a/velox/CMakeLists.txt
+++ b/velox/CMakeLists.txt
@@ -69,6 +69,9 @@ if(${VELOX_ENABLE_DUCKDB})
 endif()
 
 if(${VELOX_ENABLE_GPU})
+  if(${VELOX_ENABLE_CUDF})
+    add_subdirectory(experimental/cudf)
+  endif()
   add_subdirectory(experimental/gpu)
   add_subdirectory(experimental/wave)
   add_subdirectory(external/jitify)
diff --git a/velox/experimental/cudf/CMakeLists.txt b/velox/experimental/cudf/CMakeLists.txt
new file mode 100644
index 00000000000..96fcdb0d557
--- /dev/null
+++ b/velox/experimental/cudf/CMakeLists.txt
@@ -0,0 +1,21 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+add_subdirectory(exec)
+add_subdirectory(connectors)
+add_subdirectory(vector)
+
+if(VELOX_BUILD_TESTING)
+  add_subdirectory(tests)
+endif()
diff --git a/velox/experimental/cudf/exec/CMakeLists.txt b/velox/experimental/cudf/exec/CMakeLists.txt
new file mode 100644
index 00000000000..c47d3a5065a
--- /dev/null
+++ b/velox/experimental/cudf/exec/CMakeLists.txt
@@ -0,0 +1,34 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+add_library(
+  velox_cudf_exec
+  CudfConversion.cpp
+  CudfOrderBy.cpp
+  ToCudf.cpp
+  Utilities.cpp
+  VeloxCudfInterop.cpp)
+
+set_target_properties(
+  velox_cudf_exec
+  PROPERTIES CUDA_ARCHITECTURES native)
+
+target_link_libraries(
+  velox_cudf_exec
+  cudf::cudf
+  arrow
+  velox_arrow_bridge
+  velox_exception
+  velox_common_base
+  velox_exec)
diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp
new file mode 100644
index 00000000000..68b3ac283de
--- /dev/null
+++ b/velox/experimental/cudf/exec/CudfConversion.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/exec/Driver.h"
+#include "velox/exec/Operator.h"
+#include "velox/vector/ComplexVector.h"
+
+#include <cudf/concatenate.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include "velox/experimental/cudf/exec/CudfConversion.h"
+#include "velox/experimental/cudf/exec/NvtxHelper.h"
+#include "velox/experimental/cudf/exec/Utilities.h"
+#include "velox/experimental/cudf/exec/VeloxCudfInterop.h"
+#include "velox/experimental/cudf/vector/CudfVector.h"
+
+namespace facebook::velox::cudf_velox {
+
+namespace {
+// Concatenate multiple RowVectors into a single RowVector.
+// Copied from AggregationFuzzer.cpp.
+RowVectorPtr mergeRowVectors(
+    const std::vector<RowVectorPtr>& results,
+    velox::memory::MemoryPool* pool) {
+  VELOX_NVTX_FUNC_RANGE();
+  auto totalCount = 0;
+  for (const auto& result : results) {
+    totalCount += result->size();
+  }
+  auto copy =
+      BaseVector::create<RowVector>(results[0]->type(), totalCount, pool);
+  auto copyCount = 0;
+  for (const auto& result : results) {
+    copy->copy(result.get(), copyCount, 0, result->size());
+    copyCount += result->size();
+  }
+  return copy;
+}
+
+cudf::size_type preferred_gpu_batch_size_rows() {
+  constexpr cudf::size_type default_gpu_batch_size_rows = 100000;
+  const char* env_cudf_gpu_batch_size_rows =
+      std::getenv("VELOX_CUDF_GPU_BATCH_SIZE_ROWS");
+  return env_cudf_gpu_batch_size_rows != nullptr
+      ? std::stoi(env_cudf_gpu_batch_size_rows)
+      : default_gpu_batch_size_rows;
+}
+} // namespace
+
+CudfFromVelox::CudfFromVelox(
+    int32_t operatorId,
+    RowTypePtr outputType,
+    exec::DriverCtx* driverCtx,
+    std::string planNodeId)
+    : exec::Operator(
+          driverCtx,
+          outputType,
+          operatorId,
+          planNodeId,
+          "CudfFromVelox"),
+      NvtxHelper(nvtx3::rgb{255, 140, 0}, operatorId) {} // Orange
+
+void CudfFromVelox::addInput(RowVectorPtr input) {
+  VELOX_NVTX_OPERATOR_FUNC_RANGE();
+  if (input != nullptr) {
+    if (input->size() > 0) {
+      // Materialize lazy vectors
+      for (auto& child : input->children()) {
+        child->loadedVector();
+      }
+      input->loadedVector();
+
+      // Accumulate inputs
+      inputs_.push_back(input);
+      current_output_size_ += input->size();
+    }
+  }
+}
+
+RowVectorPtr CudfFromVelox::getOutput() {
+  VELOX_NVTX_OPERATOR_FUNC_RANGE();
+  auto const target_output_size = preferred_gpu_batch_size_rows();
+  auto const exit_early = finished_ or
+      (current_output_size_ < target_output_size and not noMoreInput_) or
+      inputs_.empty();
+  finished_ = noMoreInput_;
+  if (exit_early) {
+    return nullptr;
+  }
+
+  // Combine all input RowVectors into a single RowVector and clear inputs
+  auto input = mergeRowVectors(inputs_, inputs_[0]->pool());
+  inputs_.clear();
+  current_output_size_ = 0;
+
+  // Early return if no input
+  if (input->size() == 0) {
+    return nullptr;
+  }
+
+  // Get a stream from the global stream pool
+  auto stream = cudfGlobalStreamPool().get_stream();
+
+  // Convert RowVector to cudf table
+  auto tbl = with_arrow::to_cudf_table(input, input->pool(), stream);
+
+  stream.synchronize();
+
+  VELOX_CHECK_NOT_NULL(tbl);
+
+  if (cudfDebugEnabled()) {
+    std::cout << "CudfFromVelox table number of columns: " << tbl->num_columns()
+              << std::endl;
+    std::cout << "CudfFromVelox table number of rows: " << tbl->num_rows()
+              << std::endl;
+  }
+
+  // Return a CudfVector that owns the cudf table
+  auto const size = tbl->num_rows();
+  return std::make_shared<CudfVector>(
+      input->pool(), outputType_, size, std::move(tbl), stream);
+}
+
+void CudfFromVelox::close() {
+  cudf::get_default_stream().synchronize();
+  exec::Operator::close();
+}
+
+CudfToVelox::CudfToVelox(
+    int32_t operatorId,
+    RowTypePtr outputType,
+    exec::DriverCtx* driverCtx,
+    std::string planNodeId)
+    : exec::Operator(
+          driverCtx,
+          outputType,
+          operatorId,
+          planNodeId,
+          "CudfToVelox"),
+      NvtxHelper(nvtx3::rgb{148, 0, 211}, operatorId) {} // Purple
+
+void CudfToVelox::addInput(RowVectorPtr input) {
+  // Accumulate inputs
+  if (input->size() > 0) {
+    auto cudf_input = std::dynamic_pointer_cast<CudfVector>(input);
+    VELOX_CHECK_NOT_NULL(cudf_input);
+    inputs_.push_back(std::move(cudf_input));
+  }
+}
+
+RowVectorPtr CudfToVelox::getOutput() {
+  VELOX_NVTX_OPERATOR_FUNC_RANGE();
+  if (finished_ || inputs_.empty()) {
+    finished_ = noMoreInput_ && inputs_.empty();
+    return nullptr;
+  }
+
+  auto stream = inputs_.front()->stream();
+  std::unique_ptr<cudf::table> tbl = inputs_.front()->release();
+  inputs_.pop_front();
+
+  VELOX_CHECK_NOT_NULL(tbl);
+  if (cudfDebugEnabled()) {
+    std::cout << "CudfToVelox table number of columns: " << tbl->num_columns()
+              << std::endl;
+    std::cout << "CudfToVelox table number of rows: " << tbl->num_rows()
+              << std::endl;
+  }
+  if (tbl->num_rows() == 0) {
+    return nullptr;
+  }
+  RowVectorPtr output =
+      with_arrow::to_velox_column(tbl->view(), pool(), "", stream);
+  stream.synchronize();
+  finished_ = noMoreInput_ && inputs_.empty();
+  output->setType(outputType_);
+  return output;
+}
+
+void CudfToVelox::close() {
+  exec::Operator::close();
+  // TODO: Release stored inputs if needed
+  // TODO: Release cudf memory resources
+}
+
+} // namespace facebook::velox::cudf_velox
diff --git a/velox/experimental/cudf/exec/CudfConversion.h b/velox/experimental/cudf/exec/CudfConversion.h
new file mode 100644
index 00000000000..c75f8464b36
--- /dev/null
+++ b/velox/experimental/cudf/exec/CudfConversion.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "velox/exec/Driver.h"
+#include "velox/exec/Operator.h"
+#include "velox/vector/ComplexVector.h"
+
+#include <cudf/table/table.hpp>
+
+#include "velox/experimental/cudf/exec/NvtxHelper.h"
+#include "velox/experimental/cudf/vector/CudfVector.h"
+
+#include <deque>
+#include <memory>
+#include <vector>
+
+namespace facebook::velox::cudf_velox {
+
+class CudfFromVelox : public exec::Operator, public NvtxHelper {
+ public:
+  CudfFromVelox(
+      int32_t operatorId,
+      RowTypePtr outputType,
+      exec::DriverCtx* driverCtx,
+      std::string planNodeId);
+
+  bool needsInput() const override {
+    return !finished_;
+  }
+
+  void addInput(RowVectorPtr input) override;
+
+  RowVectorPtr getOutput() override;
+
+  exec::BlockingReason isBlocked(ContinueFuture* /*future*/) override {
+    return exec::BlockingReason::kNotBlocked;
+  }
+
+  bool isFinished() override {
+    return finished_;
+  }
+
+  void close() override;
+
+ private:
+  std::vector<RowVectorPtr> inputs_;
+  std::size_t current_output_size_ = 0;
+  bool finished_ = false;
+};
+
+class CudfToVelox : public exec::Operator, public NvtxHelper {
+ public:
+  CudfToVelox(
+      int32_t operatorId,
+      RowTypePtr outputType,
+      exec::DriverCtx* driverCtx,
+      std::string planNodeId);
+
+  bool needsInput() const override {
+    return !finished_;
+  }
+
+  void addInput(RowVectorPtr input) override;
+
+  RowVectorPtr getOutput() override;
+
+  exec::BlockingReason isBlocked(ContinueFuture* /*future*/) override {
+    return exec::BlockingReason::kNotBlocked;
+  }
+
+  bool isFinished() override {
+    return finished_;
+  }
+
+  void close() override;
+
+ private:
+  std::deque<CudfVectorPtr> inputs_;
+  bool finished_ = false;
+};
+
+} // namespace facebook::velox::cudf_velox
diff --git a/velox/experimental/cudf/exec/CudfOrderBy.cpp b/velox/experimental/cudf/exec/CudfOrderBy.cpp
new file mode 100644
index 00000000000..ac32cc8e23f
--- /dev/null
+++ b/velox/experimental/cudf/exec/CudfOrderBy.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/exec/Driver.h"
+#include "velox/exec/Operator.h"
+#include "velox/vector/ComplexVector.h"
+
+#include <cudf/concatenate.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include "velox/experimental/cudf/exec/CudfOrderBy.h"
+#include "velox/experimental/cudf/exec/NvtxHelper.h"
+#include "velox/experimental/cudf/exec/Utilities.h"
+#include "velox/experimental/cudf/exec/VeloxCudfInterop.h"
+
+namespace facebook::velox::cudf_velox {
+
+CudfOrderBy::CudfOrderBy(
+    int32_t operatorId,
+    exec::DriverCtx* driverCtx,
+    const std::shared_ptr<const core::OrderByNode>& orderByNode)
+    : exec::Operator(
+          driverCtx,
+          orderByNode->outputType(),
+          operatorId,
+          orderByNode->id(),
+          "CudfOrderBy"),
+      NvtxHelper(nvtx3::rgb{64, 224, 208}, operatorId), // Turquoise
+      orderByNode_(orderByNode) {
+  maxOutputRows_ = outputBatchRows(std::nullopt);
+  sort_keys_.reserve(orderByNode->sortingKeys().size());
+  column_order_.reserve(orderByNode->sortingKeys().size());
+  null_order_.reserve(orderByNode->sortingKeys().size());
+  for (int i = 0; i < orderByNode->sortingKeys().size(); ++i) {
+    const auto channel =
+        exec::exprToChannel(orderByNode->sortingKeys()[i].get(), outputType_);
+    VELOX_CHECK(
+        channel != kConstantChannel,
+        "OrderBy doesn't allow constant sorting keys");
+    sort_keys_.push_back(channel);
+    auto const& sorting_order = orderByNode->sortingOrders()[i];
+    column_order_.push_back(
+        sorting_order.isAscending() ? cudf::order::ASCENDING
+                                    : cudf::order::DESCENDING);
+    null_order_.push_back(
+        (sorting_order.isNullsFirst() ^ !sorting_order.isAscending())
+            ? cudf::null_order::BEFORE
+            : cudf::null_order::AFTER);
+  }
+  if (cudfDebugEnabled()) {
+    std::cout << "Number of Sort keys: " << sort_keys_.size() << std::endl;
+  }
+}
+
+void CudfOrderBy::addInput(RowVectorPtr input) {
+  // Accumulate inputs
+  if (input->size() > 0) {
+    auto cudf_input = std::dynamic_pointer_cast<CudfVector>(input);
+    VELOX_CHECK_NOT_NULL(cudf_input);
+    inputs_.push_back(std::move(cudf_input));
+  }
+}
+
+void CudfOrderBy::noMoreInput() {
+  exec::Operator::noMoreInput();
+  // TODO: Get total row count, batch output
+  // maxOutputRows_ = outputBatchRows(total_row_count);
+
+  VELOX_NVTX_OPERATOR_FUNC_RANGE();
+
+  if (inputs_.empty()) {
+    return;
+  }
+
+  auto stream = cudfGlobalStreamPool().get_stream();
+  auto tbl = getConcatenatedTable(inputs_, stream);
+
+  // Release input data after synchronizing
+  stream.synchronize();
+  inputs_.clear();
+
+  VELOX_CHECK_NOT_NULL(tbl);
+  if (cudfDebugEnabled()) {
+    std::cout << "Sort input table number of columns: " << tbl->num_columns()
+              << std::endl;
+    std::cout << "Sort input table number of rows: " << tbl->num_rows()
+              << std::endl;
+  }
+
+  auto keys = tbl->view().select(sort_keys_);
+  auto values = tbl->view();
+  auto result =
+      cudf::sort_by_key(values, keys, column_order_, null_order_, stream);
+  auto const size = result->num_rows();
+  outputTable_ = std::make_shared<CudfVector>(
+      pool(), outputType_, size, std::move(result), stream);
+}
+
+RowVectorPtr CudfOrderBy::getOutput() {
+  if (finished_ || !noMoreInput_) {
+    return nullptr;
+  }
+  finished_ = noMoreInput_;
+  return outputTable_;
+}
+
+void CudfOrderBy::close() {
+  exec::Operator::close();
+  // Release stored inputs
+  // Release cudf memory resources
+  inputs_.clear();
+  outputTable_.reset();
+}
+} // namespace facebook::velox::cudf_velox
diff --git a/velox/experimental/cudf/exec/CudfOrderBy.h b/velox/experimental/cudf/exec/CudfOrderBy.h
new file mode 100644
index 00000000000..28c89cec8e4
--- /dev/null
+++ b/velox/experimental/cudf/exec/CudfOrderBy.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "velox/core/Expressions.h"
+#include "velox/core/PlanNode.h"
+#include "velox/exec/Driver.h"
+#include "velox/exec/Operator.h"
+#include "velox/experimental/cudf/exec/NvtxHelper.h"
+#include "velox/experimental/cudf/vector/CudfVector.h"
+#include "velox/vector/ComplexVector.h"
+
+#include <cudf/table/table.hpp>
+
+namespace facebook::velox::cudf_velox {
+
+class CudfOrderBy : public exec::Operator, public NvtxHelper {
+ public:
+  CudfOrderBy(
+      int32_t operatorId,
+      exec::DriverCtx* driverCtx,
+      const std::shared_ptr<const core::OrderByNode>& orderByNode);
+
+  bool needsInput() const override {
+    return !finished_;
+  }
+
+  void addInput(RowVectorPtr input) override;
+
+  void noMoreInput() override;
+
+  RowVectorPtr getOutput() override;
+
+  exec::BlockingReason isBlocked(ContinueFuture* /*future*/) override {
+    return exec::BlockingReason::kNotBlocked;
+  }
+
+  bool isFinished() override {
+    return finished_;
+  }
+
+  void close() override;
+
+ private:
+  CudfVectorPtr outputTable_;
+  std::shared_ptr<const core::OrderByNode> orderByNode_;
+  std::vector<CudfVectorPtr> inputs_;
+  std::vector<cudf::size_type> sort_keys_;
+  std::vector<cudf::order> column_order_;
+  std::vector<cudf::null_order> null_order_;
+  bool finished_{false};
+  uint32_t maxOutputRows_;
+};
+
+} // namespace facebook::velox::cudf_velox
diff --git a/velox/experimental/cudf/exec/NvtxHelper.h b/velox/experimental/cudf/exec/NvtxHelper.h
new file mode 100644
index 00000000000..4e4efca7a08
--- /dev/null
+++ b/velox/experimental/cudf/exec/NvtxHelper.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <nvtx3/nvtx3.hpp>
+#include <optional>
+
+namespace facebook::velox::cudf_velox {
+
+class NvtxHelper {
+ public:
+  NvtxHelper();
+  NvtxHelper(nvtx3::color color, std::optional<int64_t> payload = std::nullopt)
+      : color_(color), payload_(payload) {}
+
+  nvtx3::color color_{nvtx3::rgb{125, 125, 125}}; // Gray
+  std::optional<int64_t> payload_{};
+};
+
+/**
+ * @brief Tag type for Velox's NVTX domain.
+ */
+struct velox_domain {
+  static constexpr char const* name{"velox"};
+};
+
+using nvtx_registered_string_t = nvtx3::registered_string_in<velox_domain>;
+
+#define VELOX_NVTX_OPERATOR_FUNC_RANGE()                                         \
+  static_assert(                                                                 \
+      std::is_base_of<NvtxHelper, std::remove_pointer<decltype(this)>::type>::   \
+          value,                                                                 \
+      "VELOX_NVTX_OPERATOR_FUNC_RANGE can only be used"                          \
+      " in Operators derived from NvtxHelper");                                  \
+  static nvtx_registered_string_t const nvtx3_func_name__{                       \
+      std::string(__func__) + " " + std::string(__PRETTY_FUNCTION__)};           \
+  static ::nvtx3::event_attributes const nvtx3_func_attr__{                    \
+      this->payload_.has_value() ?                                             \
+          ::nvtx3::event_attributes{nvtx3_func_name__, this->color_,           \
+                                   nvtx3::payload{this->payload_.value()}} :   \
+          ::nvtx3::event_attributes{nvtx3_func_name__, this->color_}}; \
+  ::nvtx3::scoped_range_in<velox_domain> const nvtx3_range__{nvtx3_func_attr__};
+
+#define VELOX_NVTX_PRETTY_FUNC_RANGE()                                         \
+  static nvtx_registered_string_t const nvtx3_func_name__{                     \
+      std::string(__func__) + " " + std::string(__PRETTY_FUNCTION__)};         \
+  static ::nvtx3::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \
+  ::nvtx3::scoped_range_in<velox_domain> const nvtx3_range__{nvtx3_func_attr__};
+
+#define VELOX_NVTX_FUNC_RANGE() NVTX3_FUNC_RANGE_IN(velox_domain)
+
+} // namespace facebook::velox::cudf_velox
diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp
new file mode 100644
index 00000000000..03ec29b2c95
--- /dev/null
+++ b/velox/experimental/cudf/exec/ToCudf.cpp
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/experimental/cudf/exec/ToCudf.h"
+#include "velox/exec/Driver.h"
+#include "velox/exec/FilterProject.h"
+#include "velox/exec/HashAggregation.h"
+#include "velox/exec/HashBuild.h"
+#include "velox/exec/HashProbe.h"
+#include "velox/exec/Operator.h"
+#include "velox/exec/OrderBy.h"
+#include "velox/experimental/cudf/exec/CudfConversion.h"
+#include "velox/experimental/cudf/exec/CudfFilterProject.h"
+#include "velox/experimental/cudf/exec/CudfHashAggregation.h"
+#include "velox/experimental/cudf/exec/CudfHashJoin.h"
+#include "velox/experimental/cudf/exec/CudfLocalPartition.h"
+#include "velox/experimental/cudf/exec/CudfOrderBy.h"
+#include "velox/experimental/cudf/exec/ExpressionEvaluator.h"
+#include "velox/experimental/cudf/exec/Utilities.h"
+
+#include <cudf/detail/nvtx/ranges.hpp>
+
+#include <cuda.h>
+
+#include <iostream>
+
+namespace facebook::velox::cudf_velox {
+
+template <class... Deriveds, class Base>
+bool is_any_of(const Base* p) {
+  return ((dynamic_cast<const Deriveds*>(p) != nullptr) || ...);
+}
+
+static bool _cudfIsRegistered = false;
+
+bool CompileState::compile() {
+  if (cudfDebugEnabled()) {
+    std::cout << "Calling cudfDriverAdapter" << std::endl;
+  }
+
+  auto operators = driver_.operators();
+  auto& nodes = planNodes_;
+
+  if (cudfDebugEnabled()) {
+    std::cout << "Number of operators: " << operators.size() << std::endl;
+    for (auto& op : operators) {
+      std::cout << "  Operator: ID " << op->operatorId() << ": "
+                << op->toString() << std::endl;
+    }
+    std::cout << "Number of plan nodes: " << nodes.size() << std::endl;
+    for (auto& node : nodes) {
+      std::cout << "  Plan node: ID " << node->id() << ": " << node->toString();
+    }
+  }
+
+  // Make sure operator states are initialized.  We will need to inspect some of
+  // them during the transformation.
+  driver_.initializeOperators();
+
+  bool replacements_made = false;
+  auto ctx = driver_.driverCtx();
+
+  // Get plan node by id lookup.
+  auto get_plan_node = [&](const core::PlanNodeId& id) {
+    auto it =
+        std::find_if(nodes.cbegin(), nodes.cend(), [&id](const auto& node) {
+          return node->id() == id;
+        });
+    VELOX_CHECK(it != nodes.end());
+    return *it;
+  };
+
+  auto is_filter_project_supported = [](const exec::Operator* op) {
+    if (auto filter_project_op = dynamic_cast<const exec::FilterProject*>(op)) {
+      auto info = filter_project_op->exprsAndProjection();
+      return !info.hasFilter &&
+          ExpressionEvaluator::can_be_evaluated(info.exprs->exprs());
+    }
+    return false;
+  };
+
+  auto is_join_supported = [get_plan_node](const exec::Operator* op) {
+    if (!is_any_of<exec::HashBuild, exec::HashProbe>(op)) {
+      return false;
+    }
+    auto plan_node = std::dynamic_pointer_cast<const core::HashJoinNode>(
+        get_plan_node(op->planNodeId()));
+    if (!plan_node) {
+      return false;
+    }
+    if (!plan_node->isInnerJoin()) {
+      return false;
+    }
+    if (plan_node->filter() != nullptr) {
+      return false;
+    }
+    return true;
+  };
+
+  auto is_supported_gpu_operator =
+      [is_filter_project_supported,
+       is_join_supported](const exec::Operator* op) {
+        return is_any_of<
+                   exec::OrderBy,
+                   exec::HashAggregation,
+                   exec::LocalPartition,
+                   exec::LocalExchange>(op) ||
+            is_filter_project_supported(op) || is_join_supported(op);
+      };
+
+  std::vector<bool> is_supported_gpu_operators(operators.size());
+  std::transform(
+      operators.begin(),
+      operators.end(),
+      is_supported_gpu_operators.begin(),
+      is_supported_gpu_operator);
+  auto accepts_gpu_input = [is_filter_project_supported,
+                            is_join_supported](const exec::Operator* op) {
+    return is_any_of<
+               exec::OrderBy,
+               exec::HashAggregation,
+               exec::LocalPartition>(op) ||
+        is_filter_project_supported(op) || is_join_supported(op);
+  };
+  auto produces_gpu_output = [is_filter_project_supported,
+                              is_join_supported](const exec::Operator* op) {
+    return is_any_of<exec::OrderBy, exec::HashAggregation, exec::LocalExchange>(
+               op) ||
+        is_filter_project_supported(op) ||
+        (is_any_of<exec::HashProbe>(op) && is_join_supported(op));
+  };
+
+  int32_t operatorsOffset = 0;
+  for (int32_t operatorIndex = 0; operatorIndex < operators.size();
+       ++operatorIndex) {
+    std::vector<std::unique_ptr<exec::Operator>> replace_op;
+
+    exec::Operator* oper = operators[operatorIndex];
+    auto replacingOperatorIndex = operatorIndex + operatorsOffset;
+    VELOX_CHECK(oper);
+
+    bool const previous_operator_is_not_gpu =
+        (operatorIndex > 0 and !is_supported_gpu_operators[operatorIndex - 1]);
+    bool const next_operator_is_not_gpu =
+        (operatorIndex < operators.size() - 1 and
+         !is_supported_gpu_operators[operatorIndex + 1]);
+
+    auto id = oper->operatorId();
+    if (previous_operator_is_not_gpu and accepts_gpu_input(oper)) {
+      auto plan_node = get_plan_node(oper->planNodeId());
+      replace_op.push_back(std::make_unique<CudfFromVelox>(
+          id, plan_node->outputType(), ctx, plan_node->id() + "-from-velox"));
+      replace_op.back()->initialize();
+    }
+
+    // This is used to denote if the current operator is kept or replaced.
+    auto keep_operator = 0;
+    if (is_join_supported(oper)) {
+      if (auto joinBuildOp = dynamic_cast<exec::HashBuild*>(oper)) {
+        auto plan_node = std::dynamic_pointer_cast<const core::HashJoinNode>(
+            get_plan_node(joinBuildOp->planNodeId()));
+        VELOX_CHECK(plan_node != nullptr);
+        // From-Velox (optional)
+        replace_op.push_back(
+            std::make_unique<CudfHashJoinBuild>(id, ctx, plan_node));
+        replace_op.back()->initialize();
+      } else if (auto joinProbeOp = dynamic_cast<exec::HashProbe*>(oper)) {
+        auto plan_node = std::dynamic_pointer_cast<const core::HashJoinNode>(
+            get_plan_node(joinProbeOp->planNodeId()));
+        VELOX_CHECK(plan_node != nullptr);
+        // From-Velox (optional)
+        replace_op.push_back(
+            std::make_unique<CudfHashJoinProbe>(id, ctx, plan_node));
+        replace_op.back()->initialize();
+        // To-Velox (optional)
+      }
+    } else if (auto orderByOp = dynamic_cast<exec::OrderBy*>(oper)) {
+      auto id = orderByOp->operatorId();
+      auto plan_node = std::dynamic_pointer_cast<const core::OrderByNode>(
+          get_plan_node(orderByOp->planNodeId()));
+      VELOX_CHECK(plan_node != nullptr);
+      // From-velox (optional)
+      replace_op.push_back(std::make_unique<CudfOrderBy>(id, ctx, plan_node));
+      replace_op.back()->initialize();
+      // To-velox (optional)
+    } else if (auto hashAggOp = dynamic_cast<exec::HashAggregation*>(oper)) {
+      auto plan_node = std::dynamic_pointer_cast<const core::AggregationNode>(
+          get_plan_node(hashAggOp->planNodeId()));
+      VELOX_CHECK(plan_node != nullptr);
+      replace_op.push_back(
+          std::make_unique<CudfHashAggregation>(id, ctx, plan_node));
+      replace_op.back()->initialize();
+    } else if (is_filter_project_supported(oper)) {
+      auto filterProjectOp = dynamic_cast<exec::FilterProject*>(oper);
+      auto info = filterProjectOp->exprsAndProjection();
+      auto& id_projections = filterProjectOp->identityProjections();
+      auto plan_node = std::dynamic_pointer_cast<const core::ProjectNode>(
+          get_plan_node(filterProjectOp->planNodeId()));
+      // If filter doesn't exist then project should definitely exist so this
+      // should never hit
+      VELOX_CHECK(plan_node != nullptr);
+      replace_op.push_back(std::make_unique<CudfFilterProject>(
+          id, ctx, info, id_projections, nullptr, plan_node));
+      replace_op.back()->initialize();
+    } else if (
+        auto localPartitionOp = dynamic_cast<exec::LocalPartition*>(oper)) {
+      auto plan_node =
+          std::dynamic_pointer_cast<const core::LocalPartitionNode>(
+              get_plan_node(localPartitionOp->planNodeId()));
+      VELOX_CHECK(plan_node != nullptr);
+      replace_op.push_back(
+          std::make_unique<CudfLocalPartition>(id, ctx, plan_node));
+      replace_op.back()->initialize();
+    }
+
+    if (next_operator_is_not_gpu and produces_gpu_output(oper)) {
+      auto plan_node = get_plan_node(oper->planNodeId());
+      replace_op.push_back(std::make_unique<CudfToVelox>(
+          id, plan_node->outputType(), ctx, plan_node->id() + "-to-velox"));
+      replace_op.back()->initialize();
+    }
+
+    if (not replace_op.empty()) {
+      operatorsOffset +=
+          replace_op.size() - 1 + keep_operator; // Check this "- 1"
+      [[maybe_unused]] auto replaced = driverFactory_.replaceOperators(
+          driver_,
+          replacingOperatorIndex + keep_operator,
+          replacingOperatorIndex + 1,
+          std::move(replace_op));
+      replacements_made = true;
+    }
+  }
+
+  if (cudfDebugEnabled()) {
+    operators = driver_.operators();
+    std::cout << "Number of new operators: " << operators.size() << std::endl;
+    for (auto& op : operators) {
+      std::cout << "  Operator: ID " << op->operatorId() << ": "
+                << op->toString() << std::endl;
+    }
+  }
+  return replacements_made;
+}
+
+struct cudfDriverAdapter {
+  std::shared_ptr<rmm::mr::device_memory_resource> mr_;
+  std::shared_ptr<std::vector<std::shared_ptr<core::PlanNode const>>>
+      planNodes_;
+
+  cudfDriverAdapter(std::shared_ptr<rmm::mr::device_memory_resource> mr)
+      : mr_(mr) {
+    if (cudfDebugEnabled()) {
+      std::cout << "cudfDriverAdapter constructor" << std::endl;
+    }
+    planNodes_ =
+        std::make_shared<std::vector<std::shared_ptr<core::PlanNode const>>>();
+  }
+
+  ~cudfDriverAdapter() {
+    if (cudfDebugEnabled()) {
+      std::cout << "cudfDriverAdapter destructor" << std::endl;
+      printf(
+          "cached planNodes_ %p, %ld\n",
+          planNodes_.get(),
+          planNodes_.use_count());
+    }
+  }
+
+  // Call operator needed by DriverAdapter
+  bool operator()(const exec::DriverFactory& factory, exec::Driver& driver) {
+    auto state = CompileState(factory, driver, *planNodes_);
+    // Stored planNodes_ from inspect.
+    if (cudfDebugEnabled()) {
+      printf("driver.planNodes_=%p\n", planNodes_.get());
+    }
+    auto res = state.compile();
+    return res;
+  }
+
+  // Iterate recursively and store them in the planNodes_.
+  void storePlanNodes(const std::shared_ptr<const core::PlanNode>& planNode) {
+    const auto& sources = planNode->sources();
+    for (int32_t i = 0; i < sources.size(); ++i) {
+      storePlanNodes(sources[i]);
+    }
+    planNodes_->push_back(planNode);
+  }
+
+  // Call operator needed by plan inspection
+  void operator()(const core::PlanFragment& planFragment) {
+    // signature: std::function<void(const core::PlanFragment&)> inspect;
+    // call: adapter.inspect(planFragment);
+    planNodes_->clear();
+    if (cudfDebugEnabled()) {
+      std::cout << "Inspecting PlanFragment" << std::endl;
+    }
+    if (planNodes_) {
+      storePlanNodes(planFragment.planNode);
+    }
+  }
+};
+
+void registerCudf() {
+  const char* env_cudf_disabled = std::getenv("VELOX_CUDF_DISABLED");
+  if (env_cudf_disabled != nullptr && std::stoi(env_cudf_disabled)) {
+    return;
+  }
+
+  CUDF_FUNC_RANGE();
+  cudaFree(0); // to init context.
+
+  if (cudfDebugEnabled()) {
+    std::cout << "Registering CudfHashJoinBridgeTranslator" << std::endl;
+  }
+  exec::Operator::registerOperator(
+      std::make_unique<CudfHashJoinBridgeTranslator>());
+  if (cudfDebugEnabled()) {
+    std::cout << "Registering cudfDriverAdapter" << std::endl;
+  }
+
+  const char* env_cudf_mr = std::getenv("VELOX_CUDF_MEMORY_RESOURCE");
+  auto mr_mode = env_cudf_mr != nullptr ? env_cudf_mr : "async";
+  if (cudfDebugEnabled()) {
+    std::cout << "Setting cuDF memory resource to " << mr_mode << std::endl;
+  }
+  auto mr = cudf_velox::create_memory_resource(mr_mode);
+  cudf::set_current_device_resource(mr.get());
+  cudfDriverAdapter cda{mr};
+  exec::DriverAdapter cudfAdapter{"cuDF", cda, cda};
+  exec::DriverFactory::registerAdapter(cudfAdapter);
+  _cudfIsRegistered = true;
+}
+
+void unregisterCudf() {
+  if (cudfDebugEnabled()) {
+    std::cout << "Unregistering cudfDriverAdapter" << std::endl;
+  }
+  exec::DriverFactory::adapters.clear();
+  _cudfIsRegistered = false;
+}
+
+bool cudfIsRegistered() {
+  return _cudfIsRegistered;
+}
+
+} // namespace facebook::velox::cudf_velox
diff --git a/velox/experimental/cudf/exec/ToCudf.h b/velox/experimental/cudf/exec/ToCudf.h
new file mode 100644
index 00000000000..8da0eba26ae
--- /dev/null
+++ b/velox/experimental/cudf/exec/ToCudf.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "velox/exec/Driver.h"
+#include "velox/exec/Operator.h"
+
+namespace facebook::velox::cudf_velox {
+
+class CompileState {
+ public:
+  CompileState(
+      const exec::DriverFactory& driverFactory,
+      exec::Driver& driver,
+      std::vector<std::shared_ptr<core::PlanNode const>>& planNodes)
+      : driverFactory_(driverFactory), driver_(driver), planNodes_(planNodes) {}
+
+  exec::Driver& driver() {
+    return driver_;
+  }
+
+  // Replaces sequences of Operators in the Driver given at construction with
+  // cuDF equivalents. Returns true if the Driver was changed.
+  bool compile();
+
+  const exec::DriverFactory& driverFactory_;
+  exec::Driver& driver_;
+  const std::vector<std::shared_ptr<core::PlanNode const>>& planNodes_;
+};
+
+/// Registers adapter to add cuDF operators to Drivers.
+void registerCudf();
+void unregisterCudf();
+
+/// Returns true if cuDF is registered.
+bool cudfIsRegistered();
+
+} // namespace facebook::velox::cudf_velox
diff --git a/velox/experimental/cudf/exec/Utilities.cpp b/velox/experimental/cudf/exec/Utilities.cpp
new file mode 100644
index 00000000000..f12b08a341c
--- /dev/null
+++ b/velox/experimental/cudf/exec/Utilities.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdlib>
+#include <memory>
+#include <string_view>
+
+#include "velox/experimental/cudf/exec/Utilities.h"
+
+#include <common/base/Exceptions.h>
+
+#include <rmm/mr/device/arena_memory_resource.hpp>
+#include <rmm/mr/device/cuda_async_memory_resource.hpp>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/managed_memory_resource.hpp>
+#include <rmm/mr/device/owning_wrapper.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <cudf/concatenate.hpp>
+#include <cudf/detail/utilities/stream_pool.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
+
+namespace facebook::velox::cudf_velox {
+
+namespace {
+auto make_cuda_mr() {
+  return std::make_shared<rmm::mr::cuda_memory_resource>();
+}
+
+auto make_pool_mr() {
+  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
+      make_cuda_mr(), rmm::percent_of_free_device_memory(50));
+}
+
+auto make_async_mr() {
+  return std::make_shared<rmm::mr::cuda_async_memory_resource>();
+}
+
+auto make_managed_mr() {
+  return std::make_shared<rmm::mr::managed_memory_resource>();
+}
+
+auto make_arena_mr() {
+  return rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(
+      make_cuda_mr());
+}
+
+auto make_managed_pool_mr() {
+  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
+      make_managed_mr(), rmm::percent_of_free_device_memory(50));
+}
+} // namespace
+
+std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(
+    std::string_view mode) {
+  if (mode == "cuda")
+    return make_cuda_mr();
+  if (mode == "pool")
+    return make_pool_mr();
+  if (mode == "async")
+    return make_async_mr();
+  if (mode == "arena")
+    return make_arena_mr();
+  if (mode == "managed")
+    return make_managed_mr();
+  if (mode == "managed_pool")
+    return make_managed_pool_mr();
+  throw cudf::logic_error(
+      "Unknown memory resource mode: " + std::string(mode) +
+      "\nExpecting: cuda, pool, async, arena, managed, or managed_pool");
+}
+
+cudf::detail::cuda_stream_pool& cudfGlobalStreamPool() {
+  return cudf::detail::global_cuda_stream_pool();
+};
+
+bool cudfDebugEnabled() {
+  const char* env_cudf_debug = std::getenv("VELOX_CUDF_DEBUG");
+  return env_cudf_debug != nullptr && std::stoi(env_cudf_debug);
+}
+
+std::unique_ptr<cudf::table> concatenateTables(
+    std::vector<std::unique_ptr<cudf::table>> tables,
+    rmm::cuda_stream_view stream) {
+  // Check for empty vector
+  VELOX_CHECK_GT(tables.size(), 0);
+
+  if (tables.size() == 1) {
+    return std::move(tables[0]);
+  }
+  std::vector<cudf::table_view> tableViews;
+  tableViews.reserve(tables.size());
+  std::transform(
+      tables.begin(),
+      tables.end(),
+      std::back_inserter(tableViews),
+      [&](auto const& tbl) { return tbl->view(); });
+  return cudf::concatenate(
+      tableViews, stream, cudf::get_current_device_resource_ref());
+}
+
+std::unique_ptr<cudf::table> getConcatenatedTable(
+    std::vector<CudfVectorPtr>& tables,
+    rmm::cuda_stream_view stream) {
+  // Check for empty vector
+  VELOX_CHECK_GT(tables.size(), 0);
+
+  auto inputStreams = std::vector<rmm::cuda_stream_view>();
+  auto tableViews = std::vector<cudf::table_view>();
+
+  inputStreams.reserve(tables.size());
+  tableViews.reserve(tables.size());
+
+  for (auto const& table : tables) {
+    VELOX_CHECK_NOT_NULL(table);
+    tableViews.push_back(table->getTableView());
+    inputStreams.push_back(table->stream());
+  }
+
+  cudf::detail::join_streams(inputStreams, stream);
+
+  if (tables.size() == 1) {
+    return tables[0]->release();
+  }
+
+  auto output = cudf::concatenate(
+      tableViews, stream, cudf::get_current_device_resource_ref());
+  stream.synchronize();
+  return output;
+}
+
+} // namespace facebook::velox::cudf_velox
diff --git a/velox/experimental/cudf/exec/Utilities.h b/velox/experimental/cudf/exec/Utilities.h
new file mode 100644
index 00000000000..87a496a8cff
--- /dev/null
+++ b/velox/experimental/cudf/exec/Utilities.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <memory>
+#include <string_view>
+
+#include "velox/experimental/cudf/vector/CudfVector.h"
+
+#include <cudf/detail/utilities/stream_pool.hpp>
+#include <cudf/table/table.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+namespace facebook::velox::cudf_velox {
+
+/**
+ * @brief Creates a memory resource based on the given mode.
+ */
+[[nodiscard]] std::shared_ptr<rmm::mr::device_memory_resource>
+create_memory_resource(std::string_view mode);
+
+/**
+ * @brief Returns the global CUDA stream pool used by cudf.
+ */
+[[nodiscard]] cudf::detail::cuda_stream_pool& cudfGlobalStreamPool();
+
+/**
+ * @brief Returns true if the VELOX_CUDF_DEBUG environment variable is set to a
+ * nonzero value.
+ */
+bool cudfDebugEnabled();
+
+// Concatenate a vector of cuDF tables into a single table
+std::unique_ptr<cudf::table> concatenateTables(
+    std::vector<std::unique_ptr<cudf::table>> tables,
+    rmm::cuda_stream_view stream);
+
+// Concatenate a vector of cuDF tables into a single table.
+// This function joins the streams owned by individual tables on the passed
+// stream. Inputs are not safe to use after calling this function.
+std::unique_ptr<cudf::table> getConcatenatedTable(
+    std::vector<CudfVectorPtr>& tables,
+    rmm::cuda_stream_view stream);
+
+} // namespace facebook::velox::cudf_velox
diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
new file mode 100644
index 00000000000..3fd2b046787
--- /dev/null
+++ b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
@@ -0,0 +1,508 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/common/memory/Memory.h"
+#include "velox/type/Type.h"
+#include "velox/vector/BaseVector.h"
+#include "velox/vector/ComplexVector.h"
+#include "velox/vector/DictionaryVector.h"
+#include "velox/vector/FlatVector.h"
+#include "velox/vector/arrow/Bridge.h"
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/interop.hpp>
+#include <cudf/strings/string_view.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_buffer.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+
+#include <nvtx3/nvtx3.hpp>
+#include <thrust/copy.h>
+#include <thrust/execution_policy.h>
+
+#include "velox/experimental/cudf/exec/NvtxHelper.h"
+#include "velox/experimental/cudf/exec/Utilities.h"
+#include "velox/experimental/cudf/exec/VeloxCudfInterop.h"
+
+#include <functional>
+#include <numeric>
+
+#include <arrow/c/bridge.h>
+#include <arrow/io/interfaces.h>
+#include <arrow/table.h>
+
+namespace facebook::velox::cudf_velox {
+
+namespace {
+
+template <typename Value, typename Functor, typename... Ts>
+constexpr decltype(auto)
+vector_encoding_dispatcher(VectorPtr vec, Functor f, Ts&&... args) {
+  using facebook::velox::VectorEncoding::Simple;
+  switch (vec->encoding()) {
+    case Simple::FLAT:
+      return f(vec->as<FlatVector<Value>>(), std::forward<Ts>(args)...);
+    case Simple::DICTIONARY:
+      return f(vec->as<DictionaryVector<Value>>(), std::forward<Ts>(args)...);
+    default: {
+      if (cudfDebugEnabled()) {
+        std::cout << "Unsupported Velox encoding: " << vec->encoding()
+                  << std::endl;
+      }
+      CUDF_FAIL("Unsupported Velox encoding");
+    }
+  }
+}
+
+// TODO: dispatch other duration/timestamp types!
+template <typename T>
+using cudf_storage_type_t = std::conditional_t<
+    std::is_same_v<cudf::timestamp_D, T>,
+    cudf::timestamp_D::rep,
+    cudf::device_storage_type_t<T>>;
+
+} // namespace
+
+cudf::type_id velox_to_cudf_type_id(const TypePtr& type) {
+  if (cudfDebugEnabled()) {
+    std::cout << "Converting Velox type " << type->toString() << " to cudf"
+              << std::endl;
+  }
+  switch (type->kind()) {
+    case TypeKind::BOOLEAN:
+      return cudf::type_id::BOOL8;
+    case TypeKind::TINYINT:
+      return cudf::type_id::INT8;
+    case TypeKind::SMALLINT:
+      return cudf::type_id::INT16;
+    case TypeKind::INTEGER:
+      // TODO: handle interval types (durations?)
+      // if (type->isIntervalYearMonth()) {
+      //   return cudf::type_id::...;
+      // }
+      if (type->isDate()) {
+        return cudf::type_id::TIMESTAMP_DAYS;
+      }
+      return cudf::type_id::INT32;
+    case TypeKind::BIGINT:
+      return cudf::type_id::INT64;
+    case TypeKind::REAL:
+      return cudf::type_id::FLOAT32;
+    case TypeKind::DOUBLE:
+      return cudf::type_id::FLOAT64;
+    case TypeKind::VARCHAR:
+      return cudf::type_id::STRING;
+    case TypeKind::VARBINARY:
+      return cudf::type_id::STRING;
+    case TypeKind::TIMESTAMP:
+      return cudf::type_id::TIMESTAMP_NANOSECONDS;
+    // case TypeKind::HUGEINT: return cudf::type_id::DURATION_DAYS;
+    // TODO: DATE was converted to a logical type:
+    // https://github.com/facebookincubator/velox/commit/e480f5c03a6c47897ef4488bd56918a89719f908
+    // case TypeKind::DATE: return cudf::type_id::DURATION_DAYS;
+    // case TypeKind::INTERVAL_DAY_TIME: return cudf::type_id::EMPTY;
+    // TODO: Decimals are now logical types:
+    // https://github.com/facebookincubator/velox/commit/73d2f935b55f084d30557c7be94b9768efb8e56f
+    // case TypeKind::SHORT_DECIMAL: return cudf::type_id::DECIMAL64;
+    // case TypeKind::LONG_DECIMAL: return cudf::type_id::DECIMAL128;
+    // case TypeKind::ARRAY: return cudf::type_id::EMPTY;
+    // case TypeKind::MAP: return cudf::type_id::EMPTY;
+    case TypeKind::ROW:
+      return cudf::type_id::STRUCT;
+    // case TypeKind::UNKNOWN: return cudf::type_id::EMPTY;
+    // case TypeKind::FUNCTION: return cudf::type_id::EMPTY;
+    // case TypeKind::OPAQUE: return cudf::type_id::EMPTY;
+    // case TypeKind::INVALID: return cudf::type_id::EMPTY;
+    default:
+      CUDF_FAIL("Unsupported Velox type");
+      return cudf::type_id::EMPTY;
+  }
+}
+
+TypePtr cudf_type_id_to_velox_type(cudf::type_id type_id) {
+  switch (type_id) {
+    case cudf::type_id::BOOL8:
+      return BOOLEAN();
+    case cudf::type_id::INT8:
+      return TINYINT();
+    case cudf::type_id::INT16:
+      return SMALLINT();
+    case cudf::type_id::INT32:
+      return INTEGER();
+    case cudf::type_id::INT64:
+      return BIGINT();
+    case cudf::type_id::FLOAT32:
+      return REAL();
+    case cudf::type_id::FLOAT64:
+      return DOUBLE();
+    case cudf::type_id::STRING:
+      return VARCHAR();
+    case cudf::type_id::TIMESTAMP_DAYS:
+      return DATE();
+    case cudf::type_id::TIMESTAMP_NANOSECONDS:
+      return TIMESTAMP();
+    // TODO: DATE is now a logical type
+    // case cudf::type_id::DURATION_DAYS: return ???;
+    // case cudf::type_id::EMPTY: return TypeKind::INTERVAL_DAY_TIME;
+    // TODO: DECIMAL is now a logical type
+    // case cudf::type_id::DECIMAL64: return TypeKind::SHORT_DECIMAL;
+    // case cudf::type_id::DECIMAL128: return TypeKind::LONG_DECIMAL;
+    // case cudf::type_id::EMPTY: return TypeKind::ARRAY;
+    // case cudf::type_id::EMPTY: return TypeKind::MAP;
+    // case cudf::type_id::STRUCT:
+    //   // TODO: Need parametric type support?
+    //   return ROW();
+    // case cudf::type_id::EMPTY: return TypeKind::OPAQUE;
+    // case cudf::type_id::EMPTY: return TypeKind::UNKNOWN;
+    default:
+      return UNKNOWN();
+  }
+}
+
+// Convert a Velox vector to a CUDF column
+struct copy_to_device {
+  rmm::cuda_stream_view stream;
+
+  // Fixed width types
+  template <
+      typename T,
+      std::enable_if_t<cudf::is_rep_layout_compatible<T>()>* = nullptr>
+  std::unique_ptr<cudf::column> operator()(VectorPtr const& h_vec) const {
+    VELOX_CHECK_NOT_NULL(h_vec);
+    using velox_T = cudf_storage_type_t<T>;
+    if (cudfDebugEnabled()) {
+      std::cout << "Converting fixed width column" << std::endl;
+      std::cout << "Encoding: " << h_vec->encoding() << std::endl;
+      std::cout << "Type: " << h_vec->type()->toString() << std::endl;
+      std::cout << "velox_T: " << typeid(velox_T{}).name() << std::endl;
+    }
+    auto velox_data = h_vec->as<FlatVector<velox_T>>();
+    VELOX_CHECK_NOT_NULL(velox_data);
+    auto velox_data_ptr = velox_data->rawValues();
+    cudf::host_span<const velox_T> velox_host_span(
+        velox_data_ptr, int{h_vec->size()});
+    auto d_v = cudf::detail::make_device_uvector_sync(
+        velox_host_span, stream, rmm::mr::get_current_device_resource());
+    return std::make_unique<cudf::column>(
+        std::move(d_v), rmm::device_buffer{}, 0);
+  }
+
+  // Strings
+  template <
+      typename T,
+      std::enable_if_t<std::is_same_v<T, cudf::string_view>>* = nullptr>
+  std::unique_ptr<cudf::column> operator()(VectorPtr const& h_vec) const {
+    if (cudfDebugEnabled()) {
+      std::cout << "Converting string column" << std::endl;
+    }
+
+    auto const num_rows = h_vec->size();
+    auto h_offsets = std::vector<int64_t>(num_rows + 1);
+    h_offsets[0] = 0;
+    auto make_offsets = [&](auto const& vec) {
+      VELOX_CHECK_NOT_NULL(vec);
+      if (cudfDebugEnabled()) {
+        std::cout << "Starting offset calculation" << std::endl;
+      }
+      for (auto i = 0; i < num_rows; i++) {
+        h_offsets[i + 1] = h_offsets[i] + vec->valueAt(i).size();
+      }
+    };
+    vector_encoding_dispatcher<StringView>(h_vec, make_offsets);
+
+    auto d_offsets = cudf::detail::make_device_uvector_sync(
+        h_offsets, stream, rmm::mr::get_current_device_resource());
+
+    auto chars_size = h_offsets[num_rows];
+    auto h_chars = std::vector<char>(chars_size);
+
+    auto make_chars = [&](auto vec) {
+      VELOX_CHECK_NOT_NULL(vec);
+      for (auto i = 0; i < num_rows; i++) {
+        auto const string_view = vec->valueAt(i);
+        auto const size = string_view.size();
+        auto const offset = h_offsets[i];
+        std::copy(
+            string_view.data(),
+            string_view.data() + size,
+            h_chars.begin() + offset);
+      }
+    };
+    vector_encoding_dispatcher<StringView>(h_vec, make_chars);
+
+    auto d_chars = cudf::detail::make_device_uvector_sync(
+        h_chars, stream, rmm::mr::get_current_device_resource());
+
+    return cudf::make_strings_column(
+        num_rows,
+        std::make_unique<cudf::column>(
+            std::move(d_offsets), rmm::device_buffer{}, 0),
+        d_chars.release(),
+        0,
+        rmm::device_buffer{});
+  }
+
+  template <
+      typename T,
+      typename... Args,
+      std::enable_if_t<
+          not(cudf::is_rep_layout_compatible<T>() or
+              std::is_same_v<T, cudf::string_view>)>* = nullptr>
+  std::unique_ptr<cudf::column> operator()(VectorPtr const& h_vec) const {
+    if (cudfDebugEnabled()) {
+      std::string error_message = "Unsupported type for to_cudf conversion: ";
+      error_message += h_vec->type()->toString();
+      std::cout << error_message << std::endl;
+    }
+    CUDF_FAIL("Unsupported type for to_cudf conversion");
+  }
+};
+
+// Row vector to table
+// Vector to column
+// template<bool is_table=true>
+std::unique_ptr<cudf::table> to_cudf_table(const RowVectorPtr& leftBatch) {
+  VELOX_NVTX_FUNC_RANGE();
+  // cudf type dispatcher to copy data from velox vector to cudf column
+  using cudf_col_ptr = std::unique_ptr<cudf::column>;
+  std::vector<cudf_col_ptr> cudf_columns;
+  auto copier = copy_to_device{cudf::get_default_stream()};
+  for (auto const& h_vec : leftBatch->children()) {
+    auto cudf_kind = cudf::data_type{velox_to_cudf_type_id(h_vec->type())};
+    auto cudf_column = cudf::type_dispatcher(cudf_kind, copier, h_vec);
+    cudf_columns.push_back(std::move(cudf_column));
+  }
+  return std::make_unique<cudf::table>(std::move(cudf_columns));
+}
+
+// Convert a CUDF column to a Velox vector
+struct copy_to_host {
+  rmm::cuda_stream_view stream;
+  memory::MemoryPool* pool_;
+
+  template <typename T>
+  static constexpr bool is_supported() {
+    // return cudf::is_rep_layout_compatible<T>();
+    return cudf::is_numeric<T>() and not std::is_same<T, bool>::value;
+  }
+
+  // Fixed width types
+  template <typename T, std::enable_if_t<is_supported<T>()>* = nullptr>
+  VectorPtr operator()(TypePtr velox_type, cudf::column_view const& col) const {
+    auto velox_buffer = AlignedBuffer::allocate<T>(col.size(), pool_);
+    auto velox_col = std::make_shared<FlatVector<T>>(
+        pool_,
+        velox_type,
+        nullptr,
+        col.size(),
+        velox_buffer,
+        std::vector<BufferPtr>{});
+    auto velox_data_ptr = velox_col->mutableRawValues();
+    CUDF_CUDA_TRY(cudaMemcpyAsync(
+        velox_data_ptr,
+        col.data<T>(),
+        col.size() * sizeof(T),
+        cudaMemcpyDefault,
+        stream.value()));
+    stream.synchronize();
+    return velox_col;
+  }
+
+  template <
+      typename T,
+      typename... Args,
+      std::enable_if_t<not is_supported<T>()>* = nullptr>
+  VectorPtr operator()(Args... args) const {
+    CUDF_FAIL("Unsupported type for to_velox conversion");
+  }
+};
+
+VectorPtr to_velox_column(
+    const cudf::column_view& col,
+    memory::MemoryPool* pool) {
+  VELOX_NVTX_PRETTY_FUNC_RANGE();
+  auto velox_type = cudf_type_id_to_velox_type(col.type().id());
+  if (cudfDebugEnabled()) {
+    std::cout << "Converting to_velox_column: " << velox_type->toString()
+              << std::endl;
+  }
+  // cudf type dispatcher to copy data from cudf column to velox vector
+  auto copier = copy_to_host{cudf::get_default_stream(), pool};
+  return cudf::type_dispatcher(col.type(), copier, velox_type, col);
+}
+
+RowVectorPtr to_velox_column(
+    const cudf::table_view& table,
+    memory::MemoryPool* pool,
+    std::string name_prefix) {
+  VELOX_NVTX_PRETTY_FUNC_RANGE();
+  std::vector<VectorPtr> children;
+  std::vector<std::string> childNames;
+  std::vector<std::shared_ptr<const Type>> childTypes;
+  children.reserve(table.num_columns());
+  childNames.reserve(table.num_columns());
+  for (auto& col : table) {
+    children.push_back(to_velox_column(col, pool));
+    childNames.push_back(name_prefix + std::to_string(childNames.size()));
+  }
+
+  childTypes.reserve(children.size());
+  for (const auto& child : children) {
+    childTypes.push_back(child->type());
+  }
+  auto rowType = ROW(std::move(childNames), std::move(childTypes));
+  const size_t vectorSize = children.empty() ? 0 : children.front()->size();
+
+  return std::make_shared<RowVector>(
+      pool, rowType, BufferPtr(nullptr), vectorSize, children);
+}
+
+namespace with_arrow {
+
+std::unique_ptr<cudf::table> to_cudf_table(
+    const facebook::velox::RowVectorPtr& veloxTable, // BaseVector or RowVector?
+    facebook::velox::memory::MemoryPool* pool,
+    rmm::cuda_stream_view stream) {
+  // Need to flattenDictionary and flattenConstant, otherwise we observe issues
+  // in the null mask.
+  ArrowOptions arrowOptions{true, true};
+  ArrowArray arrowArray;
+  exportToArrow(
+      std::dynamic_pointer_cast<facebook::velox::BaseVector>(veloxTable),
+      arrowArray,
+      pool,
+      arrowOptions);
+  ArrowSchema arrowSchema;
+  exportToArrow(
+      std::dynamic_pointer_cast<facebook::velox::BaseVector>(veloxTable),
+      arrowSchema,
+      arrowOptions);
+  auto tbl = cudf::from_arrow(&arrowSchema, &arrowArray, stream);
+
+  // Release Arrow resources
+  if (arrowArray.release) {
+    arrowArray.release(&arrowArray);
+  }
+  if (arrowSchema.release) {
+    arrowSchema.release(&arrowSchema);
+  }
+  return tbl;
+}
+
+namespace {
+
+void to_signed_int_format(char* format) {
+  VELOX_CHECK_NOT_NULL(format);
+  switch (format[0]) {
+    case 'C':
+      format[0] = 'c';
+      break;
+    case 'S':
+      format[0] = 's';
+      break;
+    case 'I':
+      format[0] = 'i';
+      break;
+    case 'L':
+      format[0] = 'l';
+      break;
+    default:
+      return;
+  }
+  printf(
+      "Warning: arrowSchema.format: %s, unsigned is treated as signed indices\n",
+      format);
+}
+
+// Changes all unsigned indices to signed indices for dictionary columns from
+// cudf which uses unsigned indices, but velox uses signed indices.
+void fix_dictionary_indices(ArrowSchema& arrowSchema) {
+  if (arrowSchema.dictionary != nullptr) {
+    to_signed_int_format(const_cast<char*>(arrowSchema.format));
+    fix_dictionary_indices(*arrowSchema.dictionary);
+  }
+  for (size_t i = 0; i < arrowSchema.n_children; ++i) {
+    VELOX_CHECK_NOT_NULL(arrowSchema.children[i]);
+    fix_dictionary_indices(*arrowSchema.children[i]);
+  }
+}
+
+RowVectorPtr to_velox_column(
+    const cudf::table_view& table,
+    memory::MemoryPool* pool,
+    const std::vector<cudf::column_metadata>& metadata,
+    rmm::cuda_stream_view stream) {
+  auto arrowDeviceArray = cudf::to_arrow_host(table, stream);
+  auto& arrowArray = arrowDeviceArray->array;
+
+  auto arrowSchema = cudf::to_arrow_schema(table, metadata);
+  // Hack to convert unsigned indices to signed indices for dictionary columns
+  fix_dictionary_indices(*arrowSchema);
+
+  auto veloxTable = importFromArrowAsOwner(*arrowSchema, arrowArray, pool);
+  // BaseVector to RowVector
+  auto casted_ptr =
+      std::dynamic_pointer_cast<facebook::velox::RowVector>(veloxTable);
+  VELOX_CHECK_NOT_NULL(casted_ptr);
+  return casted_ptr;
+}
+
+template <typename Iterator>
+std::vector<cudf::column_metadata>
+get_metadata(Iterator begin, Iterator end, const std::string& name_prefix) {
+  std::vector<cudf::column_metadata> metadata;
+  int i = 0;
+  for (auto c = begin; c < end; c++) {
+    metadata.push_back(cudf::column_metadata(name_prefix + std::to_string(i)));
+    metadata.back().children_meta = get_metadata(
+        c->child_begin(), c->child_end(), name_prefix + std::to_string(i));
+    i++;
+  }
+  return metadata;
+}
+
+} // namespace
+
+facebook::velox::RowVectorPtr to_velox_column(
+    const cudf::table_view& table,
+    facebook::velox::memory::MemoryPool* pool,
+    std::string name_prefix,
+    rmm::cuda_stream_view stream) {
+  auto metadata = get_metadata(table.begin(), table.end(), name_prefix);
+  return to_velox_column(table, pool, metadata, stream);
+}
+
+RowVectorPtr to_velox_column(
+    const cudf::table_view& table,
+    memory::MemoryPool* pool,
+    const std::vector<std::string>& columnNames,
+    rmm::cuda_stream_view stream) {
+  std::vector<cudf::column_metadata> metadata;
+  for (auto name : columnNames) {
+    metadata.emplace_back(cudf::column_metadata(name));
+  }
+  return to_velox_column(table, pool, metadata, stream);
+}
+
+} // namespace with_arrow
+} // namespace facebook::velox::cudf_velox
diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.h b/velox/experimental/cudf/exec/VeloxCudfInterop.h
new file mode 100644
index 00000000000..4bd88c1a125
--- /dev/null
+++ b/velox/experimental/cudf/exec/VeloxCudfInterop.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "velox/common/memory/Memory.h"
+#include "velox/vector/BaseVector.h"
+#include "velox/vector/ComplexVector.h"
+
+#include <cudf/column/column.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/types.hpp>
+
+namespace facebook::velox::cudf_velox {
+
+cudf::type_id velox_to_cudf_type_id(const TypePtr& type);
+TypePtr cudf_type_id_to_velox_type(cudf::type_id type_id);
+
+[[deprecated(
+    "Use with_arrow::to_cudf_table instead")]] std::unique_ptr<cudf::table>
+to_cudf_table(const facebook::velox::RowVectorPtr& leftBatch);
+facebook::velox::VectorPtr to_velox_column(
+    const cudf::column_view& col,
+    facebook::velox::memory::MemoryPool* pool);
+[[deprecated(
+    "Use with_arrow::to_velox_column instead")]] facebook::velox::RowVectorPtr
+to_velox_column(
+    const cudf::table_view& table,
+    facebook::velox::memory::MemoryPool* pool,
+    std::string name_prefix = "c");
+
+namespace with_arrow {
+std::unique_ptr<cudf::table> to_cudf_table(
+    const facebook::velox::RowVectorPtr& veloxTable,
+    facebook::velox::memory::MemoryPool* pool,
+    rmm::cuda_stream_view stream);
+
+facebook::velox::RowVectorPtr to_velox_column(
+    const cudf::table_view& table,
+    facebook::velox::memory::MemoryPool* pool,
+    std::string name_prefix,
+    rmm::cuda_stream_view stream);
+
+facebook::velox::RowVectorPtr to_velox_column(
+    const cudf::table_view& table,
+    facebook::velox::memory::MemoryPool* pool,
+    const std::vector<std::string>& columnNames,
+    rmm::cuda_stream_view stream);
+} // namespace with_arrow
+
+} // namespace facebook::velox::cudf_velox
diff --git a/velox/experimental/cudf/tests/CMakeLists.txt b/velox/experimental/cudf/tests/CMakeLists.txt
new file mode 100644
index 00000000000..1847f00cd6d
--- /dev/null
+++ b/velox/experimental/cudf/tests/CMakeLists.txt
@@ -0,0 +1,33 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+add_executable(velox_cudf_order_by_test Main.cpp OrderByTest.cpp)
+
+add_test(
+  NAME velox_cudf_order_by_test
+  COMMAND velox_cudf_order_by_test
+  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+
+set_tests_properties(velox_cudf_order_by_test PROPERTIES LABELS cuda_driver
+                                                         TIMEOUT 3000)
+
+target_link_libraries(
+  velox_cudf_order_by_test
+  velox_cudf_exec
+  velox_exec
+  velox_exec_test_lib
+  velox_test_util
+  gtest
+  gtest_main
+  fmt::fmt)
diff --git a/velox/experimental/cudf/tests/OrderByTest.cpp b/velox/experimental/cudf/tests/OrderByTest.cpp
new file mode 100644
index 00000000000..ff47198a3a8
--- /dev/null
+++ b/velox/experimental/cudf/tests/OrderByTest.cpp
@@ -0,0 +1,417 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <re2/re2.h>
+
+#include <fmt/format.h>
+#include "velox/common/base/tests/GTestUtils.h"
+#include "velox/core/QueryConfig.h"
+#include "velox/dwio/common/tests/utils/BatchMaker.h"
+#include "velox/exec/PlanNodeStats.h"
+#include "velox/exec/tests/utils/AssertQueryBuilder.h"
+#include "velox/exec/tests/utils/OperatorTestBase.h"
+#include "velox/exec/tests/utils/PlanBuilder.h"
+#include "velox/experimental/cudf/exec/ToCudf.h"
+#include "velox/experimental/cudf/exec/Utilities.h"
+
+using namespace facebook::velox;
+using namespace facebook::velox::exec;
+using namespace facebook::velox::exec::test;
+using namespace facebook::velox::common::testutil;
+
+using facebook::velox::test::BatchMaker;
+namespace {
+
+class OrderByTest : public OperatorTestBase {
+ protected:
+  void SetUp() override {
+    OperatorTestBase::SetUp();
+    filesystems::registerLocalFileSystem();
+    cudf_velox::registerCudf();
+    rng_.seed(123);
+
+    rowType_ = ROW(
+        {{"c0", INTEGER()},
+         {"c1", INTEGER()},
+         {"c2", VARCHAR()},
+         {"c3", VARCHAR()}});
+  }
+
+  void TearDown() override {
+    cudf_velox::unregisterCudf();
+    OperatorTestBase::TearDown();
+  }
+
+  void testSingleKey(
+      const std::vector<RowVectorPtr>& input,
+      const std::string& key) {
+    core::PlanNodeId orderById;
+    auto keyIndex = input[0]->type()->asRow().getChildIdx(key);
+    auto plan = PlanBuilder()
+                    .values(input)
+                    .orderBy({fmt::format("{} ASC NULLS LAST", key)}, false)
+                    .capturePlanNodeId(orderById)
+                    .planNode();
+    runTest(
+        plan,
+        orderById,
+        fmt::format("SELECT * FROM tmp ORDER BY {} NULLS LAST", key),
+        {keyIndex});
+
+    plan = PlanBuilder()
+               .values(input)
+               .orderBy({fmt::format("{} DESC NULLS FIRST", key)}, false)
+               .planNode();
+    runTest(
+        plan,
+        orderById,
+        fmt::format("SELECT * FROM tmp ORDER BY {} DESC NULLS FIRST", key),
+        {keyIndex});
+  }
+
+  void testSingleKey(
+      const std::vector<RowVectorPtr>& input,
+      const std::string& key,
+      const std::string& filter) {
+    core::PlanNodeId orderById;
+    auto keyIndex = input[0]->type()->asRow().getChildIdx(key);
+    auto plan = PlanBuilder()
+                    .values(input)
+                    .filter(filter)
+                    .orderBy({fmt::format("{} ASC NULLS LAST", key)}, false)
+                    .capturePlanNodeId(orderById)
+                    .planNode();
+    runTest(
+        plan,
+        orderById,
+        fmt::format(
+            "SELECT * FROM tmp WHERE {} ORDER BY {} NULLS LAST", filter, key),
+        {keyIndex});
+
+    plan = PlanBuilder()
+               .values(input)
+               .filter(filter)
+               .orderBy({fmt::format("{} DESC NULLS FIRST", key)}, false)
+               .capturePlanNodeId(orderById)
+               .planNode();
+    runTest(
+        plan,
+        orderById,
+        fmt::format(
+            "SELECT * FROM tmp WHERE {} ORDER BY {} DESC NULLS FIRST",
+            filter,
+            key),
+        {keyIndex});
+  }
+
+  void testTwoKeys(
+      const std::vector<RowVectorPtr>& input,
+      const std::string& key1,
+      const std::string& key2) {
+    auto& rowType = input[0]->type()->asRow();
+    auto keyIndices = {rowType.getChildIdx(key1), rowType.getChildIdx(key2)};
+
+    std::vector<core::SortOrder> sortOrders = {
+        core::kAscNullsLast, core::kDescNullsFirst};
+    std::vector<std::string> sortOrderSqls = {"NULLS LAST", "DESC NULLS FIRST"};
+
+    for (int i = 0; i < sortOrders.size(); i++) {
+      for (int j = 0; j < sortOrders.size(); j++) {
+        core::PlanNodeId orderById;
+        auto plan = PlanBuilder()
+                        .values(input)
+                        .orderBy(
+                            {fmt::format("{} {}", key1, sortOrderSqls[i]),
+                             fmt::format("{} {}", key2, sortOrderSqls[j])},
+                            false)
+                        .capturePlanNodeId(orderById)
+                        .planNode();
+        runTest(
+            plan,
+            orderById,
+            fmt::format(
+                "SELECT * FROM tmp ORDER BY {} {}, {} {}",
+                key1,
+                sortOrderSqls[i],
+                key2,
+                sortOrderSqls[j]),
+            keyIndices);
+      }
+    }
+  }
+
+  void runTest(
+      core::PlanNodePtr planNode,
+      const core::PlanNodeId& orderById,
+      const std::string& duckDbSql,
+      const std::vector<uint32_t>& sortingKeys) {
+    {
+      SCOPED_TRACE("run without spilling");
+      assertQueryOrdered(planNode, duckDbSql, sortingKeys);
+    }
+  }
+
+  std::vector<RowVectorPtr> makeVectors(
+      const RowTypePtr& rowType,
+      int32_t numVectors,
+      int32_t rowsPerVector) {
+    std::vector<RowVectorPtr> vectors;
+    for (int32_t i = 0; i < numVectors; ++i) {
+      auto vector = std::dynamic_pointer_cast<RowVector>(
+          facebook::velox::test::BatchMaker::createBatch(
+              rowType, rowsPerVector, *pool_));
+      vectors.push_back(vector);
+    }
+    return vectors;
+  }
+
+  folly::Random::DefaultGenerator rng_;
+  RowTypePtr rowType_;
+};
+
+TEST_F(OrderByTest, selectiveFilter) {
+  vector_size_t batchSize = 1000;
+  std::vector<RowVectorPtr> vectors;
+  for (int32_t i = 0; i < 3; ++i) {
+    auto c0 = makeFlatVector<int64_t>(
+        batchSize,
+        [&](vector_size_t row) { return batchSize * i + row; },
+        nullEvery(5));
+    auto c1 = makeFlatVector<int64_t>(
+        batchSize, [&](vector_size_t row) { return row; }, nullEvery(5));
+    auto c2 = makeFlatVector<double>(
+        batchSize, [](vector_size_t row) { return row * 0.1; }, nullEvery(11));
+    vectors.push_back(makeRowVector({c0, c1, c2}));
+  }
+  createDuckDbTable(vectors);
+
+  // c0 values are unique across batches
+  testSingleKey(vectors, "c0", "c0 % 333 = 0");
+
+  // c1 values are unique only within a batch
+  testSingleKey(vectors, "c1", "c1 % 333 = 0");
+}
+
+TEST_F(OrderByTest, singleKey) {
+  vector_size_t batchSize = 1000;
+  std::vector<RowVectorPtr> vectors;
+  for (int32_t i = 0; i < 2; ++i) {
+    auto c0 = makeFlatVector<int64_t>(
+        batchSize, [&](vector_size_t row) { return row; }, nullEvery(5));
+    auto c1 = makeFlatVector<double>(
+        batchSize, [](vector_size_t row) { return row * 0.1; }, nullEvery(11));
+    vectors.push_back(makeRowVector({c0, c1}));
+  }
+  createDuckDbTable(vectors);
+
+  testSingleKey(vectors, "c0");
+
+  // parser doesn't support "is not null" expression, hence, using c0 % 2 >= 0
+  testSingleKey(vectors, "c0", "c0 % 2 >= 0");
+
+  core::PlanNodeId orderById;
+  auto plan = PlanBuilder()
+                  .values(vectors)
+                  .orderBy({"c0 DESC NULLS LAST"}, false)
+                  .capturePlanNodeId(orderById)
+                  .planNode();
+  runTest(
+      plan, orderById, "SELECT * FROM tmp ORDER BY c0 DESC NULLS LAST", {0});
+
+  plan = PlanBuilder()
+             .values(vectors)
+             .orderBy({"c0 ASC NULLS FIRST"}, false)
+             .capturePlanNodeId(orderById)
+             .planNode();
+  runTest(plan, orderById, "SELECT * FROM tmp ORDER BY c0 NULLS FIRST", {0});
+}
+
+TEST_F(OrderByTest, multipleKeys) {
+  vector_size_t batchSize = 1000;
+  std::vector<RowVectorPtr> vectors;
+  for (int32_t i = 0; i < 2; ++i) {
+    // c0: half of rows are null, a quarter is 0 and remaining quarter is 1
+    auto c0 = makeFlatVector<int64_t>(
+        batchSize, [](vector_size_t row) { return row % 4; }, nullEvery(2, 1));
+    auto c1 = makeFlatVector<int32_t>(
+        batchSize, [](vector_size_t row) { return row; }, nullEvery(7));
+    auto c2 = makeFlatVector<double>(
+        batchSize, [](vector_size_t row) { return row * 0.1; }, nullEvery(11));
+    vectors.push_back(makeRowVector({c0, c1, c2}));
+  }
+  createDuckDbTable(vectors);
+
+  testTwoKeys(vectors, "c0", "c1");
+
+  core::PlanNodeId orderById;
+  auto plan = PlanBuilder()
+                  .values(vectors)
+                  .orderBy({"c0 ASC NULLS FIRST", "c1 ASC NULLS LAST"}, false)
+                  .capturePlanNodeId(orderById)
+                  .planNode();
+  runTest(
+      plan,
+      orderById,
+      "SELECT * FROM tmp ORDER BY c0 NULLS FIRST, c1 NULLS LAST",
+      {0, 1});
+
+  plan = PlanBuilder()
+             .values(vectors)
+             .orderBy({"c0 DESC NULLS LAST", "c1 DESC NULLS FIRST"}, false)
+             .capturePlanNodeId(orderById)
+             .planNode();
+  runTest(
+      plan,
+      orderById,
+      "SELECT * FROM tmp ORDER BY c0 DESC NULLS LAST, c1 DESC NULLS FIRST",
+      {0, 1});
+}
+
+TEST_F(OrderByTest, multiBatchResult) {
+  vector_size_t batchSize = 5000;
+  std::vector<RowVectorPtr> vectors;
+  for (int32_t i = 0; i < 10; ++i) {
+    auto c0 = makeFlatVector<int64_t>(
+        batchSize,
+        [&](vector_size_t row) { return batchSize * i + row; },
+        nullEvery(5));
+    auto c1 = makeFlatVector<double>(
+        batchSize, [](vector_size_t row) { return row * 0.1; }, nullEvery(11));
+    vectors.push_back(makeRowVector({c0, c1, c1, c1, c1, c1}));
+  }
+  createDuckDbTable(vectors);
+
+  testSingleKey(vectors, "c0");
+}
+
+TEST_F(OrderByTest, varfields) {
+  vector_size_t batchSize = 1000;
+  std::vector<RowVectorPtr> vectors;
+  for (int32_t i = 0; i < 5; ++i) {
+    auto c0 = makeFlatVector<int64_t>(
+        batchSize,
+        [&](vector_size_t row) { return batchSize * i + row; },
+        nullEvery(5));
+    auto c1 = makeFlatVector<double>(
+        batchSize, [](vector_size_t row) { return row * 0.1; }, nullEvery(11));
+    auto c2 = makeFlatVector<StringView>(
+        batchSize,
+        [](vector_size_t row) {
+          return StringView::makeInline(std::to_string(row));
+        },
+        nullEvery(17));
+    // TODO: Add support for array/map in createDuckDbTable and verify
+    // that we can sort by array/map as well.
+    vectors.push_back(makeRowVector({c0, c1, c2}));
+  }
+  createDuckDbTable(vectors);
+
+  testSingleKey(vectors, "c2");
+}
+
+#if 0
+// flattening for scalar types unsupported in arrow!
+TEST_F(OrderByTest, unknown) {
+  vector_size_t size = 1'000;
+  auto vector = makeRowVector({
+      makeFlatVector<int64_t>(size, [](auto row) { return row % 7; }),
+      BaseVector::createNullConstant(UNKNOWN(), size, pool()),
+  });
+
+  // Exclude "UNKNOWN" column as DuckDB doesn't understand UNKNOWN type
+  createDuckDbTable(
+      {makeRowVector({vector->childAt(0)}),
+       makeRowVector({vector->childAt(0)})});
+
+  core::PlanNodeId orderById;
+  auto plan = PlanBuilder()
+                  .values({vector, vector})
+                  .orderBy({"c0 DESC NULLS LAST"}, false)
+                  .capturePlanNodeId(orderById)
+                  .planNode();
+  runTest(
+      plan,
+      orderById,
+      "SELECT *, null FROM tmp ORDER BY c0 DESC NULLS LAST",
+      {0});
+}
+
+/// Verifies output batch rows of OrderBy
+TEST_F(OrderByTest, outputBatchRows) {
+  struct {
+    int numRowsPerBatch;
+    int preferredOutBatchBytes;
+    int maxOutBatchRows;
+    int expectedOutputVectors;
+
+    // TODO: add output size check with spilling enabled
+    std::string debugString() const {
+      return fmt::format(
+          "numRowsPerBatch:{}, preferredOutBatchBytes:{}, maxOutBatchRows:{}, expectedOutputVectors:{}",
+          numRowsPerBatch,
+          preferredOutBatchBytes,
+          maxOutBatchRows,
+          expectedOutputVectors);
+    }
+  } testSettings[] = {
+      {1024, 1, 100, 1024},
+      // estimated size per row is ~2092, set preferredOutBatchBytes to 20920,
+      // so each batch has 10 rows, so it would return 100 batches
+      {1000, 20920, 100, 100},
+      // same as above, but maxOutBatchRows is 1, so it would return 1000
+      // batches
+      {1000, 20920, 1, 1000}};
+
+  for (const auto& testData : testSettings) {
+    SCOPED_TRACE(testData.debugString());
+    const vector_size_t batchSize = testData.numRowsPerBatch;
+    std::vector<RowVectorPtr> rowVectors;
+    auto c0 = makeFlatVector<int64_t>(
+        batchSize, [&](vector_size_t row) { return row; }, nullEvery(5));
+    auto c1 = makeFlatVector<double>(
+        batchSize, [&](vector_size_t row) { return row; }, nullEvery(11));
+    std::vector<VectorPtr> vectors;
+    vectors.push_back(c0);
+    for (int i = 0; i < 256; ++i) {
+      vectors.push_back(c1);
+    }
+    rowVectors.push_back(makeRowVector(vectors));
+    createDuckDbTable(rowVectors);
+
+    core::PlanNodeId orderById;
+    auto plan = PlanBuilder()
+                    .values(rowVectors)
+                    .orderBy({fmt::format("{} ASC NULLS LAST", "c0")}, false)
+                    .capturePlanNodeId(orderById)
+                    .planNode();
+    auto queryCtx = core::QueryCtx::create(executor_.get());
+    queryCtx->testingOverrideConfigUnsafe(
+        {{core::QueryConfig::kPreferredOutputBatchBytes,
+          std::to_string(testData.preferredOutBatchBytes)},
+         {core::QueryConfig::kMaxOutputBatchRows,
+          std::to_string(testData.maxOutBatchRows)}});
+    CursorParameters params;
+    params.planNode = plan;
+    params.queryCtx = queryCtx;
+    auto task = assertQueryOrdered(
+        params, "SELECT * FROM tmp ORDER BY c0 ASC NULLS LAST", {0});
+    EXPECT_EQ(
+        testData.expectedOutputVectors,
+        toPlanStats(task->taskStats()).at(orderById).outputVectors);
+  }
+}
+#endif
+
+} // namespace
diff --git a/velox/experimental/cudf/vector/CMakeLists.txt b/velox/experimental/cudf/vector/CMakeLists.txt
new file mode 100644
index 00000000000..d26f0b4c7dc
--- /dev/null
+++ b/velox/experimental/cudf/vector/CMakeLists.txt
@@ -0,0 +1,26 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+add_library(velox_cudf_vector CudfVector.cpp)
+
+set_target_properties(
+  velox_cudf_vector
+  PROPERTIES CUDA_ARCHITECTURES native)
+
+target_link_libraries(
+  velox_cudf_vector
+  cudf::cudf
+  velox_exception
+  velox_common_base
+  velox_vector)
diff --git a/velox/experimental/cudf/vector/CudfVector.cpp b/velox/experimental/cudf/vector/CudfVector.cpp
new file mode 100644
index 00000000000..e4a9e845232
--- /dev/null
+++ b/velox/experimental/cudf/vector/CudfVector.cpp
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/experimental/cudf/vector/CudfVector.h"
+
+namespace facebook::velox::cudf_velox {
+
+} // namespace facebook::velox::cudf_velox
diff --git a/velox/experimental/cudf/vector/CudfVector.h b/velox/experimental/cudf/vector/CudfVector.h
new file mode 100644
index 00000000000..8ac20f4a3e1
--- /dev/null
+++ b/velox/experimental/cudf/vector/CudfVector.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "velox/buffer/Buffer.h"
+#include "velox/common/memory/MemoryPool.h"
+#include "velox/vector/ComplexVector.h"
+#include "velox/vector/TypeAliases.h"
+
+#include <cudf/table/table.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <memory>
+#include <utility>
+
+namespace facebook::velox::cudf_velox {
+
+// Vector class which holds GPU data from cuDF.
+class CudfVector : public RowVector {
+ public:
+  CudfVector(
+      velox::memory::MemoryPool* pool,
+      TypePtr type,
+      vector_size_t size,
+      std::unique_ptr<cudf::table>&& table,
+      rmm::cuda_stream_view stream)
+      : RowVector(
+            pool,
+            std::move(type),
+            BufferPtr(nullptr),
+            size,
+            std::vector<VectorPtr>(),
+            std::nullopt),
+        table_{std::move(table)},
+        stream_{stream} {}
+
+  rmm::cuda_stream_view stream() const {
+    return stream_;
+  }
+
+  cudf::table_view getTableView() const {
+    return table_->view();
+  }
+
+  std::unique_ptr<cudf::table>&& release() {
+    return std::move(table_);
+  }
+
+  cudf::table_view getTableView() const {
+    return table_->view();
+  }
+
+ private:
+  std::unique_ptr<cudf::table> table_;
+  rmm::cuda_stream_view stream_;
+};
+
+using CudfVectorPtr = std::shared_ptr<CudfVector>;
+
+} // namespace facebook::velox::cudf_velox

From 2c03cbf7908227e25b5785e06705fee4ac0d2a63 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Tue, 11 Mar 2025 15:04:48 +0000
Subject: [PATCH 02/37] Remove all other operators from ToCudf

---
 velox/experimental/cudf/exec/ToCudf.cpp | 113 +++---------------------
 1 file changed, 11 insertions(+), 102 deletions(-)

diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp
index 03ec29b2c95..cfd8f691cae 100644
--- a/velox/experimental/cudf/exec/ToCudf.cpp
+++ b/velox/experimental/cudf/exec/ToCudf.cpp
@@ -83,64 +83,23 @@ bool CompileState::compile() {
     return *it;
   };
 
-  auto is_filter_project_supported = [](const exec::Operator* op) {
-    if (auto filter_project_op = dynamic_cast<const exec::FilterProject*>(op)) {
-      auto info = filter_project_op->exprsAndProjection();
-      return !info.hasFilter &&
-          ExpressionEvaluator::can_be_evaluated(info.exprs->exprs());
-    }
-    return false;
-  };
-
-  auto is_join_supported = [get_plan_node](const exec::Operator* op) {
-    if (!is_any_of<exec::HashBuild, exec::HashProbe>(op)) {
-      return false;
-    }
-    auto plan_node = std::dynamic_pointer_cast<const core::HashJoinNode>(
-        get_plan_node(op->planNodeId()));
-    if (!plan_node) {
-      return false;
-    }
-    if (!plan_node->isInnerJoin()) {
-      return false;
-    }
-    if (plan_node->filter() != nullptr) {
-      return false;
-    }
-    return true;
+  auto is_supported_gpu_operator = [](const exec::Operator* op) {
+    return is_any_of<exec::OrderBy>(op);
   };
 
-  auto is_supported_gpu_operator =
-      [is_filter_project_supported,
-       is_join_supported](const exec::Operator* op) {
-        return is_any_of<
-                   exec::OrderBy,
-                   exec::HashAggregation,
-                   exec::LocalPartition,
-                   exec::LocalExchange>(op) ||
-            is_filter_project_supported(op) || is_join_supported(op);
-      };
-
   std::vector<bool> is_supported_gpu_operators(operators.size());
   std::transform(
       operators.begin(),
       operators.end(),
       is_supported_gpu_operators.begin(),
       is_supported_gpu_operator);
-  auto accepts_gpu_input = [is_filter_project_supported,
-                            is_join_supported](const exec::Operator* op) {
-    return is_any_of<
-               exec::OrderBy,
-               exec::HashAggregation,
-               exec::LocalPartition>(op) ||
-        is_filter_project_supported(op) || is_join_supported(op);
+
+  auto accepts_gpu_input = [](const exec::Operator* op) {
+    return is_any_of<exec::OrderBy>(op);
   };
-  auto produces_gpu_output = [is_filter_project_supported,
-                              is_join_supported](const exec::Operator* op) {
-    return is_any_of<exec::OrderBy, exec::HashAggregation, exec::LocalExchange>(
-               op) ||
-        is_filter_project_supported(op) ||
-        (is_any_of<exec::HashProbe>(op) && is_join_supported(op));
+
+  auto produces_gpu_output = [](const exec::Operator* op) {
+    return is_any_of<exec::OrderBy>(op);
   };
 
   int32_t operatorsOffset = 0;
@@ -166,28 +125,7 @@ bool CompileState::compile() {
       replace_op.back()->initialize();
     }
 
-    // This is used to denote if the current operator is kept or replaced.
-    auto keep_operator = 0;
-    if (is_join_supported(oper)) {
-      if (auto joinBuildOp = dynamic_cast<exec::HashBuild*>(oper)) {
-        auto plan_node = std::dynamic_pointer_cast<const core::HashJoinNode>(
-            get_plan_node(joinBuildOp->planNodeId()));
-        VELOX_CHECK(plan_node != nullptr);
-        // From-Velox (optional)
-        replace_op.push_back(
-            std::make_unique<CudfHashJoinBuild>(id, ctx, plan_node));
-        replace_op.back()->initialize();
-      } else if (auto joinProbeOp = dynamic_cast<exec::HashProbe*>(oper)) {
-        auto plan_node = std::dynamic_pointer_cast<const core::HashJoinNode>(
-            get_plan_node(joinProbeOp->planNodeId()));
-        VELOX_CHECK(plan_node != nullptr);
-        // From-Velox (optional)
-        replace_op.push_back(
-            std::make_unique<CudfHashJoinProbe>(id, ctx, plan_node));
-        replace_op.back()->initialize();
-        // To-Velox (optional)
-      }
-    } else if (auto orderByOp = dynamic_cast<exec::OrderBy*>(oper)) {
+    if (auto orderByOp = dynamic_cast<exec::OrderBy*>(oper)) {
       auto id = orderByOp->operatorId();
       auto plan_node = std::dynamic_pointer_cast<const core::OrderByNode>(
           get_plan_node(orderByOp->planNodeId()));
@@ -196,34 +134,6 @@ bool CompileState::compile() {
       replace_op.push_back(std::make_unique<CudfOrderBy>(id, ctx, plan_node));
       replace_op.back()->initialize();
       // To-velox (optional)
-    } else if (auto hashAggOp = dynamic_cast<exec::HashAggregation*>(oper)) {
-      auto plan_node = std::dynamic_pointer_cast<const core::AggregationNode>(
-          get_plan_node(hashAggOp->planNodeId()));
-      VELOX_CHECK(plan_node != nullptr);
-      replace_op.push_back(
-          std::make_unique<CudfHashAggregation>(id, ctx, plan_node));
-      replace_op.back()->initialize();
-    } else if (is_filter_project_supported(oper)) {
-      auto filterProjectOp = dynamic_cast<exec::FilterProject*>(oper);
-      auto info = filterProjectOp->exprsAndProjection();
-      auto& id_projections = filterProjectOp->identityProjections();
-      auto plan_node = std::dynamic_pointer_cast<const core::ProjectNode>(
-          get_plan_node(filterProjectOp->planNodeId()));
-      // If filter doesn't exist then project should definitely exist so this
-      // should never hit
-      VELOX_CHECK(plan_node != nullptr);
-      replace_op.push_back(std::make_unique<CudfFilterProject>(
-          id, ctx, info, id_projections, nullptr, plan_node));
-      replace_op.back()->initialize();
-    } else if (
-        auto localPartitionOp = dynamic_cast<exec::LocalPartition*>(oper)) {
-      auto plan_node =
-          std::dynamic_pointer_cast<const core::LocalPartitionNode>(
-              get_plan_node(localPartitionOp->planNodeId()));
-      VELOX_CHECK(plan_node != nullptr);
-      replace_op.push_back(
-          std::make_unique<CudfLocalPartition>(id, ctx, plan_node));
-      replace_op.back()->initialize();
     }
 
     if (next_operator_is_not_gpu and produces_gpu_output(oper)) {
@@ -234,11 +144,10 @@ bool CompileState::compile() {
     }
 
     if (not replace_op.empty()) {
-      operatorsOffset +=
-          replace_op.size() - 1 + keep_operator; // Check this "- 1"
+      operatorsOffset += replace_op.size() - 1;
       [[maybe_unused]] auto replaced = driverFactory_.replaceOperators(
           driver_,
-          replacingOperatorIndex + keep_operator,
+          replacingOperatorIndex,
           replacingOperatorIndex + 1,
           std::move(replace_op));
       replacements_made = true;

From e8461321f2c24c00cdb41d66b628970e09328fb1 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Tue, 11 Mar 2025 19:00:59 +0000
Subject: [PATCH 03/37] Make it compile

---
 velox/experimental/cudf/CMakeLists.txt      |  1 -
 velox/experimental/cudf/exec/ToCudf.cpp     | 10 -------
 velox/experimental/cudf/tests/Main.cpp      | 29 +++++++++++++++++++++
 velox/experimental/cudf/vector/CudfVector.h |  4 ---
 4 files changed, 29 insertions(+), 15 deletions(-)
 create mode 100644 velox/experimental/cudf/tests/Main.cpp

diff --git a/velox/experimental/cudf/CMakeLists.txt b/velox/experimental/cudf/CMakeLists.txt
index 96fcdb0d557..e2be268915c 100644
--- a/velox/experimental/cudf/CMakeLists.txt
+++ b/velox/experimental/cudf/CMakeLists.txt
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 add_subdirectory(exec)
-add_subdirectory(connectors)
 add_subdirectory(vector)
 
 if(VELOX_BUILD_TESTING)
diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp
index cfd8f691cae..c9dd04c9443 100644
--- a/velox/experimental/cudf/exec/ToCudf.cpp
+++ b/velox/experimental/cudf/exec/ToCudf.cpp
@@ -23,12 +23,7 @@
 #include "velox/exec/Operator.h"
 #include "velox/exec/OrderBy.h"
 #include "velox/experimental/cudf/exec/CudfConversion.h"
-#include "velox/experimental/cudf/exec/CudfFilterProject.h"
-#include "velox/experimental/cudf/exec/CudfHashAggregation.h"
-#include "velox/experimental/cudf/exec/CudfHashJoin.h"
-#include "velox/experimental/cudf/exec/CudfLocalPartition.h"
 #include "velox/experimental/cudf/exec/CudfOrderBy.h"
-#include "velox/experimental/cudf/exec/ExpressionEvaluator.h"
 #include "velox/experimental/cudf/exec/Utilities.h"
 
 #include <cudf/detail/nvtx/ranges.hpp>
@@ -232,11 +227,6 @@ void registerCudf() {
   CUDF_FUNC_RANGE();
   cudaFree(0); // to init context.
 
-  if (cudfDebugEnabled()) {
-    std::cout << "Registering CudfHashJoinBridgeTranslator" << std::endl;
-  }
-  exec::Operator::registerOperator(
-      std::make_unique<CudfHashJoinBridgeTranslator>());
   if (cudfDebugEnabled()) {
     std::cout << "Registering cudfDriverAdapter" << std::endl;
   }
diff --git a/velox/experimental/cudf/tests/Main.cpp b/velox/experimental/cudf/tests/Main.cpp
new file mode 100644
index 00000000000..164b6422fe8
--- /dev/null
+++ b/velox/experimental/cudf/tests/Main.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/common/process/ThreadDebugInfo.h"
+
+#include <folly/Unit.h>
+#include <folly/init/Init.h>
+#include <gtest/gtest.h>
+
+// This main is needed for some tests on linux.
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  // Signal handler required for ThreadDebugInfoTest
+  facebook::velox::process::addDefaultFatalSignalHandler();
+  folly::Init init(&argc, &argv, false);
+  return RUN_ALL_TESTS();
+}
diff --git a/velox/experimental/cudf/vector/CudfVector.h b/velox/experimental/cudf/vector/CudfVector.h
index 8ac20f4a3e1..db1590b3c08 100644
--- a/velox/experimental/cudf/vector/CudfVector.h
+++ b/velox/experimental/cudf/vector/CudfVector.h
@@ -59,10 +59,6 @@ class CudfVector : public RowVector {
     return std::move(table_);
   }
 
-  cudf::table_view getTableView() const {
-    return table_->view();
-  }
-
  private:
   std::unique_ptr<cudf::table> table_;
   rmm::cuda_stream_view stream_;

From 5ee96b3ffd5044c2eec519e57e36abeb4078b4aa Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Tue, 11 Mar 2025 19:43:13 +0000
Subject: [PATCH 04/37] remove unused interop code

---
 .../cudf/exec/VeloxCudfInterop.cpp            | 324 ------------------
 .../experimental/cudf/exec/VeloxCudfInterop.h |  23 +-
 2 files changed, 2 insertions(+), 345 deletions(-)

diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
index 3fd2b046787..bc906ef1b4a 100644
--- a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
+++ b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
@@ -53,330 +53,6 @@
 
 namespace facebook::velox::cudf_velox {
 
-namespace {
-
-template <typename Value, typename Functor, typename... Ts>
-constexpr decltype(auto)
-vector_encoding_dispatcher(VectorPtr vec, Functor f, Ts&&... args) {
-  using facebook::velox::VectorEncoding::Simple;
-  switch (vec->encoding()) {
-    case Simple::FLAT:
-      return f(vec->as<FlatVector<Value>>(), std::forward<Ts>(args)...);
-    case Simple::DICTIONARY:
-      return f(vec->as<DictionaryVector<Value>>(), std::forward<Ts>(args)...);
-    default: {
-      if (cudfDebugEnabled()) {
-        std::cout << "Unsupported Velox encoding: " << vec->encoding()
-                  << std::endl;
-      }
-      CUDF_FAIL("Unsupported Velox encoding");
-    }
-  }
-}
-
-// TODO: dispatch other duration/timestamp types!
-template <typename T>
-using cudf_storage_type_t = std::conditional_t<
-    std::is_same_v<cudf::timestamp_D, T>,
-    cudf::timestamp_D::rep,
-    cudf::device_storage_type_t<T>>;
-
-} // namespace
-
-cudf::type_id velox_to_cudf_type_id(const TypePtr& type) {
-  if (cudfDebugEnabled()) {
-    std::cout << "Converting Velox type " << type->toString() << " to cudf"
-              << std::endl;
-  }
-  switch (type->kind()) {
-    case TypeKind::BOOLEAN:
-      return cudf::type_id::BOOL8;
-    case TypeKind::TINYINT:
-      return cudf::type_id::INT8;
-    case TypeKind::SMALLINT:
-      return cudf::type_id::INT16;
-    case TypeKind::INTEGER:
-      // TODO: handle interval types (durations?)
-      // if (type->isIntervalYearMonth()) {
-      //   return cudf::type_id::...;
-      // }
-      if (type->isDate()) {
-        return cudf::type_id::TIMESTAMP_DAYS;
-      }
-      return cudf::type_id::INT32;
-    case TypeKind::BIGINT:
-      return cudf::type_id::INT64;
-    case TypeKind::REAL:
-      return cudf::type_id::FLOAT32;
-    case TypeKind::DOUBLE:
-      return cudf::type_id::FLOAT64;
-    case TypeKind::VARCHAR:
-      return cudf::type_id::STRING;
-    case TypeKind::VARBINARY:
-      return cudf::type_id::STRING;
-    case TypeKind::TIMESTAMP:
-      return cudf::type_id::TIMESTAMP_NANOSECONDS;
-    // case TypeKind::HUGEINT: return cudf::type_id::DURATION_DAYS;
-    // TODO: DATE was converted to a logical type:
-    // https://github.com/facebookincubator/velox/commit/e480f5c03a6c47897ef4488bd56918a89719f908
-    // case TypeKind::DATE: return cudf::type_id::DURATION_DAYS;
-    // case TypeKind::INTERVAL_DAY_TIME: return cudf::type_id::EMPTY;
-    // TODO: Decimals are now logical types:
-    // https://github.com/facebookincubator/velox/commit/73d2f935b55f084d30557c7be94b9768efb8e56f
-    // case TypeKind::SHORT_DECIMAL: return cudf::type_id::DECIMAL64;
-    // case TypeKind::LONG_DECIMAL: return cudf::type_id::DECIMAL128;
-    // case TypeKind::ARRAY: return cudf::type_id::EMPTY;
-    // case TypeKind::MAP: return cudf::type_id::EMPTY;
-    case TypeKind::ROW:
-      return cudf::type_id::STRUCT;
-    // case TypeKind::UNKNOWN: return cudf::type_id::EMPTY;
-    // case TypeKind::FUNCTION: return cudf::type_id::EMPTY;
-    // case TypeKind::OPAQUE: return cudf::type_id::EMPTY;
-    // case TypeKind::INVALID: return cudf::type_id::EMPTY;
-    default:
-      CUDF_FAIL("Unsupported Velox type");
-      return cudf::type_id::EMPTY;
-  }
-}
-
-TypePtr cudf_type_id_to_velox_type(cudf::type_id type_id) {
-  switch (type_id) {
-    case cudf::type_id::BOOL8:
-      return BOOLEAN();
-    case cudf::type_id::INT8:
-      return TINYINT();
-    case cudf::type_id::INT16:
-      return SMALLINT();
-    case cudf::type_id::INT32:
-      return INTEGER();
-    case cudf::type_id::INT64:
-      return BIGINT();
-    case cudf::type_id::FLOAT32:
-      return REAL();
-    case cudf::type_id::FLOAT64:
-      return DOUBLE();
-    case cudf::type_id::STRING:
-      return VARCHAR();
-    case cudf::type_id::TIMESTAMP_DAYS:
-      return DATE();
-    case cudf::type_id::TIMESTAMP_NANOSECONDS:
-      return TIMESTAMP();
-    // TODO: DATE is now a logical type
-    // case cudf::type_id::DURATION_DAYS: return ???;
-    // case cudf::type_id::EMPTY: return TypeKind::INTERVAL_DAY_TIME;
-    // TODO: DECIMAL is now a logical type
-    // case cudf::type_id::DECIMAL64: return TypeKind::SHORT_DECIMAL;
-    // case cudf::type_id::DECIMAL128: return TypeKind::LONG_DECIMAL;
-    // case cudf::type_id::EMPTY: return TypeKind::ARRAY;
-    // case cudf::type_id::EMPTY: return TypeKind::MAP;
-    // case cudf::type_id::STRUCT:
-    //   // TODO: Need parametric type support?
-    //   return ROW();
-    // case cudf::type_id::EMPTY: return TypeKind::OPAQUE;
-    // case cudf::type_id::EMPTY: return TypeKind::UNKNOWN;
-    default:
-      return UNKNOWN();
-  }
-}
-
-// Convert a Velox vector to a CUDF column
-struct copy_to_device {
-  rmm::cuda_stream_view stream;
-
-  // Fixed width types
-  template <
-      typename T,
-      std::enable_if_t<cudf::is_rep_layout_compatible<T>()>* = nullptr>
-  std::unique_ptr<cudf::column> operator()(VectorPtr const& h_vec) const {
-    VELOX_CHECK_NOT_NULL(h_vec);
-    using velox_T = cudf_storage_type_t<T>;
-    if (cudfDebugEnabled()) {
-      std::cout << "Converting fixed width column" << std::endl;
-      std::cout << "Encoding: " << h_vec->encoding() << std::endl;
-      std::cout << "Type: " << h_vec->type()->toString() << std::endl;
-      std::cout << "velox_T: " << typeid(velox_T{}).name() << std::endl;
-    }
-    auto velox_data = h_vec->as<FlatVector<velox_T>>();
-    VELOX_CHECK_NOT_NULL(velox_data);
-    auto velox_data_ptr = velox_data->rawValues();
-    cudf::host_span<const velox_T> velox_host_span(
-        velox_data_ptr, int{h_vec->size()});
-    auto d_v = cudf::detail::make_device_uvector_sync(
-        velox_host_span, stream, rmm::mr::get_current_device_resource());
-    return std::make_unique<cudf::column>(
-        std::move(d_v), rmm::device_buffer{}, 0);
-  }
-
-  // Strings
-  template <
-      typename T,
-      std::enable_if_t<std::is_same_v<T, cudf::string_view>>* = nullptr>
-  std::unique_ptr<cudf::column> operator()(VectorPtr const& h_vec) const {
-    if (cudfDebugEnabled()) {
-      std::cout << "Converting string column" << std::endl;
-    }
-
-    auto const num_rows = h_vec->size();
-    auto h_offsets = std::vector<int64_t>(num_rows + 1);
-    h_offsets[0] = 0;
-    auto make_offsets = [&](auto const& vec) {
-      VELOX_CHECK_NOT_NULL(vec);
-      if (cudfDebugEnabled()) {
-        std::cout << "Starting offset calculation" << std::endl;
-      }
-      for (auto i = 0; i < num_rows; i++) {
-        h_offsets[i + 1] = h_offsets[i] + vec->valueAt(i).size();
-      }
-    };
-    vector_encoding_dispatcher<StringView>(h_vec, make_offsets);
-
-    auto d_offsets = cudf::detail::make_device_uvector_sync(
-        h_offsets, stream, rmm::mr::get_current_device_resource());
-
-    auto chars_size = h_offsets[num_rows];
-    auto h_chars = std::vector<char>(chars_size);
-
-    auto make_chars = [&](auto vec) {
-      VELOX_CHECK_NOT_NULL(vec);
-      for (auto i = 0; i < num_rows; i++) {
-        auto const string_view = vec->valueAt(i);
-        auto const size = string_view.size();
-        auto const offset = h_offsets[i];
-        std::copy(
-            string_view.data(),
-            string_view.data() + size,
-            h_chars.begin() + offset);
-      }
-    };
-    vector_encoding_dispatcher<StringView>(h_vec, make_chars);
-
-    auto d_chars = cudf::detail::make_device_uvector_sync(
-        h_chars, stream, rmm::mr::get_current_device_resource());
-
-    return cudf::make_strings_column(
-        num_rows,
-        std::make_unique<cudf::column>(
-            std::move(d_offsets), rmm::device_buffer{}, 0),
-        d_chars.release(),
-        0,
-        rmm::device_buffer{});
-  }
-
-  template <
-      typename T,
-      typename... Args,
-      std::enable_if_t<
-          not(cudf::is_rep_layout_compatible<T>() or
-              std::is_same_v<T, cudf::string_view>)>* = nullptr>
-  std::unique_ptr<cudf::column> operator()(VectorPtr const& h_vec) const {
-    if (cudfDebugEnabled()) {
-      std::string error_message = "Unsupported type for to_cudf conversion: ";
-      error_message += h_vec->type()->toString();
-      std::cout << error_message << std::endl;
-    }
-    CUDF_FAIL("Unsupported type for to_cudf conversion");
-  }
-};
-
-// Row vector to table
-// Vector to column
-// template<bool is_table=true>
-std::unique_ptr<cudf::table> to_cudf_table(const RowVectorPtr& leftBatch) {
-  VELOX_NVTX_FUNC_RANGE();
-  // cudf type dispatcher to copy data from velox vector to cudf column
-  using cudf_col_ptr = std::unique_ptr<cudf::column>;
-  std::vector<cudf_col_ptr> cudf_columns;
-  auto copier = copy_to_device{cudf::get_default_stream()};
-  for (auto const& h_vec : leftBatch->children()) {
-    auto cudf_kind = cudf::data_type{velox_to_cudf_type_id(h_vec->type())};
-    auto cudf_column = cudf::type_dispatcher(cudf_kind, copier, h_vec);
-    cudf_columns.push_back(std::move(cudf_column));
-  }
-  return std::make_unique<cudf::table>(std::move(cudf_columns));
-}
-
-// Convert a CUDF column to a Velox vector
-struct copy_to_host {
-  rmm::cuda_stream_view stream;
-  memory::MemoryPool* pool_;
-
-  template <typename T>
-  static constexpr bool is_supported() {
-    // return cudf::is_rep_layout_compatible<T>();
-    return cudf::is_numeric<T>() and not std::is_same<T, bool>::value;
-  }
-
-  // Fixed width types
-  template <typename T, std::enable_if_t<is_supported<T>()>* = nullptr>
-  VectorPtr operator()(TypePtr velox_type, cudf::column_view const& col) const {
-    auto velox_buffer = AlignedBuffer::allocate<T>(col.size(), pool_);
-    auto velox_col = std::make_shared<FlatVector<T>>(
-        pool_,
-        velox_type,
-        nullptr,
-        col.size(),
-        velox_buffer,
-        std::vector<BufferPtr>{});
-    auto velox_data_ptr = velox_col->mutableRawValues();
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
-        velox_data_ptr,
-        col.data<T>(),
-        col.size() * sizeof(T),
-        cudaMemcpyDefault,
-        stream.value()));
-    stream.synchronize();
-    return velox_col;
-  }
-
-  template <
-      typename T,
-      typename... Args,
-      std::enable_if_t<not is_supported<T>()>* = nullptr>
-  VectorPtr operator()(Args... args) const {
-    CUDF_FAIL("Unsupported type for to_velox conversion");
-  }
-};
-
-VectorPtr to_velox_column(
-    const cudf::column_view& col,
-    memory::MemoryPool* pool) {
-  VELOX_NVTX_PRETTY_FUNC_RANGE();
-  auto velox_type = cudf_type_id_to_velox_type(col.type().id());
-  if (cudfDebugEnabled()) {
-    std::cout << "Converting to_velox_column: " << velox_type->toString()
-              << std::endl;
-  }
-  // cudf type dispatcher to copy data from cudf column to velox vector
-  auto copier = copy_to_host{cudf::get_default_stream(), pool};
-  return cudf::type_dispatcher(col.type(), copier, velox_type, col);
-}
-
-RowVectorPtr to_velox_column(
-    const cudf::table_view& table,
-    memory::MemoryPool* pool,
-    std::string name_prefix) {
-  VELOX_NVTX_PRETTY_FUNC_RANGE();
-  std::vector<VectorPtr> children;
-  std::vector<std::string> childNames;
-  std::vector<std::shared_ptr<const Type>> childTypes;
-  children.reserve(table.num_columns());
-  childNames.reserve(table.num_columns());
-  for (auto& col : table) {
-    children.push_back(to_velox_column(col, pool));
-    childNames.push_back(name_prefix + std::to_string(childNames.size()));
-  }
-
-  childTypes.reserve(children.size());
-  for (const auto& child : children) {
-    childTypes.push_back(child->type());
-  }
-  auto rowType = ROW(std::move(childNames), std::move(childTypes));
-  const size_t vectorSize = children.empty() ? 0 : children.front()->size();
-
-  return std::make_shared<RowVector>(
-      pool, rowType, BufferPtr(nullptr), vectorSize, children);
-}
-
 namespace with_arrow {
 
 std::unique_ptr<cudf::table> to_cudf_table(
diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.h b/velox/experimental/cudf/exec/VeloxCudfInterop.h
index 4bd88c1a125..fbb4eda355e 100644
--- a/velox/experimental/cudf/exec/VeloxCudfInterop.h
+++ b/velox/experimental/cudf/exec/VeloxCudfInterop.h
@@ -24,25 +24,7 @@
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
 
-namespace facebook::velox::cudf_velox {
-
-cudf::type_id velox_to_cudf_type_id(const TypePtr& type);
-TypePtr cudf_type_id_to_velox_type(cudf::type_id type_id);
-
-[[deprecated(
-    "Use with_arrow::to_cudf_table instead")]] std::unique_ptr<cudf::table>
-to_cudf_table(const facebook::velox::RowVectorPtr& leftBatch);
-facebook::velox::VectorPtr to_velox_column(
-    const cudf::column_view& col,
-    facebook::velox::memory::MemoryPool* pool);
-[[deprecated(
-    "Use with_arrow::to_velox_column instead")]] facebook::velox::RowVectorPtr
-to_velox_column(
-    const cudf::table_view& table,
-    facebook::velox::memory::MemoryPool* pool,
-    std::string name_prefix = "c");
-
-namespace with_arrow {
+namespace facebook::velox::cudf_velox::with_arrow {
 std::unique_ptr<cudf::table> to_cudf_table(
     const facebook::velox::RowVectorPtr& veloxTable,
     facebook::velox::memory::MemoryPool* pool,
@@ -59,6 +41,5 @@ facebook::velox::RowVectorPtr to_velox_column(
     facebook::velox::memory::MemoryPool* pool,
     const std::vector<std::string>& columnNames,
     rmm::cuda_stream_view stream);
-} // namespace with_arrow
 
-} // namespace facebook::velox::cudf_velox
+} // namespace facebook::velox::cudf_velox::with_arrow

From 46f0a94d600a7e5fef96106c963ba6a503b4d9aa Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Wed, 12 Mar 2025 17:07:07 +0000
Subject: [PATCH 05/37] Pin rmm and kvikio

---
 CMake/resolve_dependency_modules/cudf.cmake | 29 +++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/CMake/resolve_dependency_modules/cudf.cmake b/CMake/resolve_dependency_modules/cudf.cmake
index a6abdb61254..0c57dbdcf01 100644
--- a/CMake/resolve_dependency_modules/cudf.cmake
+++ b/CMake/resolve_dependency_modules/cudf.cmake
@@ -14,6 +14,22 @@
 
 include_guard(GLOBAL)
 
+set(VELOX_rmm_VERSION 25.04)
+set(VELOX_rmm_BUILD_SHA256_CHECKSUM
+    294905094213a2d1fd8e024500359ff871bc52f913a3fbaca3514727c49f62de)
+set(VELOX_rmm_SOURCE_URL
+    "https://github.com/rapidsai/rmm/archive/d8b7dacdeda302d2e37313c02d14ef5e1d1e98ea.tar.gz"
+)
+velox_resolve_dependency_url(rmm)
+
+set(VELOX_kvikio_VERSION 25.04)
+set(VELOX_kvikio_BUILD_SHA256_CHECKSUM
+    4a0b15295d0a397433930bf9a309e4ad2361b25dc7a7b3e6a35d0c9419d0cb62)
+set(VELOX_kvikio_SOURCE_URL
+    "https://github.com/rapidsai/kvikio/archive/5c710f37236bda76e447e929e17b1efbc6c632c3.tar.gz"
+)
+velox_resolve_dependency_url(kvikio)
+
 set(VELOX_cudf_VERSION 25.04)
 set(VELOX_cudf_BUILD_SHA256_CHECKSUM
     e5a1900dfaf23dab2c5808afa17a2d04fa867d2892ecec1cb37908f3b73715c2)
@@ -36,6 +52,19 @@ string(
   APPEND CMAKE_CXX_FLAGS
   " -Wno-non-virtual-dtor -Wno-missing-field-initializers -Wno-deprecated-copy")
 
+FetchContent_Declare(
+  rmm
+  URL ${VELOX_rmm_SOURCE_URL}
+  URL_HASH ${VELOX_rmm_BUILD_SHA256_CHECKSUM}
+  UPDATE_DISCONNECTED 1)
+
+FetchContent_Declare(
+  kvikio
+  URL ${VELOX_kvikio_SOURCE_URL}
+  URL_HASH ${VELOX_kvikio_BUILD_SHA256_CHECKSUM}
+  SOURCE_SUBDIR cpp
+  UPDATE_DISCONNECTED 1)
+
 FetchContent_Declare(
   cudf
   URL ${VELOX_cudf_SOURCE_URL}

From 47913ae957634e7a767c7ace145a91957b8261f4 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Wed, 12 Mar 2025 18:31:58 +0000
Subject: [PATCH 06/37] Remove manually adding fmt

---
 CMakeLists.txt | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 173a40fe1d3..80c13aa430a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -378,11 +378,6 @@ endif()
 
 message("FINAL CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}")
 
-if(NOT TARGET fmt::fmt)
-  velox_set_source(fmt)
-  velox_resolve_dependency(fmt 9.0.0)
-endif()
-
 if(VELOX_ENABLE_GPU)
   enable_language(CUDA)
   # Determine CUDA_ARCHITECTURES automatically.

From 5effbf2099f62ac127ee6a9e4afe5aac766ecc54 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Wed, 12 Mar 2025 18:32:52 +0000
Subject: [PATCH 07/37] Remove debug prints

---
 .../experimental/cudf/exec/CudfConversion.cpp | 13 -----
 velox/experimental/cudf/exec/CudfOrderBy.cpp  |  9 ----
 velox/experimental/cudf/exec/ToCudf.cpp       | 53 -------------------
 velox/experimental/cudf/exec/Utilities.cpp    |  5 --
 velox/experimental/cudf/exec/Utilities.h      |  6 ---
 5 files changed, 86 deletions(-)

diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp
index 68b3ac283de..e9d6b161812 100644
--- a/velox/experimental/cudf/exec/CudfConversion.cpp
+++ b/velox/experimental/cudf/exec/CudfConversion.cpp
@@ -121,13 +121,6 @@ RowVectorPtr CudfFromVelox::getOutput() {
 
   VELOX_CHECK_NOT_NULL(tbl);
 
-  if (cudfDebugEnabled()) {
-    std::cout << "CudfFromVelox table number of columns: " << tbl->num_columns()
-              << std::endl;
-    std::cout << "CudfFromVelox table number of rows: " << tbl->num_rows()
-              << std::endl;
-  }
-
   // Return a CudfVector that owns the cudf table
   auto const size = tbl->num_rows();
   return std::make_shared<CudfVector>(
@@ -173,12 +166,6 @@ RowVectorPtr CudfToVelox::getOutput() {
   inputs_.pop_front();
 
   VELOX_CHECK_NOT_NULL(tbl);
-  if (cudfDebugEnabled()) {
-    std::cout << "CudfToVelox table number of columns: " << tbl->num_columns()
-              << std::endl;
-    std::cout << "CudfToVelox table number of rows: " << tbl->num_rows()
-              << std::endl;
-  }
   if (tbl->num_rows() == 0) {
     return nullptr;
   }
diff --git a/velox/experimental/cudf/exec/CudfOrderBy.cpp b/velox/experimental/cudf/exec/CudfOrderBy.cpp
index ac32cc8e23f..ad897d3b68f 100644
--- a/velox/experimental/cudf/exec/CudfOrderBy.cpp
+++ b/velox/experimental/cudf/exec/CudfOrderBy.cpp
@@ -61,9 +61,6 @@ CudfOrderBy::CudfOrderBy(
             ? cudf::null_order::BEFORE
             : cudf::null_order::AFTER);
   }
-  if (cudfDebugEnabled()) {
-    std::cout << "Number of Sort keys: " << sort_keys_.size() << std::endl;
-  }
 }
 
 void CudfOrderBy::addInput(RowVectorPtr input) {
@@ -94,12 +91,6 @@ void CudfOrderBy::noMoreInput() {
   inputs_.clear();
 
   VELOX_CHECK_NOT_NULL(tbl);
-  if (cudfDebugEnabled()) {
-    std::cout << "Sort input table number of columns: " << tbl->num_columns()
-              << std::endl;
-    std::cout << "Sort input table number of rows: " << tbl->num_rows()
-              << std::endl;
-  }
 
   auto keys = tbl->view().select(sort_keys_);
   auto values = tbl->view();
diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp
index c9dd04c9443..2741ee246aa 100644
--- a/velox/experimental/cudf/exec/ToCudf.cpp
+++ b/velox/experimental/cudf/exec/ToCudf.cpp
@@ -42,25 +42,9 @@ bool is_any_of(const Base* p) {
 static bool _cudfIsRegistered = false;
 
 bool CompileState::compile() {
-  if (cudfDebugEnabled()) {
-    std::cout << "Calling cudfDriverAdapter" << std::endl;
-  }
-
   auto operators = driver_.operators();
   auto& nodes = planNodes_;
 
-  if (cudfDebugEnabled()) {
-    std::cout << "Number of operators: " << operators.size() << std::endl;
-    for (auto& op : operators) {
-      std::cout << "  Operator: ID " << op->operatorId() << ": "
-                << op->toString() << std::endl;
-    }
-    std::cout << "Number of plan nodes: " << nodes.size() << std::endl;
-    for (auto& node : nodes) {
-      std::cout << "  Plan node: ID " << node->id() << ": " << node->toString();
-    }
-  }
-
   // Make sure operator states are initialized.  We will need to inspect some of
   // them during the transformation.
   driver_.initializeOperators();
@@ -149,14 +133,6 @@ bool CompileState::compile() {
     }
   }
 
-  if (cudfDebugEnabled()) {
-    operators = driver_.operators();
-    std::cout << "Number of new operators: " << operators.size() << std::endl;
-    for (auto& op : operators) {
-      std::cout << "  Operator: ID " << op->operatorId() << ": "
-                << op->toString() << std::endl;
-    }
-  }
   return replacements_made;
 }
 
@@ -167,30 +143,14 @@ struct cudfDriverAdapter {
 
   cudfDriverAdapter(std::shared_ptr<rmm::mr::device_memory_resource> mr)
       : mr_(mr) {
-    if (cudfDebugEnabled()) {
-      std::cout << "cudfDriverAdapter constructor" << std::endl;
-    }
     planNodes_ =
         std::make_shared<std::vector<std::shared_ptr<core::PlanNode const>>>();
   }
 
-  ~cudfDriverAdapter() {
-    if (cudfDebugEnabled()) {
-      std::cout << "cudfDriverAdapter destructor" << std::endl;
-      printf(
-          "cached planNodes_ %p, %ld\n",
-          planNodes_.get(),
-          planNodes_.use_count());
-    }
-  }
-
   // Call operator needed by DriverAdapter
   bool operator()(const exec::DriverFactory& factory, exec::Driver& driver) {
     auto state = CompileState(factory, driver, *planNodes_);
     // Stored planNodes_ from inspect.
-    if (cudfDebugEnabled()) {
-      printf("driver.planNodes_=%p\n", planNodes_.get());
-    }
     auto res = state.compile();
     return res;
   }
@@ -209,9 +169,6 @@ struct cudfDriverAdapter {
     // signature: std::function<void(const core::PlanFragment&)> inspect;
     // call: adapter.inspect(planFragment);
     planNodes_->clear();
-    if (cudfDebugEnabled()) {
-      std::cout << "Inspecting PlanFragment" << std::endl;
-    }
     if (planNodes_) {
       storePlanNodes(planFragment.planNode);
     }
@@ -227,15 +184,8 @@ void registerCudf() {
   CUDF_FUNC_RANGE();
   cudaFree(0); // to init context.
 
-  if (cudfDebugEnabled()) {
-    std::cout << "Registering cudfDriverAdapter" << std::endl;
-  }
-
   const char* env_cudf_mr = std::getenv("VELOX_CUDF_MEMORY_RESOURCE");
   auto mr_mode = env_cudf_mr != nullptr ? env_cudf_mr : "async";
-  if (cudfDebugEnabled()) {
-    std::cout << "Setting cuDF memory resource to " << mr_mode << std::endl;
-  }
   auto mr = cudf_velox::create_memory_resource(mr_mode);
   cudf::set_current_device_resource(mr.get());
   cudfDriverAdapter cda{mr};
@@ -245,9 +195,6 @@ void registerCudf() {
 }
 
 void unregisterCudf() {
-  if (cudfDebugEnabled()) {
-    std::cout << "Unregistering cudfDriverAdapter" << std::endl;
-  }
   exec::DriverFactory::adapters.clear();
   _cudfIsRegistered = false;
 }
diff --git a/velox/experimental/cudf/exec/Utilities.cpp b/velox/experimental/cudf/exec/Utilities.cpp
index f12b08a341c..3cd3d4d5b22 100644
--- a/velox/experimental/cudf/exec/Utilities.cpp
+++ b/velox/experimental/cudf/exec/Utilities.cpp
@@ -90,11 +90,6 @@ cudf::detail::cuda_stream_pool& cudfGlobalStreamPool() {
   return cudf::detail::global_cuda_stream_pool();
 };
 
-bool cudfDebugEnabled() {
-  const char* env_cudf_debug = std::getenv("VELOX_CUDF_DEBUG");
-  return env_cudf_debug != nullptr && std::stoi(env_cudf_debug);
-}
-
 std::unique_ptr<cudf::table> concatenateTables(
     std::vector<std::unique_ptr<cudf::table>> tables,
     rmm::cuda_stream_view stream) {
diff --git a/velox/experimental/cudf/exec/Utilities.h b/velox/experimental/cudf/exec/Utilities.h
index 87a496a8cff..73a88490d0f 100644
--- a/velox/experimental/cudf/exec/Utilities.h
+++ b/velox/experimental/cudf/exec/Utilities.h
@@ -38,12 +38,6 @@ create_memory_resource(std::string_view mode);
  */
 [[nodiscard]] cudf::detail::cuda_stream_pool& cudfGlobalStreamPool();
 
-/**
- * @brief Returns true if the VELOX_CUDF_DEBUG environment variable is set to a
- * nonzero value.
- */
-bool cudfDebugEnabled();
-
 // Concatenate a vector of cuDF tables into a single table
 std::unique_ptr<cudf::table> concatenateTables(
     std::vector<std::unique_ptr<cudf::table>> tables,

From 0e43fc172dbc2b49f9812e47189e401555ce6681 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Wed, 12 Mar 2025 18:45:13 +0000
Subject: [PATCH 08/37] Re-enable some warnings

---
 CMakeLists.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 80c13aa430a..d7966304c1b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -369,8 +369,6 @@ if(ENABLE_ALL_WARNINGS)
        -Wno-unused-parameter \
        -Wno-sign-compare \
        -Wno-ignored-qualifiers \
-       -Wno-missing-field-initializers \
-       -Wno-deprecated-copy \
        ${KNOWN_COMPILER_SPECIFIC_WARNINGS}")
 
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra ${KNOWN_WARNINGS}")

From 9dc09e1f117597f37f43ef112ced05f2d44d85e6 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Thu, 13 Mar 2025 07:35:59 +0000
Subject: [PATCH 09/37] update cmake on centos

---
 scripts/setup-centos9.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/setup-centos9.sh b/scripts/setup-centos9.sh
index 2ce9f869317..6c72f96c9ec 100755
--- a/scripts/setup-centos9.sh
+++ b/scripts/setup-centos9.sh
@@ -68,7 +68,7 @@ function install_build_prerequisites {
   dnf_install ninja-build cmake ccache gcc-toolset-12 git wget which
   dnf_install autoconf automake python3-devel pip libtool
 
-  pip install cmake==3.28.3
+  pip install cmake==3.30.4
 
   if [[ ${USE_CLANG} != "false" ]]; then
     install_clang15

From e0ccfbdc6bb63534c0610e234c82eb7b20d0071b Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Thu, 13 Mar 2025 07:38:09 +0000
Subject: [PATCH 10/37] Add our team to codeowners

---
 .github/CODEOWNERS | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index c24be384421..649e3694e27 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -32,6 +32,9 @@ scripts/ @assignUser @majetideepak
 # Breeze 
 velox/experimental/breeze @dreveman
 
+# cuDF
+velox/experimental/cudf @bdice @karthikeyann @devavret
+
 # Parquet
 velox/dwio/parquet/ @majetideepak
 

From d00db4f4aacb5da3cecd5dea4143952e89a88564 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Thu, 13 Mar 2025 07:46:33 +0000
Subject: [PATCH 11/37] Check off some todos

---
 velox/experimental/cudf/exec/CudfConversion.cpp | 4 ++--
 velox/experimental/cudf/exec/CudfOrderBy.cpp    | 3 ---
 velox/experimental/cudf/exec/CudfOrderBy.h      | 1 -
 3 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp
index e9d6b161812..9b4f02ad62e 100644
--- a/velox/experimental/cudf/exec/CudfConversion.cpp
+++ b/velox/experimental/cudf/exec/CudfConversion.cpp
@@ -130,6 +130,7 @@ RowVectorPtr CudfFromVelox::getOutput() {
 void CudfFromVelox::close() {
   cudf::get_default_stream().synchronize();
   exec::Operator::close();
+  inputs_.clear();
 }
 
 CudfToVelox::CudfToVelox(
@@ -179,8 +180,7 @@ RowVectorPtr CudfToVelox::getOutput() {
 
 void CudfToVelox::close() {
   exec::Operator::close();
-  // TODO: Release stored inputs if needed
-  // TODO: Release cudf memory resources
+  inputs_.clear();
 }
 
 } // namespace facebook::velox::cudf_velox
diff --git a/velox/experimental/cudf/exec/CudfOrderBy.cpp b/velox/experimental/cudf/exec/CudfOrderBy.cpp
index ad897d3b68f..3667a8b8000 100644
--- a/velox/experimental/cudf/exec/CudfOrderBy.cpp
+++ b/velox/experimental/cudf/exec/CudfOrderBy.cpp
@@ -41,7 +41,6 @@ CudfOrderBy::CudfOrderBy(
           "CudfOrderBy"),
       NvtxHelper(nvtx3::rgb{64, 224, 208}, operatorId), // Turquoise
       orderByNode_(orderByNode) {
-  maxOutputRows_ = outputBatchRows(std::nullopt);
   sort_keys_.reserve(orderByNode->sortingKeys().size());
   column_order_.reserve(orderByNode->sortingKeys().size());
   null_order_.reserve(orderByNode->sortingKeys().size());
@@ -74,8 +73,6 @@ void CudfOrderBy::addInput(RowVectorPtr input) {
 
 void CudfOrderBy::noMoreInput() {
   exec::Operator::noMoreInput();
-  // TODO: Get total row count, batch output
-  // maxOutputRows_ = outputBatchRows(total_row_count);
 
   VELOX_NVTX_OPERATOR_FUNC_RANGE();
 
diff --git a/velox/experimental/cudf/exec/CudfOrderBy.h b/velox/experimental/cudf/exec/CudfOrderBy.h
index 28c89cec8e4..1583494e2f3 100644
--- a/velox/experimental/cudf/exec/CudfOrderBy.h
+++ b/velox/experimental/cudf/exec/CudfOrderBy.h
@@ -63,7 +63,6 @@ class CudfOrderBy : public exec::Operator, public NvtxHelper {
   std::vector<cudf::order> column_order_;
   std::vector<cudf::null_order> null_order_;
   bool finished_{false};
-  uint32_t maxOutputRows_;
 };
 
 } // namespace facebook::velox::cudf_velox

From 5accb509db8d08c2be56979615bdac39a10928c2 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Thu, 13 Mar 2025 09:12:35 +0000
Subject: [PATCH 12/37] remove commented tests

---
 velox/experimental/cudf/tests/OrderByTest.cpp | 96 -------------------
 1 file changed, 96 deletions(-)

diff --git a/velox/experimental/cudf/tests/OrderByTest.cpp b/velox/experimental/cudf/tests/OrderByTest.cpp
index ff47198a3a8..86ad291a3ff 100644
--- a/velox/experimental/cudf/tests/OrderByTest.cpp
+++ b/velox/experimental/cudf/tests/OrderByTest.cpp
@@ -31,7 +31,6 @@ using namespace facebook::velox::exec;
 using namespace facebook::velox::exec::test;
 using namespace facebook::velox::common::testutil;
 
-using facebook::velox::test::BatchMaker;
 namespace {
 
 class OrderByTest : public OperatorTestBase {
@@ -312,8 +311,6 @@ TEST_F(OrderByTest, varfields) {
           return StringView::makeInline(std::to_string(row));
         },
         nullEvery(17));
-    // TODO: Add support for array/map in createDuckDbTable and verify
-    // that we can sort by array/map as well.
     vectors.push_back(makeRowVector({c0, c1, c2}));
   }
   createDuckDbTable(vectors);
@@ -321,97 +318,4 @@ TEST_F(OrderByTest, varfields) {
   testSingleKey(vectors, "c2");
 }
 
-#if 0
-// flattening for scalar types unsupported in arrow!
-TEST_F(OrderByTest, unknown) {
-  vector_size_t size = 1'000;
-  auto vector = makeRowVector({
-      makeFlatVector<int64_t>(size, [](auto row) { return row % 7; }),
-      BaseVector::createNullConstant(UNKNOWN(), size, pool()),
-  });
-
-  // Exclude "UNKNOWN" column as DuckDB doesn't understand UNKNOWN type
-  createDuckDbTable(
-      {makeRowVector({vector->childAt(0)}),
-       makeRowVector({vector->childAt(0)})});
-
-  core::PlanNodeId orderById;
-  auto plan = PlanBuilder()
-                  .values({vector, vector})
-                  .orderBy({"c0 DESC NULLS LAST"}, false)
-                  .capturePlanNodeId(orderById)
-                  .planNode();
-  runTest(
-      plan,
-      orderById,
-      "SELECT *, null FROM tmp ORDER BY c0 DESC NULLS LAST",
-      {0});
-}
-
-/// Verifies output batch rows of OrderBy
-TEST_F(OrderByTest, outputBatchRows) {
-  struct {
-    int numRowsPerBatch;
-    int preferredOutBatchBytes;
-    int maxOutBatchRows;
-    int expectedOutputVectors;
-
-    // TODO: add output size check with spilling enabled
-    std::string debugString() const {
-      return fmt::format(
-          "numRowsPerBatch:{}, preferredOutBatchBytes:{}, maxOutBatchRows:{}, expectedOutputVectors:{}",
-          numRowsPerBatch,
-          preferredOutBatchBytes,
-          maxOutBatchRows,
-          expectedOutputVectors);
-    }
-  } testSettings[] = {
-      {1024, 1, 100, 1024},
-      // estimated size per row is ~2092, set preferredOutBatchBytes to 20920,
-      // so each batch has 10 rows, so it would return 100 batches
-      {1000, 20920, 100, 100},
-      // same as above, but maxOutBatchRows is 1, so it would return 1000
-      // batches
-      {1000, 20920, 1, 1000}};
-
-  for (const auto& testData : testSettings) {
-    SCOPED_TRACE(testData.debugString());
-    const vector_size_t batchSize = testData.numRowsPerBatch;
-    std::vector<RowVectorPtr> rowVectors;
-    auto c0 = makeFlatVector<int64_t>(
-        batchSize, [&](vector_size_t row) { return row; }, nullEvery(5));
-    auto c1 = makeFlatVector<double>(
-        batchSize, [&](vector_size_t row) { return row; }, nullEvery(11));
-    std::vector<VectorPtr> vectors;
-    vectors.push_back(c0);
-    for (int i = 0; i < 256; ++i) {
-      vectors.push_back(c1);
-    }
-    rowVectors.push_back(makeRowVector(vectors));
-    createDuckDbTable(rowVectors);
-
-    core::PlanNodeId orderById;
-    auto plan = PlanBuilder()
-                    .values(rowVectors)
-                    .orderBy({fmt::format("{} ASC NULLS LAST", "c0")}, false)
-                    .capturePlanNodeId(orderById)
-                    .planNode();
-    auto queryCtx = core::QueryCtx::create(executor_.get());
-    queryCtx->testingOverrideConfigUnsafe(
-        {{core::QueryConfig::kPreferredOutputBatchBytes,
-          std::to_string(testData.preferredOutBatchBytes)},
-         {core::QueryConfig::kMaxOutputBatchRows,
-          std::to_string(testData.maxOutBatchRows)}});
-    CursorParameters params;
-    params.planNode = plan;
-    params.queryCtx = queryCtx;
-    auto task = assertQueryOrdered(
-        params, "SELECT * FROM tmp ORDER BY c0 ASC NULLS LAST", {0});
-    EXPECT_EQ(
-        testData.expectedOutputVectors,
-        toPlanStats(task->taskStats()).at(orderById).outputVectors);
-  }
-}
-#endif
-
 } // namespace

From 5fe25754128ae8c324f1a87a13ea50111a7c09fc Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Mon, 17 Mar 2025 09:48:48 +0000
Subject: [PATCH 13/37] Ignore known warnings just for cudf_exec target

---
 velox/experimental/cudf/exec/CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/velox/experimental/cudf/exec/CMakeLists.txt b/velox/experimental/cudf/exec/CMakeLists.txt
index c47d3a5065a..fc6a5f4cf69 100644
--- a/velox/experimental/cudf/exec/CMakeLists.txt
+++ b/velox/experimental/cudf/exec/CMakeLists.txt
@@ -32,3 +32,8 @@ target_link_libraries(
   velox_exception
   velox_common_base
   velox_exec)
+
+target_compile_options(
+  velox_cudf_exec
+  PRIVATE
+  -Wno-missing-field-initializers)

From c9b2c1aeeedb0662a8bc99bd11eb78eeaff38ba4 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Mon, 17 Mar 2025 09:51:12 +0000
Subject: [PATCH 14/37] Misc review fixes: - West const - header cleanup -
 nodiscard

---
 .../experimental/cudf/exec/CudfConversion.cpp |  6 ++---
 velox/experimental/cudf/exec/CudfOrderBy.cpp  |  3 ---
 velox/experimental/cudf/exec/CudfOrderBy.h    |  3 ---
 velox/experimental/cudf/exec/Utilities.cpp    | 24 +++++++++----------
 velox/experimental/cudf/exec/Utilities.h      |  4 ++--
 5 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp
index 9b4f02ad62e..bed0e7e0e9e 100644
--- a/velox/experimental/cudf/exec/CudfConversion.cpp
+++ b/velox/experimental/cudf/exec/CudfConversion.cpp
@@ -92,8 +92,8 @@ void CudfFromVelox::addInput(RowVectorPtr input) {
 
 RowVectorPtr CudfFromVelox::getOutput() {
   VELOX_NVTX_OPERATOR_FUNC_RANGE();
-  auto const target_output_size = preferred_gpu_batch_size_rows();
-  auto const exit_early = finished_ or
+  const auto target_output_size = preferred_gpu_batch_size_rows();
+  const auto exit_early = finished_ or
       (current_output_size_ < target_output_size and not noMoreInput_) or
       inputs_.empty();
   finished_ = noMoreInput_;
@@ -122,7 +122,7 @@ RowVectorPtr CudfFromVelox::getOutput() {
   VELOX_CHECK_NOT_NULL(tbl);
 
   // Return a CudfVector that owns the cudf table
-  auto const size = tbl->num_rows();
+  const auto size = tbl->num_rows();
   return std::make_shared<CudfVector>(
       input->pool(), outputType_, size, std::move(tbl), stream);
 }
diff --git a/velox/experimental/cudf/exec/CudfOrderBy.cpp b/velox/experimental/cudf/exec/CudfOrderBy.cpp
index 3667a8b8000..796143ca26c 100644
--- a/velox/experimental/cudf/exec/CudfOrderBy.cpp
+++ b/velox/experimental/cudf/exec/CudfOrderBy.cpp
@@ -13,9 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "velox/exec/Driver.h"
-#include "velox/exec/Operator.h"
-#include "velox/vector/ComplexVector.h"
 
 #include <cudf/concatenate.hpp>
 #include <cudf/sorting.hpp>
diff --git a/velox/experimental/cudf/exec/CudfOrderBy.h b/velox/experimental/cudf/exec/CudfOrderBy.h
index 1583494e2f3..481be54d16e 100644
--- a/velox/experimental/cudf/exec/CudfOrderBy.h
+++ b/velox/experimental/cudf/exec/CudfOrderBy.h
@@ -16,9 +16,6 @@
 
 #pragma once
 
-#include "velox/core/Expressions.h"
-#include "velox/core/PlanNode.h"
-#include "velox/exec/Driver.h"
 #include "velox/exec/Operator.h"
 #include "velox/experimental/cudf/exec/NvtxHelper.h"
 #include "velox/experimental/cudf/vector/CudfVector.h"
diff --git a/velox/experimental/cudf/exec/Utilities.cpp b/velox/experimental/cudf/exec/Utilities.cpp
index 3cd3d4d5b22..19c2e2b9646 100644
--- a/velox/experimental/cudf/exec/Utilities.cpp
+++ b/velox/experimental/cudf/exec/Utilities.cpp
@@ -14,10 +14,6 @@
  * limitations under the License.
  */
 
-#include <cstdlib>
-#include <memory>
-#include <string_view>
-
 #include "velox/experimental/cudf/exec/Utilities.h"
 
 #include <common/base/Exceptions.h>
@@ -36,32 +32,36 @@
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
+#include <cstdlib>
+#include <memory>
+#include <string_view>
+
 namespace facebook::velox::cudf_velox {
 
 namespace {
-auto make_cuda_mr() {
+[[nodiscard]] auto make_cuda_mr() {
   return std::make_shared<rmm::mr::cuda_memory_resource>();
 }
 
-auto make_pool_mr() {
+[[nodiscard]] auto make_pool_mr() {
   return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
       make_cuda_mr(), rmm::percent_of_free_device_memory(50));
 }
 
-auto make_async_mr() {
+[[nodiscard]] auto make_async_mr() {
   return std::make_shared<rmm::mr::cuda_async_memory_resource>();
 }
 
-auto make_managed_mr() {
+[[nodiscard]] auto make_managed_mr() {
   return std::make_shared<rmm::mr::managed_memory_resource>();
 }
 
-auto make_arena_mr() {
+[[nodiscard]] auto make_arena_mr() {
   return rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(
       make_cuda_mr());
 }
 
-auto make_managed_pool_mr() {
+[[nodiscard]] auto make_managed_pool_mr() {
   return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
       make_managed_mr(), rmm::percent_of_free_device_memory(50));
 }
@@ -105,7 +105,7 @@ std::unique_ptr<cudf::table> concatenateTables(
       tables.begin(),
       tables.end(),
       std::back_inserter(tableViews),
-      [&](auto const& tbl) { return tbl->view(); });
+      [&](const auto& tbl) { return tbl->view(); });
   return cudf::concatenate(
       tableViews, stream, cudf::get_current_device_resource_ref());
 }
@@ -122,7 +122,7 @@ std::unique_ptr<cudf::table> getConcatenatedTable(
   inputStreams.reserve(tables.size());
   tableViews.reserve(tables.size());
 
-  for (auto const& table : tables) {
+  for (const auto& table : tables) {
     VELOX_CHECK_NOT_NULL(table);
     tableViews.push_back(table->getTableView());
     inputStreams.push_back(table->stream());
diff --git a/velox/experimental/cudf/exec/Utilities.h b/velox/experimental/cudf/exec/Utilities.h
index 73a88490d0f..e3480c7d41c 100644
--- a/velox/experimental/cudf/exec/Utilities.h
+++ b/velox/experimental/cudf/exec/Utilities.h
@@ -39,14 +39,14 @@ create_memory_resource(std::string_view mode);
 [[nodiscard]] cudf::detail::cuda_stream_pool& cudfGlobalStreamPool();
 
 // Concatenate a vector of cuDF tables into a single table
-std::unique_ptr<cudf::table> concatenateTables(
+[[nodiscard]] std::unique_ptr<cudf::table> concatenateTables(
     std::vector<std::unique_ptr<cudf::table>> tables,
     rmm::cuda_stream_view stream);
 
 // Concatenate a vector of cuDF tables into a single table.
 // This function joins the streams owned by individual tables on the passed
 // stream. Inputs are not safe to use after calling this function.
-std::unique_ptr<cudf::table> getConcatenatedTable(
+[[nodiscard]] std::unique_ptr<cudf::table> getConcatenatedTable(
     std::vector<CudfVectorPtr>& tables,
     rmm::cuda_stream_view stream);
 

From 5a75b9ec5180f56e7f7c417b9c2892d169f4c3b9 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Wed, 19 Mar 2025 13:09:26 +0000
Subject: [PATCH 15/37] Misc review changes requested by @bdice

---
 CMake/resolve_dependency_modules/cudf.cmake   | 14 ++++++++++
 CMakeLists.txt                                |  6 ++++-
 velox/experimental/cudf/CMakeLists.txt        |  1 -
 velox/experimental/cudf/exec/CMakeLists.txt   |  4 ---
 velox/experimental/cudf/exec/ToCudf.cpp       | 12 ++++++---
 .../cudf/exec/VeloxCudfInterop.cpp            |  2 +-
 velox/experimental/cudf/vector/CMakeLists.txt | 26 -------------------
 velox/experimental/cudf/vector/CudfVector.cpp | 21 ---------------
 8 files changed, 29 insertions(+), 57 deletions(-)
 delete mode 100644 velox/experimental/cudf/vector/CMakeLists.txt
 delete mode 100644 velox/experimental/cudf/vector/CudfVector.cpp

diff --git a/CMake/resolve_dependency_modules/cudf.cmake b/CMake/resolve_dependency_modules/cudf.cmake
index 0c57dbdcf01..4c9d015dbf1 100644
--- a/CMake/resolve_dependency_modules/cudf.cmake
+++ b/CMake/resolve_dependency_modules/cudf.cmake
@@ -14,6 +14,14 @@
 
 include_guard(GLOBAL)
 
+set(VELOX_rapids_cmake_VERSION 25.04)
+set(VELOX_rapids_cmake_BUILD_SHA256_CHECKSUM
+    458c14eaff9000067b32d65c8c914f4521090ede7690e16eb57035ce731386db)
+set(VELOX_rapids_cmake_SOURCE_URL
+    "https://github.com/rapidsai/rapids-cmake/archive/7828fc8ff2e9f4fa86099f3c844505c2f47ac672.tar.gz"
+)
+velox_resolve_dependency_url(rapids_cmake)
+
 set(VELOX_rmm_VERSION 25.04)
 set(VELOX_rmm_BUILD_SHA256_CHECKSUM
     294905094213a2d1fd8e024500359ff871bc52f913a3fbaca3514727c49f62de)
@@ -52,6 +60,12 @@ string(
   APPEND CMAKE_CXX_FLAGS
   " -Wno-non-virtual-dtor -Wno-missing-field-initializers -Wno-deprecated-copy")
 
+FetchContent_Declare(
+  rapids-cmake
+  URL ${VELOX_rapids_cmake_SOURCE_URL}
+  URL_HASH ${VELOX_rapids_cmake_BUILD_SHA256_CHECKSUM}
+  UPDATE_DISCONNECTED 1)
+
 FetchContent_Declare(
   rmm
   URL ${VELOX_rmm_SOURCE_URL}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d7966304c1b..b93de09121f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -227,7 +227,8 @@ find_package(OpenSSL REQUIRED)
 
 if(VELOX_ENABLE_CCACHE
    AND NOT CMAKE_C_COMPILER_LAUNCHER
-   AND NOT CMAKE_CXX_COMPILER_LAUNCHER)
+   AND NOT CMAKE_CXX_COMPILER_LAUNCHER
+   AND NOT CMAKE_CUDA_COMPILER_LAUNCHER)
 
   find_program(CCACHE_FOUND ccache)
 
@@ -464,6 +465,9 @@ else()
 endif()
 velox_resolve_dependency(glog)
 
+velox_set_source(fmt)
+velox_resolve_dependency(fmt 9.0.0)
+
 if(${VELOX_BUILD_MINIMAL_WITH_DWIO} OR ${VELOX_ENABLE_HIVE_CONNECTOR})
   # DWIO needs all sorts of stream compression libraries.
   #
diff --git a/velox/experimental/cudf/CMakeLists.txt b/velox/experimental/cudf/CMakeLists.txt
index e2be268915c..6d400056c35 100644
--- a/velox/experimental/cudf/CMakeLists.txt
+++ b/velox/experimental/cudf/CMakeLists.txt
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 add_subdirectory(exec)
-add_subdirectory(vector)
 
 if(VELOX_BUILD_TESTING)
   add_subdirectory(tests)
diff --git a/velox/experimental/cudf/exec/CMakeLists.txt b/velox/experimental/cudf/exec/CMakeLists.txt
index fc6a5f4cf69..51cd5bde74c 100644
--- a/velox/experimental/cudf/exec/CMakeLists.txt
+++ b/velox/experimental/cudf/exec/CMakeLists.txt
@@ -20,10 +20,6 @@ add_library(
   Utilities.cpp
   VeloxCudfInterop.cpp)
 
-set_target_properties(
-  velox_cudf_exec
-  PROPERTIES CUDA_ARCHITECTURES native)
-
 target_link_libraries(
   velox_cudf_exec
   cudf::cudf
diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp
index 2741ee246aa..957be223efe 100644
--- a/velox/experimental/cudf/exec/ToCudf.cpp
+++ b/velox/experimental/cudf/exec/ToCudf.cpp
@@ -34,11 +34,15 @@
 
 namespace facebook::velox::cudf_velox {
 
+namespace {
+
 template <class... Deriveds, class Base>
 bool is_any_of(const Base* p) {
   return ((dynamic_cast<const Deriveds*>(p) != nullptr) || ...);
 }
 
+} // namespace
+
 static bool _cudfIsRegistered = false;
 
 bool CompileState::compile() {
@@ -109,10 +113,8 @@ bool CompileState::compile() {
       auto plan_node = std::dynamic_pointer_cast<const core::OrderByNode>(
           get_plan_node(orderByOp->planNodeId()));
       VELOX_CHECK(plan_node != nullptr);
-      // From-velox (optional)
       replace_op.push_back(std::make_unique<CudfOrderBy>(id, ctx, plan_node));
       replace_op.back()->initialize();
-      // To-velox (optional)
     }
 
     if (next_operator_is_not_gpu and produces_gpu_output(oper)) {
@@ -176,13 +178,17 @@ struct cudfDriverAdapter {
 };
 
 void registerCudf() {
+  if (cudfIsRegistered()) {
+    return;
+  }
+
   const char* env_cudf_disabled = std::getenv("VELOX_CUDF_DISABLED");
   if (env_cudf_disabled != nullptr && std::stoi(env_cudf_disabled)) {
     return;
   }
 
   CUDF_FUNC_RANGE();
-  cudaFree(0); // to init context.
+  cudaFree(0); // Initialize CUDA context at startup
 
   const char* env_cudf_mr = std::getenv("VELOX_CUDF_MEMORY_RESOURCE");
   auto mr_mode = env_cudf_mr != nullptr ? env_cudf_mr : "async";
diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
index bc906ef1b4a..cf0f202da6c 100644
--- a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
+++ b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
@@ -56,7 +56,7 @@ namespace facebook::velox::cudf_velox {
 namespace with_arrow {
 
 std::unique_ptr<cudf::table> to_cudf_table(
-    const facebook::velox::RowVectorPtr& veloxTable, // BaseVector or RowVector?
+    const facebook::velox::RowVectorPtr& veloxTable,
     facebook::velox::memory::MemoryPool* pool,
     rmm::cuda_stream_view stream) {
   // Need to flattenDictionary and flattenConstant, otherwise we observe issues
diff --git a/velox/experimental/cudf/vector/CMakeLists.txt b/velox/experimental/cudf/vector/CMakeLists.txt
deleted file mode 100644
index d26f0b4c7dc..00000000000
--- a/velox/experimental/cudf/vector/CMakeLists.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-add_library(velox_cudf_vector CudfVector.cpp)
-
-set_target_properties(
-  velox_cudf_vector
-  PROPERTIES CUDA_ARCHITECTURES native)
-
-target_link_libraries(
-  velox_cudf_vector
-  cudf::cudf
-  velox_exception
-  velox_common_base
-  velox_vector)
diff --git a/velox/experimental/cudf/vector/CudfVector.cpp b/velox/experimental/cudf/vector/CudfVector.cpp
deleted file mode 100644
index e4a9e845232..00000000000
--- a/velox/experimental/cudf/vector/CudfVector.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "velox/experimental/cudf/vector/CudfVector.h"
-
-namespace facebook::velox::cudf_velox {
-
-} // namespace facebook::velox::cudf_velox

From 3f4ca09f07a5414ccc0d6ddf8809ef37963dbb1e Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Wed, 19 Mar 2025 13:43:21 +0000
Subject: [PATCH 16/37] Remove only cudf adapter

---
 velox/experimental/cudf/exec/ToCudf.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp
index 957be223efe..90cb5988fef 100644
--- a/velox/experimental/cudf/exec/ToCudf.cpp
+++ b/velox/experimental/cudf/exec/ToCudf.cpp
@@ -201,7 +201,15 @@ void registerCudf() {
 }
 
 void unregisterCudf() {
-  exec::DriverFactory::adapters.clear();
+  exec::DriverFactory::adapters.erase(
+      std::remove_if(
+          exec::DriverFactory::adapters.begin(),
+          exec::DriverFactory::adapters.end(),
+          [](const exec::DriverAdapter& adapter) {
+            return adapter.label == "cuDF";
+          }),
+      exec::DriverFactory::adapters.end());
+
   _cudfIsRegistered = false;
 }
 

From 6f7d72e38c37ac23a906b4938fa86fc6dc8878e9 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Wed, 19 Mar 2025 14:14:12 +0000
Subject: [PATCH 17/37] Add clang format to our subdir

---
 velox/experimental/cudf/.clang-format         | 27 +++++++++++++++++++
 .../experimental/cudf/exec/CudfConversion.cpp | 12 ++++-----
 velox/experimental/cudf/exec/CudfConversion.h |  6 ++---
 velox/experimental/cudf/exec/CudfOrderBy.cpp  | 10 +++----
 velox/experimental/cudf/exec/CudfOrderBy.h    |  3 ++-
 velox/experimental/cudf/exec/ToCudf.cpp       |  7 ++---
 velox/experimental/cudf/exec/Utilities.cpp    | 12 ++++-----
 velox/experimental/cudf/exec/Utilities.h      |  7 ++---
 .../cudf/exec/VeloxCudfInterop.cpp            | 14 +++++-----
 velox/experimental/cudf/tests/OrderByTest.cpp |  9 ++++---
 10 files changed, 69 insertions(+), 38 deletions(-)
 create mode 100644 velox/experimental/cudf/.clang-format

diff --git a/velox/experimental/cudf/.clang-format b/velox/experimental/cudf/.clang-format
new file mode 100644
index 00000000000..7b028e6ff68
--- /dev/null
+++ b/velox/experimental/cudf/.clang-format
@@ -0,0 +1,27 @@
+BasedOnStyle: InheritParentConfig
+IncludeBlocks: Regroup
+IncludeCategories:
+  - Regex:           '^"velox/experimental/' # velox/experimental includes
+    Priority:        0
+  - Regex:           '^"' # quoted includes
+    Priority:        1
+  - Regex:           '^<(benchmarks|tests)/' # benchmark includes
+    Priority:        2
+  - Regex:           '^<cudf_test/' # cuDF includes
+    Priority:        3
+  - Regex:           '^<cudf/' # cuDF includes
+    Priority:        4
+  - Regex:           '^<(nvtext|cudf_kafka)' # other libcudf includes
+    Priority:        5
+  - Regex:           '^<(cugraph|cuml|cuspatial|raft|kvikio)' # Other RAPIDS includes
+    Priority:        6
+  - Regex:           '^<rmm/' # RMM includes
+    Priority:        7
+  - Regex:           '^<(thrust|cub|cuda)/' # CCCL includes
+    Priority:        8
+  - Regex:           '^<(cooperative_groups|cuco|cuda.h|cuda_runtime|device_types|math_constants|nvtx3)' # CUDA includes
+    Priority:        8
+  - Regex:           '^<.*\..*' # other system includes (e.g. with a '.')
+    Priority:        9
+  - Regex:           '^<[^.]+' # STL includes (no '.')
+    Priority:        10
\ No newline at end of file
diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp
index bed0e7e0e9e..f47f0c239eb 100644
--- a/velox/experimental/cudf/exec/CudfConversion.cpp
+++ b/velox/experimental/cudf/exec/CudfConversion.cpp
@@ -13,6 +13,12 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include "velox/experimental/cudf/exec/CudfConversion.h"
+#include "velox/experimental/cudf/exec/NvtxHelper.h"
+#include "velox/experimental/cudf/exec/Utilities.h"
+#include "velox/experimental/cudf/exec/VeloxCudfInterop.h"
+#include "velox/experimental/cudf/vector/CudfVector.h"
+
 #include "velox/exec/Driver.h"
 #include "velox/exec/Operator.h"
 #include "velox/vector/ComplexVector.h"
@@ -21,12 +27,6 @@
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
-#include "velox/experimental/cudf/exec/CudfConversion.h"
-#include "velox/experimental/cudf/exec/NvtxHelper.h"
-#include "velox/experimental/cudf/exec/Utilities.h"
-#include "velox/experimental/cudf/exec/VeloxCudfInterop.h"
-#include "velox/experimental/cudf/vector/CudfVector.h"
-
 namespace facebook::velox::cudf_velox {
 
 namespace {
diff --git a/velox/experimental/cudf/exec/CudfConversion.h b/velox/experimental/cudf/exec/CudfConversion.h
index c75f8464b36..480912256f9 100644
--- a/velox/experimental/cudf/exec/CudfConversion.h
+++ b/velox/experimental/cudf/exec/CudfConversion.h
@@ -16,15 +16,15 @@
 
 #pragma once
 
+#include "velox/experimental/cudf/exec/NvtxHelper.h"
+#include "velox/experimental/cudf/vector/CudfVector.h"
+
 #include "velox/exec/Driver.h"
 #include "velox/exec/Operator.h"
 #include "velox/vector/ComplexVector.h"
 
 #include <cudf/table/table.hpp>
 
-#include "velox/experimental/cudf/exec/NvtxHelper.h"
-#include "velox/experimental/cudf/vector/CudfVector.h"
-
 #include <deque>
 #include <memory>
 #include <vector>
diff --git a/velox/experimental/cudf/exec/CudfOrderBy.cpp b/velox/experimental/cudf/exec/CudfOrderBy.cpp
index 796143ca26c..cb6f34f7dda 100644
--- a/velox/experimental/cudf/exec/CudfOrderBy.cpp
+++ b/velox/experimental/cudf/exec/CudfOrderBy.cpp
@@ -14,16 +14,16 @@
  * limitations under the License.
  */
 
-#include <cudf/concatenate.hpp>
-#include <cudf/sorting.hpp>
-#include <cudf/table/table.hpp>
-#include <cudf/utilities/default_stream.hpp>
-
 #include "velox/experimental/cudf/exec/CudfOrderBy.h"
 #include "velox/experimental/cudf/exec/NvtxHelper.h"
 #include "velox/experimental/cudf/exec/Utilities.h"
 #include "velox/experimental/cudf/exec/VeloxCudfInterop.h"
 
+#include <cudf/concatenate.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
 namespace facebook::velox::cudf_velox {
 
 CudfOrderBy::CudfOrderBy(
diff --git a/velox/experimental/cudf/exec/CudfOrderBy.h b/velox/experimental/cudf/exec/CudfOrderBy.h
index 481be54d16e..29b225e1e2f 100644
--- a/velox/experimental/cudf/exec/CudfOrderBy.h
+++ b/velox/experimental/cudf/exec/CudfOrderBy.h
@@ -16,9 +16,10 @@
 
 #pragma once
 
-#include "velox/exec/Operator.h"
 #include "velox/experimental/cudf/exec/NvtxHelper.h"
 #include "velox/experimental/cudf/vector/CudfVector.h"
+
+#include "velox/exec/Operator.h"
 #include "velox/vector/ComplexVector.h"
 
 #include <cudf/table/table.hpp>
diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp
index 90cb5988fef..3be5cfe97fd 100644
--- a/velox/experimental/cudf/exec/ToCudf.cpp
+++ b/velox/experimental/cudf/exec/ToCudf.cpp
@@ -14,7 +14,11 @@
  * limitations under the License.
  */
 
+#include "velox/experimental/cudf/exec/CudfConversion.h"
+#include "velox/experimental/cudf/exec/CudfOrderBy.h"
 #include "velox/experimental/cudf/exec/ToCudf.h"
+#include "velox/experimental/cudf/exec/Utilities.h"
+
 #include "velox/exec/Driver.h"
 #include "velox/exec/FilterProject.h"
 #include "velox/exec/HashAggregation.h"
@@ -22,9 +26,6 @@
 #include "velox/exec/HashProbe.h"
 #include "velox/exec/Operator.h"
 #include "velox/exec/OrderBy.h"
-#include "velox/experimental/cudf/exec/CudfConversion.h"
-#include "velox/experimental/cudf/exec/CudfOrderBy.h"
-#include "velox/experimental/cudf/exec/Utilities.h"
 
 #include <cudf/detail/nvtx/ranges.hpp>
 
diff --git a/velox/experimental/cudf/exec/Utilities.cpp b/velox/experimental/cudf/exec/Utilities.cpp
index 19c2e2b9646..dbf49c65b89 100644
--- a/velox/experimental/cudf/exec/Utilities.cpp
+++ b/velox/experimental/cudf/exec/Utilities.cpp
@@ -16,7 +16,11 @@
 
 #include "velox/experimental/cudf/exec/Utilities.h"
 
-#include <common/base/Exceptions.h>
+#include <cudf/concatenate.hpp>
+#include <cudf/detail/utilities/stream_pool.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/mr/device/arena_memory_resource.hpp>
 #include <rmm/mr/device/cuda_async_memory_resource.hpp>
@@ -26,11 +30,7 @@
 #include <rmm/mr/device/owning_wrapper.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 
-#include <cudf/concatenate.hpp>
-#include <cudf/detail/utilities/stream_pool.hpp>
-#include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/error.hpp>
-#include <cudf/utilities/memory_resource.hpp>
+#include <common/base/Exceptions.h>
 
 #include <cstdlib>
 #include <memory>
diff --git a/velox/experimental/cudf/exec/Utilities.h b/velox/experimental/cudf/exec/Utilities.h
index e3480c7d41c..1e5912359b5 100644
--- a/velox/experimental/cudf/exec/Utilities.h
+++ b/velox/experimental/cudf/exec/Utilities.h
@@ -16,15 +16,16 @@
 
 #pragma once
 
-#include <memory>
-#include <string_view>
-
 #include "velox/experimental/cudf/vector/CudfVector.h"
 
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/table/table.hpp>
+
 #include <rmm/mr/device/device_memory_resource.hpp>
 
+#include <memory>
+#include <string_view>
+
 namespace facebook::velox::cudf_velox {
 
 /**
diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
index cf0f202da6c..bc86939e996 100644
--- a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
+++ b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
@@ -14,6 +14,10 @@
  * limitations under the License.
  */
 
+#include "velox/experimental/cudf/exec/NvtxHelper.h"
+#include "velox/experimental/cudf/exec/Utilities.h"
+#include "velox/experimental/cudf/exec/VeloxCudfInterop.h"
+
 #include "velox/common/memory/Memory.h"
 #include "velox/type/Type.h"
 #include "velox/vector/BaseVector.h"
@@ -40,17 +44,13 @@
 #include <thrust/copy.h>
 #include <thrust/execution_policy.h>
 
-#include "velox/experimental/cudf/exec/NvtxHelper.h"
-#include "velox/experimental/cudf/exec/Utilities.h"
-#include "velox/experimental/cudf/exec/VeloxCudfInterop.h"
-
-#include <functional>
-#include <numeric>
-
 #include <arrow/c/bridge.h>
 #include <arrow/io/interfaces.h>
 #include <arrow/table.h>
 
+#include <functional>
+#include <numeric>
+
 namespace facebook::velox::cudf_velox {
 
 namespace with_arrow {
diff --git a/velox/experimental/cudf/tests/OrderByTest.cpp b/velox/experimental/cudf/tests/OrderByTest.cpp
index 86ad291a3ff..d3eb5a75867 100644
--- a/velox/experimental/cudf/tests/OrderByTest.cpp
+++ b/velox/experimental/cudf/tests/OrderByTest.cpp
@@ -13,9 +13,9 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <re2/re2.h>
+#include "velox/experimental/cudf/exec/ToCudf.h"
+#include "velox/experimental/cudf/exec/Utilities.h"
 
-#include <fmt/format.h>
 #include "velox/common/base/tests/GTestUtils.h"
 #include "velox/core/QueryConfig.h"
 #include "velox/dwio/common/tests/utils/BatchMaker.h"
@@ -23,8 +23,9 @@
 #include "velox/exec/tests/utils/AssertQueryBuilder.h"
 #include "velox/exec/tests/utils/OperatorTestBase.h"
 #include "velox/exec/tests/utils/PlanBuilder.h"
-#include "velox/experimental/cudf/exec/ToCudf.h"
-#include "velox/experimental/cudf/exec/Utilities.h"
+
+#include <fmt/format.h>
+#include <re2/re2.h>
 
 using namespace facebook::velox;
 using namespace facebook::velox::exec;

From 2d679a4e8dac26601b4d72a1e962e26a8070168b Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Thu, 20 Mar 2025 17:58:53 +0000
Subject: [PATCH 18/37] Fix style

---
 velox/experimental/cudf/exec/CMakeLists.txt | 5 +----
 velox/experimental/cudf/exec/NvtxHelper.h   | 1 +
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/velox/experimental/cudf/exec/CMakeLists.txt b/velox/experimental/cudf/exec/CMakeLists.txt
index 51cd5bde74c..430ce71bca6 100644
--- a/velox/experimental/cudf/exec/CMakeLists.txt
+++ b/velox/experimental/cudf/exec/CMakeLists.txt
@@ -29,7 +29,4 @@ target_link_libraries(
   velox_common_base
   velox_exec)
 
-target_compile_options(
-  velox_cudf_exec
-  PRIVATE
-  -Wno-missing-field-initializers)
+target_compile_options(velox_cudf_exec PRIVATE -Wno-missing-field-initializers)
diff --git a/velox/experimental/cudf/exec/NvtxHelper.h b/velox/experimental/cudf/exec/NvtxHelper.h
index 4e4efca7a08..892b1976365 100644
--- a/velox/experimental/cudf/exec/NvtxHelper.h
+++ b/velox/experimental/cudf/exec/NvtxHelper.h
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <nvtx3/nvtx3.hpp>
+
 #include <optional>
 
 namespace facebook::velox::cudf_velox {

From 757bce934a3f6586f25bfe32eda218c05d0f6a7c Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Fri, 21 Mar 2025 08:56:57 +0000
Subject: [PATCH 19/37] Fix naming

---
 .../experimental/cudf/exec/CudfConversion.cpp |  37 ++++---
 velox/experimental/cudf/exec/CudfConversion.h |   2 +-
 velox/experimental/cudf/exec/CudfOrderBy.cpp  |  38 +++----
 velox/experimental/cudf/exec/CudfOrderBy.h    |   6 +-
 velox/experimental/cudf/exec/NvtxHelper.h     |  14 +--
 velox/experimental/cudf/exec/ToCudf.cpp       | 102 +++++++++---------
 velox/experimental/cudf/exec/Utilities.cpp    |  32 +++---
 velox/experimental/cudf/exec/Utilities.h      |   2 +-
 .../cudf/exec/VeloxCudfInterop.cpp            |  42 ++++----
 .../experimental/cudf/exec/VeloxCudfInterop.h |   8 +-
 10 files changed, 141 insertions(+), 142 deletions(-)

diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp
index f47f0c239eb..bfe8ff01b26 100644
--- a/velox/experimental/cudf/exec/CudfConversion.cpp
+++ b/velox/experimental/cudf/exec/CudfConversion.cpp
@@ -50,13 +50,12 @@ RowVectorPtr mergeRowVectors(
   return copy;
 }
 
-cudf::size_type preferred_gpu_batch_size_rows() {
-  constexpr cudf::size_type default_gpu_batch_size_rows = 100000;
-  const char* env_cudf_gpu_batch_size_rows =
+cudf::size_type preferredGpuBatchSizeRows() {
+  constexpr cudf::size_type kDefaultGpuBatchSizeRows = 100000;
+  const char* envCudfGpuBatchSizeRows =
       std::getenv("VELOX_CUDF_GPU_BATCH_SIZE_ROWS");
-  return env_cudf_gpu_batch_size_rows != nullptr
-      ? std::stoi(env_cudf_gpu_batch_size_rows)
-      : default_gpu_batch_size_rows;
+  return envCudfGpuBatchSizeRows != nullptr ? std::stoi(envCudfGpuBatchSizeRows)
+                                            : kDefaultGpuBatchSizeRows;
 }
 } // namespace
 
@@ -85,26 +84,26 @@ void CudfFromVelox::addInput(RowVectorPtr input) {
 
       // Accumulate inputs
       inputs_.push_back(input);
-      current_output_size_ += input->size();
+      currentOutputSize_ += input->size();
     }
   }
 }
 
 RowVectorPtr CudfFromVelox::getOutput() {
   VELOX_NVTX_OPERATOR_FUNC_RANGE();
-  const auto target_output_size = preferred_gpu_batch_size_rows();
-  const auto exit_early = finished_ or
-      (current_output_size_ < target_output_size and not noMoreInput_) or
+  const auto kTargetOutputSize = preferredGpuBatchSizeRows();
+  const auto kExitEarly = finished_ or
+      (currentOutputSize_ < kTargetOutputSize and not noMoreInput_) or
       inputs_.empty();
   finished_ = noMoreInput_;
-  if (exit_early) {
+  if (kExitEarly) {
     return nullptr;
   }
 
   // Combine all input RowVectors into a single RowVector and clear inputs
   auto input = mergeRowVectors(inputs_, inputs_[0]->pool());
   inputs_.clear();
-  current_output_size_ = 0;
+  currentOutputSize_ = 0;
 
   // Early return if no input
   if (input->size() == 0) {
@@ -115,16 +114,16 @@ RowVectorPtr CudfFromVelox::getOutput() {
   auto stream = cudfGlobalStreamPool().get_stream();
 
   // Convert RowVector to cudf table
-  auto tbl = with_arrow::to_cudf_table(input, input->pool(), stream);
+  auto tbl = with_arrow::toCudfTable(input, input->pool(), stream);
 
   stream.synchronize();
 
   VELOX_CHECK_NOT_NULL(tbl);
 
   // Return a CudfVector that owns the cudf table
-  const auto size = tbl->num_rows();
+  const auto kSize = tbl->num_rows();
   return std::make_shared<CudfVector>(
-      input->pool(), outputType_, size, std::move(tbl), stream);
+      input->pool(), outputType_, kSize, std::move(tbl), stream);
 }
 
 void CudfFromVelox::close() {
@@ -149,9 +148,9 @@ CudfToVelox::CudfToVelox(
 void CudfToVelox::addInput(RowVectorPtr input) {
   // Accumulate inputs
   if (input->size() > 0) {
-    auto cudf_input = std::dynamic_pointer_cast<CudfVector>(input);
-    VELOX_CHECK_NOT_NULL(cudf_input);
-    inputs_.push_back(std::move(cudf_input));
+    auto cudfInput = std::dynamic_pointer_cast<CudfVector>(input);
+    VELOX_CHECK_NOT_NULL(cudfInput);
+    inputs_.push_back(std::move(cudfInput));
   }
 }
 
@@ -171,7 +170,7 @@ RowVectorPtr CudfToVelox::getOutput() {
     return nullptr;
   }
   RowVectorPtr output =
-      with_arrow::to_velox_column(tbl->view(), pool(), "", stream);
+      with_arrow::toVeloxColumn(tbl->view(), pool(), "", stream);
   stream.synchronize();
   finished_ = noMoreInput_ && inputs_.empty();
   output->setType(outputType_);
diff --git a/velox/experimental/cudf/exec/CudfConversion.h b/velox/experimental/cudf/exec/CudfConversion.h
index 480912256f9..16259aa2f8f 100644
--- a/velox/experimental/cudf/exec/CudfConversion.h
+++ b/velox/experimental/cudf/exec/CudfConversion.h
@@ -59,7 +59,7 @@ class CudfFromVelox : public exec::Operator, public NvtxHelper {
 
  private:
   std::vector<RowVectorPtr> inputs_;
-  std::size_t current_output_size_ = 0;
+  std::size_t currentOutputSize_ = 0;
   bool finished_ = false;
 };
 
diff --git a/velox/experimental/cudf/exec/CudfOrderBy.cpp b/velox/experimental/cudf/exec/CudfOrderBy.cpp
index cb6f34f7dda..c66b535f82a 100644
--- a/velox/experimental/cudf/exec/CudfOrderBy.cpp
+++ b/velox/experimental/cudf/exec/CudfOrderBy.cpp
@@ -38,22 +38,22 @@ CudfOrderBy::CudfOrderBy(
           "CudfOrderBy"),
       NvtxHelper(nvtx3::rgb{64, 224, 208}, operatorId), // Turquoise
       orderByNode_(orderByNode) {
-  sort_keys_.reserve(orderByNode->sortingKeys().size());
-  column_order_.reserve(orderByNode->sortingKeys().size());
-  null_order_.reserve(orderByNode->sortingKeys().size());
+  sortKeys_.reserve(orderByNode->sortingKeys().size());
+  columnOrder_.reserve(orderByNode->sortingKeys().size());
+  nullOrder_.reserve(orderByNode->sortingKeys().size());
   for (int i = 0; i < orderByNode->sortingKeys().size(); ++i) {
-    const auto channel =
+    const auto kChannel =
         exec::exprToChannel(orderByNode->sortingKeys()[i].get(), outputType_);
     VELOX_CHECK(
-        channel != kConstantChannel,
+        kChannel != kConstantChannel,
         "OrderBy doesn't allow constant sorting keys");
-    sort_keys_.push_back(channel);
-    auto const& sorting_order = orderByNode->sortingOrders()[i];
-    column_order_.push_back(
-        sorting_order.isAscending() ? cudf::order::ASCENDING
-                                    : cudf::order::DESCENDING);
-    null_order_.push_back(
-        (sorting_order.isNullsFirst() ^ !sorting_order.isAscending())
+    sortKeys_.push_back(kChannel);
+    auto const& sortingOrder = orderByNode->sortingOrders()[i];
+    columnOrder_.push_back(
+        sortingOrder.isAscending() ? cudf::order::ASCENDING
+                                   : cudf::order::DESCENDING);
+    nullOrder_.push_back(
+        (sortingOrder.isNullsFirst() ^ !sortingOrder.isAscending())
             ? cudf::null_order::BEFORE
             : cudf::null_order::AFTER);
   }
@@ -62,9 +62,9 @@ CudfOrderBy::CudfOrderBy(
 void CudfOrderBy::addInput(RowVectorPtr input) {
   // Accumulate inputs
   if (input->size() > 0) {
-    auto cudf_input = std::dynamic_pointer_cast<CudfVector>(input);
-    VELOX_CHECK_NOT_NULL(cudf_input);
-    inputs_.push_back(std::move(cudf_input));
+    auto cudfInput = std::dynamic_pointer_cast<CudfVector>(input);
+    VELOX_CHECK_NOT_NULL(cudfInput);
+    inputs_.push_back(std::move(cudfInput));
   }
 }
 
@@ -86,13 +86,13 @@ void CudfOrderBy::noMoreInput() {
 
   VELOX_CHECK_NOT_NULL(tbl);
 
-  auto keys = tbl->view().select(sort_keys_);
+  auto keys = tbl->view().select(sortKeys_);
   auto values = tbl->view();
   auto result =
-      cudf::sort_by_key(values, keys, column_order_, null_order_, stream);
-  auto const size = result->num_rows();
+      cudf::sort_by_key(values, keys, columnOrder_, nullOrder_, stream);
+  auto const kSize = result->num_rows();
   outputTable_ = std::make_shared<CudfVector>(
-      pool(), outputType_, size, std::move(result), stream);
+      pool(), outputType_, kSize, std::move(result), stream);
 }
 
 RowVectorPtr CudfOrderBy::getOutput() {
diff --git a/velox/experimental/cudf/exec/CudfOrderBy.h b/velox/experimental/cudf/exec/CudfOrderBy.h
index 29b225e1e2f..75e56315746 100644
--- a/velox/experimental/cudf/exec/CudfOrderBy.h
+++ b/velox/experimental/cudf/exec/CudfOrderBy.h
@@ -57,9 +57,9 @@ class CudfOrderBy : public exec::Operator, public NvtxHelper {
   CudfVectorPtr outputTable_;
   std::shared_ptr<const core::OrderByNode> orderByNode_;
   std::vector<CudfVectorPtr> inputs_;
-  std::vector<cudf::size_type> sort_keys_;
-  std::vector<cudf::order> column_order_;
-  std::vector<cudf::null_order> null_order_;
+  std::vector<cudf::size_type> sortKeys_;
+  std::vector<cudf::order> columnOrder_;
+  std::vector<cudf::null_order> nullOrder_;
   bool finished_{false};
 };
 
diff --git a/velox/experimental/cudf/exec/NvtxHelper.h b/velox/experimental/cudf/exec/NvtxHelper.h
index 892b1976365..dde348e4744 100644
--- a/velox/experimental/cudf/exec/NvtxHelper.h
+++ b/velox/experimental/cudf/exec/NvtxHelper.h
@@ -35,11 +35,11 @@ class NvtxHelper {
 /**
  * @brief Tag type for Velox's NVTX domain.
  */
-struct velox_domain {
+struct VeloxDomain {
   static constexpr char const* name{"velox"};
 };
 
-using nvtx_registered_string_t = nvtx3::registered_string_in<velox_domain>;
+using NvtxRegisteredStringT = nvtx3::registered_string_in<VeloxDomain>;
 
 #define VELOX_NVTX_OPERATOR_FUNC_RANGE()                                         \
   static_assert(                                                                 \
@@ -47,21 +47,21 @@ using nvtx_registered_string_t = nvtx3::registered_string_in<velox_domain>;
           value,                                                                 \
       "VELOX_NVTX_OPERATOR_FUNC_RANGE can only be used"                          \
       " in Operators derived from NvtxHelper");                                  \
-  static nvtx_registered_string_t const nvtx3_func_name__{                       \
+  static NvtxRegisteredStringT const nvtx3_func_name__{                          \
       std::string(__func__) + " " + std::string(__PRETTY_FUNCTION__)};           \
   static ::nvtx3::event_attributes const nvtx3_func_attr__{                    \
       this->payload_.has_value() ?                                             \
           ::nvtx3::event_attributes{nvtx3_func_name__, this->color_,           \
                                    nvtx3::payload{this->payload_.value()}} :   \
           ::nvtx3::event_attributes{nvtx3_func_name__, this->color_}}; \
-  ::nvtx3::scoped_range_in<velox_domain> const nvtx3_range__{nvtx3_func_attr__};
+  ::nvtx3::scoped_range_in<VeloxDomain> const nvtx3_range__{nvtx3_func_attr__};
 
 #define VELOX_NVTX_PRETTY_FUNC_RANGE()                                         \
-  static nvtx_registered_string_t const nvtx3_func_name__{                     \
+  static NvtxRegisteredStringT const nvtx3_func_name__{                        \
       std::string(__func__) + " " + std::string(__PRETTY_FUNCTION__)};         \
   static ::nvtx3::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \
-  ::nvtx3::scoped_range_in<velox_domain> const nvtx3_range__{nvtx3_func_attr__};
+  ::nvtx3::scoped_range_in<VeloxDomain> const nvtx3_range__{nvtx3_func_attr__};
 
-#define VELOX_NVTX_FUNC_RANGE() NVTX3_FUNC_RANGE_IN(velox_domain)
+#define VELOX_NVTX_FUNC_RANGE() NVTX3_FUNC_RANGE_IN(VeloxDomain)
 
 } // namespace facebook::velox::cudf_velox
diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp
index 3be5cfe97fd..c64886d76db 100644
--- a/velox/experimental/cudf/exec/ToCudf.cpp
+++ b/velox/experimental/cudf/exec/ToCudf.cpp
@@ -38,14 +38,12 @@ namespace facebook::velox::cudf_velox {
 namespace {
 
 template <class... Deriveds, class Base>
-bool is_any_of(const Base* p) {
+bool isAnyOf(const Base* p) {
   return ((dynamic_cast<const Deriveds*>(p) != nullptr) || ...);
 }
 
 } // namespace
 
-static bool _cudfIsRegistered = false;
-
 bool CompileState::compile() {
   auto operators = driver_.operators();
   auto& nodes = planNodes_;
@@ -54,11 +52,11 @@ bool CompileState::compile() {
   // them during the transformation.
   driver_.initializeOperators();
 
-  bool replacements_made = false;
+  bool replacementsMade = false;
   auto ctx = driver_.driverCtx();
 
   // Get plan node by id lookup.
-  auto get_plan_node = [&](const core::PlanNodeId& id) {
+  auto getPlanNode = [&](const core::PlanNodeId& id) {
     auto it =
         std::find_if(nodes.cbegin(), nodes.cend(), [&id](const auto& node) {
           return node->id() == id;
@@ -67,84 +65,84 @@ bool CompileState::compile() {
     return *it;
   };
 
-  auto is_supported_gpu_operator = [](const exec::Operator* op) {
-    return is_any_of<exec::OrderBy>(op);
+  auto isSupportedGpuOperator = [](const exec::Operator* op) {
+    return isAnyOf<exec::OrderBy>(op);
   };
 
-  std::vector<bool> is_supported_gpu_operators(operators.size());
+  std::vector<bool> isSupportedGpuOperators(operators.size());
   std::transform(
       operators.begin(),
       operators.end(),
-      is_supported_gpu_operators.begin(),
-      is_supported_gpu_operator);
+      isSupportedGpuOperators.begin(),
+      isSupportedGpuOperator);
 
-  auto accepts_gpu_input = [](const exec::Operator* op) {
-    return is_any_of<exec::OrderBy>(op);
+  auto acceptsGpuInput = [](const exec::Operator* op) {
+    return isAnyOf<exec::OrderBy>(op);
   };
 
-  auto produces_gpu_output = [](const exec::Operator* op) {
-    return is_any_of<exec::OrderBy>(op);
+  auto producesGpuOutput = [](const exec::Operator* op) {
+    return isAnyOf<exec::OrderBy>(op);
   };
 
   int32_t operatorsOffset = 0;
   for (int32_t operatorIndex = 0; operatorIndex < operators.size();
        ++operatorIndex) {
-    std::vector<std::unique_ptr<exec::Operator>> replace_op;
+    std::vector<std::unique_ptr<exec::Operator>> replaceOp;
 
     exec::Operator* oper = operators[operatorIndex];
     auto replacingOperatorIndex = operatorIndex + operatorsOffset;
     VELOX_CHECK(oper);
 
-    bool const previous_operator_is_not_gpu =
-        (operatorIndex > 0 and !is_supported_gpu_operators[operatorIndex - 1]);
-    bool const next_operator_is_not_gpu =
+    bool const kPreviousOperatorIsNotGpu =
+        (operatorIndex > 0 and !isSupportedGpuOperators[operatorIndex - 1]);
+    bool const kNextOperatorIsNotGpu =
         (operatorIndex < operators.size() - 1 and
-         !is_supported_gpu_operators[operatorIndex + 1]);
+         !isSupportedGpuOperators[operatorIndex + 1]);
 
     auto id = oper->operatorId();
-    if (previous_operator_is_not_gpu and accepts_gpu_input(oper)) {
-      auto plan_node = get_plan_node(oper->planNodeId());
-      replace_op.push_back(std::make_unique<CudfFromVelox>(
-          id, plan_node->outputType(), ctx, plan_node->id() + "-from-velox"));
-      replace_op.back()->initialize();
+    if (kPreviousOperatorIsNotGpu and acceptsGpuInput(oper)) {
+      auto planNode = getPlanNode(oper->planNodeId());
+      replaceOp.push_back(std::make_unique<CudfFromVelox>(
+          id, planNode->outputType(), ctx, planNode->id() + "-from-velox"));
+      replaceOp.back()->initialize();
     }
 
     if (auto orderByOp = dynamic_cast<exec::OrderBy*>(oper)) {
       auto id = orderByOp->operatorId();
-      auto plan_node = std::dynamic_pointer_cast<const core::OrderByNode>(
-          get_plan_node(orderByOp->planNodeId()));
-      VELOX_CHECK(plan_node != nullptr);
-      replace_op.push_back(std::make_unique<CudfOrderBy>(id, ctx, plan_node));
-      replace_op.back()->initialize();
+      auto planNode = std::dynamic_pointer_cast<const core::OrderByNode>(
+          getPlanNode(orderByOp->planNodeId()));
+      VELOX_CHECK(planNode != nullptr);
+      replaceOp.push_back(std::make_unique<CudfOrderBy>(id, ctx, planNode));
+      replaceOp.back()->initialize();
     }
 
-    if (next_operator_is_not_gpu and produces_gpu_output(oper)) {
-      auto plan_node = get_plan_node(oper->planNodeId());
-      replace_op.push_back(std::make_unique<CudfToVelox>(
-          id, plan_node->outputType(), ctx, plan_node->id() + "-to-velox"));
-      replace_op.back()->initialize();
+    if (kNextOperatorIsNotGpu and producesGpuOutput(oper)) {
+      auto planNode = getPlanNode(oper->planNodeId());
+      replaceOp.push_back(std::make_unique<CudfToVelox>(
+          id, planNode->outputType(), ctx, planNode->id() + "-to-velox"));
+      replaceOp.back()->initialize();
     }
 
-    if (not replace_op.empty()) {
-      operatorsOffset += replace_op.size() - 1;
+    if (not replaceOp.empty()) {
+      operatorsOffset += replaceOp.size() - 1;
       [[maybe_unused]] auto replaced = driverFactory_.replaceOperators(
           driver_,
           replacingOperatorIndex,
           replacingOperatorIndex + 1,
-          std::move(replace_op));
-      replacements_made = true;
+          std::move(replaceOp));
+      replacementsMade = true;
     }
   }
 
-  return replacements_made;
+  return replacementsMade;
 }
 
-struct cudfDriverAdapter {
+struct CudfDriverAdapter {
   std::shared_ptr<rmm::mr::device_memory_resource> mr_;
   std::shared_ptr<std::vector<std::shared_ptr<core::PlanNode const>>>
       planNodes_;
 
-  cudfDriverAdapter(std::shared_ptr<rmm::mr::device_memory_resource> mr)
+  CudfDriverAdapter(std::shared_ptr<rmm::mr::device_memory_resource> mr)
       : mr_(mr) {
     planNodes_ =
         std::make_shared<std::vector<std::shared_ptr<core::PlanNode const>>>();
@@ -178,27 +176,29 @@ struct cudfDriverAdapter {
   }
 };
 
+static bool isCudfRegistered = false;
+
 void registerCudf() {
   if (cudfIsRegistered()) {
     return;
   }
 
-  const char* env_cudf_disabled = std::getenv("VELOX_CUDF_DISABLED");
-  if (env_cudf_disabled != nullptr && std::stoi(env_cudf_disabled)) {
+  const char* envCudfDisabled = std::getenv("VELOX_CUDF_DISABLED");
+  if (envCudfDisabled != nullptr && std::stoi(envCudfDisabled)) {
     return;
   }
 
   CUDF_FUNC_RANGE();
-  cudaFree(0); // Initialize CUDA context at startup
+  cudaFree(nullptr); // Initialize CUDA context at startup
 
-  const char* env_cudf_mr = std::getenv("VELOX_CUDF_MEMORY_RESOURCE");
-  auto mr_mode = env_cudf_mr != nullptr ? env_cudf_mr : "async";
-  auto mr = cudf_velox::create_memory_resource(mr_mode);
+  const char* envCudfMr = std::getenv("VELOX_CUDF_MEMORY_RESOURCE");
+  auto mrMode = envCudfMr != nullptr ? envCudfMr : "async";
+  auto mr = cudf_velox::createMemoryResource(mrMode);
   cudf::set_current_device_resource(mr.get());
-  cudfDriverAdapter cda{mr};
+  CudfDriverAdapter cda{mr};
   exec::DriverAdapter cudfAdapter{"cuDF", cda, cda};
   exec::DriverFactory::registerAdapter(cudfAdapter);
-  _cudfIsRegistered = true;
+  isCudfRegistered = true;
 }
 
 void unregisterCudf() {
@@ -211,11 +211,11 @@ void unregisterCudf() {
           }),
       exec::DriverFactory::adapters.end());
 
-  _cudfIsRegistered = false;
+  isCudfRegistered = false;
 }
 
 bool cudfIsRegistered() {
-  return _cudfIsRegistered;
+  return isCudfRegistered;
 }
 
 } // namespace facebook::velox::cudf_velox
diff --git a/velox/experimental/cudf/exec/Utilities.cpp b/velox/experimental/cudf/exec/Utilities.cpp
index dbf49c65b89..99a597f2e3f 100644
--- a/velox/experimental/cudf/exec/Utilities.cpp
+++ b/velox/experimental/cudf/exec/Utilities.cpp
@@ -39,48 +39,48 @@
 namespace facebook::velox::cudf_velox {
 
 namespace {
-[[nodiscard]] auto make_cuda_mr() {
+[[nodiscard]] auto makeCudaMr() {
   return std::make_shared<rmm::mr::cuda_memory_resource>();
 }
 
-[[nodiscard]] auto make_pool_mr() {
+[[nodiscard]] auto makePoolMr() {
   return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
-      make_cuda_mr(), rmm::percent_of_free_device_memory(50));
+      makeCudaMr(), rmm::percent_of_free_device_memory(50));
 }
 
-[[nodiscard]] auto make_async_mr() {
+[[nodiscard]] auto makeAsyncMr() {
   return std::make_shared<rmm::mr::cuda_async_memory_resource>();
 }
 
-[[nodiscard]] auto make_managed_mr() {
+[[nodiscard]] auto makeManagedMr() {
   return std::make_shared<rmm::mr::managed_memory_resource>();
 }
 
-[[nodiscard]] auto make_arena_mr() {
+[[nodiscard]] auto makeArenaMr() {
   return rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(
-      make_cuda_mr());
+      makeCudaMr());
 }
 
-[[nodiscard]] auto make_managed_pool_mr() {
+[[nodiscard]] auto makeManagedPoolMr() {
   return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
-      make_managed_mr(), rmm::percent_of_free_device_memory(50));
+      makeManagedMr(), rmm::percent_of_free_device_memory(50));
 }
 } // namespace
 
-std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(
+std::shared_ptr<rmm::mr::device_memory_resource> createMemoryResource(
     std::string_view mode) {
   if (mode == "cuda")
-    return make_cuda_mr();
+    return makeCudaMr();
   if (mode == "pool")
-    return make_pool_mr();
+    return makePoolMr();
   if (mode == "async")
-    return make_async_mr();
+    return makeAsyncMr();
   if (mode == "arena")
-    return make_arena_mr();
+    return makeArenaMr();
   if (mode == "managed")
-    return make_managed_mr();
+    return makeManagedMr();
   if (mode == "managed_pool")
-    return make_managed_pool_mr();
+    return makeManagedPoolMr();
   throw cudf::logic_error(
       "Unknown memory resource mode: " + std::string(mode) +
       "\nExpecting: cuda, pool, async, arena, managed, or managed_pool");
diff --git a/velox/experimental/cudf/exec/Utilities.h b/velox/experimental/cudf/exec/Utilities.h
index 1e5912359b5..31f529bb34a 100644
--- a/velox/experimental/cudf/exec/Utilities.h
+++ b/velox/experimental/cudf/exec/Utilities.h
@@ -32,7 +32,7 @@ namespace facebook::velox::cudf_velox {
  * @brief Creates a memory resource based on the given mode.
  */
 [[nodiscard]] std::shared_ptr<rmm::mr::device_memory_resource>
-create_memory_resource(std::string_view mode);
+createMemoryResource(std::string_view mode);
 
 /**
  * @brief Returns the global CUDA stream pool used by cudf.
diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
index bc86939e996..8a2d6a3f46c 100644
--- a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
+++ b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
@@ -55,7 +55,7 @@ namespace facebook::velox::cudf_velox {
 
 namespace with_arrow {
 
-std::unique_ptr<cudf::table> to_cudf_table(
+std::unique_ptr<cudf::table> toCudfTable(
     const facebook::velox::RowVectorPtr& veloxTable,
     facebook::velox::memory::MemoryPool* pool,
     rmm::cuda_stream_view stream) {
@@ -87,7 +87,7 @@ std::unique_ptr<cudf::table> to_cudf_table(
 
 namespace {
 
-void to_signed_int_format(char* format) {
+void toSignedIntFormat(char* format) {
   VELOX_CHECK_NOT_NULL(format);
   switch (format[0]) {
     case 'C':
@@ -112,18 +112,18 @@ void to_signed_int_format(char* format) {
 
 // Changes all unsigned indices to signed indices for dictionary columns from
 // cudf which uses unsigned indices, but velox uses signed indices.
-void fix_dictionary_indices(ArrowSchema& arrowSchema) {
+void fixDictionaryIndices(ArrowSchema& arrowSchema) {
   if (arrowSchema.dictionary != nullptr) {
-    to_signed_int_format(const_cast<char*>(arrowSchema.format));
-    fix_dictionary_indices(*arrowSchema.dictionary);
+    toSignedIntFormat(const_cast<char*>(arrowSchema.format));
+    fixDictionaryIndices(*arrowSchema.dictionary);
   }
   for (size_t i = 0; i < arrowSchema.n_children; ++i) {
     VELOX_CHECK_NOT_NULL(arrowSchema.children[i]);
-    fix_dictionary_indices(*arrowSchema.children[i]);
+    fixDictionaryIndices(*arrowSchema.children[i]);
   }
 }
 
-RowVectorPtr to_velox_column(
+RowVectorPtr toVeloxColumn(
     const cudf::table_view& table,
     memory::MemoryPool* pool,
     const std::vector<cudf::column_metadata>& metadata,
@@ -133,25 +133,25 @@ RowVectorPtr to_velox_column(
 
   auto arrowSchema = cudf::to_arrow_schema(table, metadata);
   // Hack to convert unsigned indices to signed indices for dictionary columns
-  fix_dictionary_indices(*arrowSchema);
+  fixDictionaryIndices(*arrowSchema);
 
   auto veloxTable = importFromArrowAsOwner(*arrowSchema, arrowArray, pool);
   // BaseVector to RowVector
-  auto casted_ptr =
+  auto castedPtr =
       std::dynamic_pointer_cast<facebook::velox::RowVector>(veloxTable);
-  VELOX_CHECK_NOT_NULL(casted_ptr);
-  return casted_ptr;
+  VELOX_CHECK_NOT_NULL(castedPtr);
+  return castedPtr;
 }
 
 template <typename Iterator>
 std::vector<cudf::column_metadata>
-get_metadata(Iterator begin, Iterator end, const std::string& name_prefix) {
+getMetadata(Iterator begin, Iterator end, const std::string& namePrefix) {
   std::vector<cudf::column_metadata> metadata;
   int i = 0;
   for (auto c = begin; c < end; c++) {
-    metadata.push_back(cudf::column_metadata(name_prefix + std::to_string(i)));
-    metadata.back().children_meta = get_metadata(
-        c->child_begin(), c->child_end(), name_prefix + std::to_string(i));
+    metadata.push_back(cudf::column_metadata(namePrefix + std::to_string(i)));
+    metadata.back().children_meta = getMetadata(
+        c->child_begin(), c->child_end(), namePrefix + std::to_string(i));
     i++;
   }
   return metadata;
@@ -159,16 +159,16 @@ get_metadata(Iterator begin, Iterator end, const std::string& name_prefix) {
 
 } // namespace
 
-facebook::velox::RowVectorPtr to_velox_column(
+facebook::velox::RowVectorPtr toVeloxColumn(
     const cudf::table_view& table,
     facebook::velox::memory::MemoryPool* pool,
-    std::string name_prefix,
+    std::string namePrefix,
     rmm::cuda_stream_view stream) {
-  auto metadata = get_metadata(table.begin(), table.end(), name_prefix);
-  return to_velox_column(table, pool, metadata, stream);
+  auto metadata = getMetadata(table.begin(), table.end(), namePrefix);
+  return toVeloxColumn(table, pool, metadata, stream);
 }
 
-RowVectorPtr to_velox_column(
+RowVectorPtr toVeloxColumn(
     const cudf::table_view& table,
     memory::MemoryPool* pool,
     const std::vector<std::string>& columnNames,
@@ -177,7 +177,7 @@ RowVectorPtr to_velox_column(
   for (auto name : columnNames) {
     metadata.emplace_back(cudf::column_metadata(name));
   }
-  return to_velox_column(table, pool, metadata, stream);
+  return toVeloxColumn(table, pool, metadata, stream);
 }
 
 } // namespace with_arrow
diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.h b/velox/experimental/cudf/exec/VeloxCudfInterop.h
index fbb4eda355e..529045245f7 100644
--- a/velox/experimental/cudf/exec/VeloxCudfInterop.h
+++ b/velox/experimental/cudf/exec/VeloxCudfInterop.h
@@ -25,18 +25,18 @@
 #include <cudf/types.hpp>
 
 namespace facebook::velox::cudf_velox::with_arrow {
-std::unique_ptr<cudf::table> to_cudf_table(
+std::unique_ptr<cudf::table> toCudfTable(
     const facebook::velox::RowVectorPtr& veloxTable,
     facebook::velox::memory::MemoryPool* pool,
     rmm::cuda_stream_view stream);
 
-facebook::velox::RowVectorPtr to_velox_column(
+facebook::velox::RowVectorPtr toVeloxColumn(
     const cudf::table_view& table,
     facebook::velox::memory::MemoryPool* pool,
-    std::string name_prefix,
+    std::string namePrefix,
     rmm::cuda_stream_view stream);
 
-facebook::velox::RowVectorPtr to_velox_column(
+facebook::velox::RowVectorPtr toVeloxColumn(
     const cudf::table_view& table,
     facebook::velox::memory::MemoryPool* pool,
     const std::vector<std::string>& columnNames,

From 436d72138e22d7c6c90b418d8e707c552844ca0a Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Fri, 21 Mar 2025 15:47:26 +0000
Subject: [PATCH 20/37] Fix more style

---
 velox/experimental/cudf/exec/CudfConversion.cpp | 12 ++++++------
 velox/experimental/cudf/exec/CudfOrderBy.cpp    |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp
index bfe8ff01b26..e0b557d4c2f 100644
--- a/velox/experimental/cudf/exec/CudfConversion.cpp
+++ b/velox/experimental/cudf/exec/CudfConversion.cpp
@@ -91,12 +91,12 @@ void CudfFromVelox::addInput(RowVectorPtr input) {
 
 RowVectorPtr CudfFromVelox::getOutput() {
   VELOX_NVTX_OPERATOR_FUNC_RANGE();
-  const auto kTargetOutputSize = preferredGpuBatchSizeRows();
-  const auto kExitEarly = finished_ or
-      (currentOutputSize_ < kTargetOutputSize and not noMoreInput_) or
+  const auto targetOutputSize = preferredGpuBatchSizeRows();
+  const auto exitEarly = finished_ or
+      (currentOutputSize_ < targetOutputSize and not noMoreInput_) or
       inputs_.empty();
   finished_ = noMoreInput_;
-  if (kExitEarly) {
+  if (exitEarly) {
     return nullptr;
   }
 
@@ -121,9 +121,9 @@ RowVectorPtr CudfFromVelox::getOutput() {
   VELOX_CHECK_NOT_NULL(tbl);
 
   // Return a CudfVector that owns the cudf table
-  const auto kSize = tbl->num_rows();
+  const auto size = tbl->num_rows();
   return std::make_shared<CudfVector>(
-      input->pool(), outputType_, kSize, std::move(tbl), stream);
+      input->pool(), outputType_, size, std::move(tbl), stream);
 }
 
 void CudfFromVelox::close() {
diff --git a/velox/experimental/cudf/exec/CudfOrderBy.cpp b/velox/experimental/cudf/exec/CudfOrderBy.cpp
index c66b535f82a..061c2037737 100644
--- a/velox/experimental/cudf/exec/CudfOrderBy.cpp
+++ b/velox/experimental/cudf/exec/CudfOrderBy.cpp
@@ -42,12 +42,12 @@ CudfOrderBy::CudfOrderBy(
   columnOrder_.reserve(orderByNode->sortingKeys().size());
   nullOrder_.reserve(orderByNode->sortingKeys().size());
   for (int i = 0; i < orderByNode->sortingKeys().size(); ++i) {
-    const auto kChannel =
+    const auto channel =
         exec::exprToChannel(orderByNode->sortingKeys()[i].get(), outputType_);
     VELOX_CHECK(
-        kChannel != kConstantChannel,
+        channel != kConstantChannel,
         "OrderBy doesn't allow constant sorting keys");
-    sortKeys_.push_back(kChannel);
+    sortKeys_.push_back(channel);
     auto const& sortingOrder = orderByNode->sortingOrders()[i];
     columnOrder_.push_back(
         sortingOrder.isAscending() ? cudf::order::ASCENDING

From 5538888497d94d68acfecd73cae54c81a88a5a76 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Fri, 21 Mar 2025 16:27:35 +0000
Subject: [PATCH 21/37] Error out when cuda architecture is less than 70

---
 CMakeLists.txt | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b93de09121f..b7edb668b3e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -390,6 +390,14 @@ if(VELOX_ENABLE_GPU)
   endif()
   find_package(CUDAToolkit REQUIRED)
   if(VELOX_ENABLE_CUDF)
+    foreach(arch ${CMAKE_CUDA_ARCHITECTURES})
+      if(arch LESS 70)
+        message(
+          FATAL_ERROR
+            "CUDA architecture ${arch} is below 70. CUDF requires Volta (SM 70) or newer GPUs."
+        )
+      endif()
+    endforeach()
     set(VELOX_ENABLE_ARROW ON)
     velox_set_source(cudf)
     velox_resolve_dependency(cudf)

From 71624fae3741cb490f3f0ef6151cf3092829ce56 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Tue, 25 Mar 2025 08:27:44 +0000
Subject: [PATCH 22/37] Misc. review changes - rename std::shared_ptr<PlanNode
 const> to PlanNodePtr - rename kSize -> size - formalize kCudfAdapterName

---
 velox/experimental/cudf/exec/CudfOrderBy.cpp |  4 ++--
 velox/experimental/cudf/exec/ToCudf.cpp      | 10 ++++------
 velox/experimental/cudf/exec/ToCudf.h        |  6 ++++--
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/velox/experimental/cudf/exec/CudfOrderBy.cpp b/velox/experimental/cudf/exec/CudfOrderBy.cpp
index 061c2037737..e1f2a011147 100644
--- a/velox/experimental/cudf/exec/CudfOrderBy.cpp
+++ b/velox/experimental/cudf/exec/CudfOrderBy.cpp
@@ -90,9 +90,9 @@ void CudfOrderBy::noMoreInput() {
   auto values = tbl->view();
   auto result =
       cudf::sort_by_key(values, keys, columnOrder_, nullOrder_, stream);
-  auto const kSize = result->num_rows();
+  auto const size = result->num_rows();
   outputTable_ = std::make_shared<CudfVector>(
-      pool(), outputType_, kSize, std::move(result), stream);
+      pool(), outputType_, size, std::move(result), stream);
 }
 
 RowVectorPtr CudfOrderBy::getOutput() {
diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp
index c64886d76db..bbf652b6876 100644
--- a/velox/experimental/cudf/exec/ToCudf.cpp
+++ b/velox/experimental/cudf/exec/ToCudf.cpp
@@ -139,13 +139,11 @@ bool CompileState::compile() {
 
 struct CudfDriverAdapter {
   std::shared_ptr<rmm::mr::device_memory_resource> mr_;
-  std::shared_ptr<std::vector<std::shared_ptr<core::PlanNode const>>>
-      planNodes_;
+  std::shared_ptr<std::vector<core::PlanNodePtr>> planNodes_;
 
   CudfDriverAdapter(std::shared_ptr<rmm::mr::device_memory_resource> mr)
       : mr_(mr) {
-    planNodes_ =
-        std::make_shared<std::vector<std::shared_ptr<core::PlanNode const>>>();
+    planNodes_ = std::make_shared<std::vector<core::PlanNodePtr>>();
   }
 
   // Call operator needed by DriverAdapter
@@ -196,7 +194,7 @@ void registerCudf() {
   auto mr = cudf_velox::createMemoryResource(mrMode);
   cudf::set_current_device_resource(mr.get());
   CudfDriverAdapter cda{mr};
-  exec::DriverAdapter cudfAdapter{"cuDF", cda, cda};
+  exec::DriverAdapter cudfAdapter{kCudfAdapterName, cda, cda};
   exec::DriverFactory::registerAdapter(cudfAdapter);
   isCudfRegistered = true;
 }
@@ -207,7 +205,7 @@ void unregisterCudf() {
           exec::DriverFactory::adapters.begin(),
           exec::DriverFactory::adapters.end(),
           [](const exec::DriverAdapter& adapter) {
-            return adapter.label == "cuDF";
+            return adapter.label == kCudfAdapterName;
           }),
       exec::DriverFactory::adapters.end());
 
diff --git a/velox/experimental/cudf/exec/ToCudf.h b/velox/experimental/cudf/exec/ToCudf.h
index 8da0eba26ae..f19dd0d107f 100644
--- a/velox/experimental/cudf/exec/ToCudf.h
+++ b/velox/experimental/cudf/exec/ToCudf.h
@@ -21,12 +21,14 @@
 
 namespace facebook::velox::cudf_velox {
 
+static const std::string kCudfAdapterName = "cuDF";
+
 class CompileState {
  public:
   CompileState(
       const exec::DriverFactory& driverFactory,
       exec::Driver& driver,
-      std::vector<std::shared_ptr<core::PlanNode const>>& planNodes)
+      std::vector<core::PlanNodePtr>& planNodes)
       : driverFactory_(driverFactory), driver_(driver), planNodes_(planNodes) {}
 
   exec::Driver& driver() {
@@ -39,7 +41,7 @@ class CompileState {
 
   const exec::DriverFactory& driverFactory_;
   exec::Driver& driver_;
-  const std::vector<std::shared_ptr<core::PlanNode const>>& planNodes_;
+  const std::vector<core::PlanNodePtr>& planNodes_;
 };
 
 /// Registers adapter to add cuDF operators to Drivers.

From 6ccdb579b57298aaa5de1b093ff63fa51e70639b Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Tue, 25 Mar 2025 08:38:44 +0000
Subject: [PATCH 23/37] Misc review changes

---
 .../experimental/cudf/exec/CudfConversion.cpp | 20 +++++++++----------
 velox/experimental/cudf/exec/ToCudf.cpp       |  2 +-
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp
index e0b557d4c2f..3de24db7896 100644
--- a/velox/experimental/cudf/exec/CudfConversion.cpp
+++ b/velox/experimental/cudf/exec/CudfConversion.cpp
@@ -74,18 +74,16 @@ CudfFromVelox::CudfFromVelox(
 
 void CudfFromVelox::addInput(RowVectorPtr input) {
   VELOX_NVTX_OPERATOR_FUNC_RANGE();
-  if (input != nullptr) {
-    if (input->size() > 0) {
-      // Materialize lazy vectors
-      for (auto& child : input->children()) {
-        child->loadedVector();
-      }
-      input->loadedVector();
-
-      // Accumulate inputs
-      inputs_.push_back(input);
-      currentOutputSize_ += input->size();
+  if (input->size() > 0) {
+    // Materialize lazy vectors
+    for (auto& child : input->children()) {
+      child->loadedVector();
     }
+    input->loadedVector();
+
+    // Accumulate inputs
+    inputs_.push_back(input);
+    currentOutputSize_ += input->size();
   }
 }
 
diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp
index bbf652b6876..1ed61296639 100644
--- a/velox/experimental/cudf/exec/ToCudf.cpp
+++ b/velox/experimental/cudf/exec/ToCudf.cpp
@@ -155,7 +155,7 @@ struct CudfDriverAdapter {
   }
 
   // Iterate recursively and store them in the planNodes_.
-  void storePlanNodes(const std::shared_ptr<const core::PlanNode>& planNode) {
+  void storePlanNodes(const core::PlanNodePtr& planNode) {
     const auto& sources = planNode->sources();
     for (int32_t i = 0; i < sources.size(); ++i) {
       storePlanNodes(sources[i]);

From ab399f22c8393e3e9a99a60c46fd94e8f2edbb88 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Tue, 25 Mar 2025 12:03:08 +0000
Subject: [PATCH 24/37] Prevent merging vectors whose total size exceeds
 vector_size_t max

---
 .../experimental/cudf/exec/CudfConversion.cpp | 46 ++++++++++++++-----
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp
index 3de24db7896..f86589611cb 100644
--- a/velox/experimental/cudf/exec/CudfConversion.cpp
+++ b/velox/experimental/cudf/exec/CudfConversion.cpp
@@ -36,7 +36,7 @@ RowVectorPtr mergeRowVectors(
     const std::vector<RowVectorPtr>& results,
     velox::memory::MemoryPool* pool) {
   VELOX_NVTX_FUNC_RANGE();
-  auto totalCount = 0;
+  vector_size_t totalCount = 0;
   for (const auto& result : results) {
     totalCount += result->size();
   }
@@ -54,8 +54,15 @@ cudf::size_type preferredGpuBatchSizeRows() {
   constexpr cudf::size_type kDefaultGpuBatchSizeRows = 100000;
   const char* envCudfGpuBatchSizeRows =
       std::getenv("VELOX_CUDF_GPU_BATCH_SIZE_ROWS");
-  return envCudfGpuBatchSizeRows != nullptr ? std::stoi(envCudfGpuBatchSizeRows)
-                                            : kDefaultGpuBatchSizeRows;
+  const auto batchSize = envCudfGpuBatchSizeRows != nullptr
+      ? std::stoll(envCudfGpuBatchSizeRows)
+      : kDefaultGpuBatchSizeRows;
+  VELOX_CHECK_GT(batchSize, 0, "VELOX_CUDF_GPU_BATCH_SIZE_ROWS must be > 0");
+  VELOX_CHECK_LE(
+      batchSize,
+      std::numeric_limits<vector_size_t>::max(),
+      "VELOX_CUDF_GPU_BATCH_SIZE_ROWS must be <= max(vector_size_t)");
+  return batchSize;
 }
 } // namespace
 
@@ -90,18 +97,35 @@ void CudfFromVelox::addInput(RowVectorPtr input) {
 RowVectorPtr CudfFromVelox::getOutput() {
   VELOX_NVTX_OPERATOR_FUNC_RANGE();
   const auto targetOutputSize = preferredGpuBatchSizeRows();
-  const auto exitEarly = finished_ or
+
+  finished_ = noMoreInput_ && inputs_.empty();
+
+  if (finished_ or
       (currentOutputSize_ < targetOutputSize and not noMoreInput_) or
-      inputs_.empty();
-  finished_ = noMoreInput_;
-  if (exitEarly) {
+      inputs_.empty()) {
     return nullptr;
   }
 
-  // Combine all input RowVectors into a single RowVector and clear inputs
-  auto input = mergeRowVectors(inputs_, inputs_[0]->pool());
-  inputs_.clear();
-  currentOutputSize_ = 0;
+  // Select inputs that don't exceed the max vector size limit
+  std::vector<RowVectorPtr> selectedInputs;
+  vector_size_t totalSize = 0;
+  auto const maxVectorSize = std::numeric_limits<vector_size_t>::max();
+
+  for (const auto& input : inputs_) {
+    if (totalSize + input->size() <= maxVectorSize) {
+      selectedInputs.push_back(input);
+      totalSize += input->size();
+    } else {
+      break;
+    }
+  }
+
+  // Combine selected RowVectors into a single RowVector
+  auto input = mergeRowVectors(selectedInputs, inputs_[0]->pool());
+
+  // Remove processed inputs
+  inputs_.erase(inputs_.begin(), inputs_.begin() + selectedInputs.size());
+  currentOutputSize_ -= totalSize;
 
   // Early return if no input
   if (input->size() == 0) {

From 327f7180239487e719b43a8dd2e07129abcae3c1 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Tue, 25 Mar 2025 13:08:26 +0000
Subject: [PATCH 25/37] Misc review changes - Removing missed kConstant changes
 when not applicable - auto* instead of auto

---
 velox/experimental/cudf/exec/ToCudf.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp
index 1ed61296639..60224a71750 100644
--- a/velox/experimental/cudf/exec/ToCudf.cpp
+++ b/velox/experimental/cudf/exec/ToCudf.cpp
@@ -93,21 +93,21 @@ bool CompileState::compile() {
     auto replacingOperatorIndex = operatorIndex + operatorsOffset;
     VELOX_CHECK(oper);
 
-    bool const kPreviousOperatorIsNotGpu =
+    const bool previousOperatorIsNotGpu =
         (operatorIndex > 0 and !isSupportedGpuOperators[operatorIndex - 1]);
-    bool const kNextOperatorIsNotGpu =
+    const bool nextOperatorIsNotGpu =
         (operatorIndex < operators.size() - 1 and
          !isSupportedGpuOperators[operatorIndex + 1]);
 
     auto id = oper->operatorId();
-    if (kPreviousOperatorIsNotGpu and acceptsGpuInput(oper)) {
+    if (previousOperatorIsNotGpu and acceptsGpuInput(oper)) {
       auto planNode = getPlanNode(oper->planNodeId());
       replaceOp.push_back(std::make_unique<CudfFromVelox>(
           id, planNode->outputType(), ctx, planNode->id() + "-from-velox"));
       replaceOp.back()->initialize();
     }
 
-    if (auto orderByOp = dynamic_cast<exec::OrderBy*>(oper)) {
+    if (auto* orderByOp = dynamic_cast<exec::OrderBy*>(oper)) {
       auto id = orderByOp->operatorId();
       auto planNode = std::dynamic_pointer_cast<const core::OrderByNode>(
           getPlanNode(orderByOp->planNodeId()));
@@ -116,7 +116,7 @@ bool CompileState::compile() {
       replaceOp.back()->initialize();
     }
 
-    if (kNextOperatorIsNotGpu and producesGpuOutput(oper)) {
+    if (nextOperatorIsNotGpu and producesGpuOutput(oper)) {
       auto planNode = getPlanNode(oper->planNodeId());
       replaceOp.push_back(std::make_unique<CudfToVelox>(
           id, planNode->outputType(), ctx, planNode->id() + "-to-velox"));

From 5754b3c6db5372553330476e629d07abd7e04a71 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 26 Mar 2025 01:43:44 -0500
Subject: [PATCH 26/37] replace env variable with gflags and CudfOptions

---
 velox/experimental/cudf/exec/ToCudf.cpp | 12 ++++++------
 velox/experimental/cudf/exec/ToCudf.h   | 15 ++++++++++++++-
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp
index 60224a71750..85cefa4afaf 100644
--- a/velox/experimental/cudf/exec/ToCudf.cpp
+++ b/velox/experimental/cudf/exec/ToCudf.cpp
@@ -33,6 +33,9 @@
 
 #include <iostream>
 
+DEFINE_bool(velox_cudf_enabled, true, "Enable cuDF-Velox acceleration");
+DEFINE_string(velox_cudf_memory_resource, "async", "Memory resource for cuDF");
+
 namespace facebook::velox::cudf_velox {
 
 namespace {
@@ -176,21 +179,18 @@ struct CudfDriverAdapter {
 
 static bool isCudfRegistered = false;
 
-void registerCudf() {
+void registerCudf(const CudfOptions& options) {
   if (cudfIsRegistered()) {
     return;
   }
-
-  const char* envCudfDisabled = std::getenv("VELOX_CUDF_DISABLED");
-  if (envCudfDisabled != nullptr && std::stoi(envCudfDisabled)) {
+  if (!options.cudfEnabled) {
     return;
   }
 
   CUDF_FUNC_RANGE();
   cudaFree(nullptr); // Initialize CUDA context at startup
 
-  const char* envCudfMr = std::getenv("VELOX_CUDF_MEMORY_RESOURCE");
-  auto mrMode = envCudfMr != nullptr ? envCudfMr : "async";
+  const std::string mrMode = options.cudfMemoryResource;
   auto mr = cudf_velox::createMemoryResource(mrMode);
   cudf::set_current_device_resource(mr.get());
   CudfDriverAdapter cda{mr};
diff --git a/velox/experimental/cudf/exec/ToCudf.h b/velox/experimental/cudf/exec/ToCudf.h
index f19dd0d107f..0f45354d8fa 100644
--- a/velox/experimental/cudf/exec/ToCudf.h
+++ b/velox/experimental/cudf/exec/ToCudf.h
@@ -19,6 +19,11 @@
 #include "velox/exec/Driver.h"
 #include "velox/exec/Operator.h"
 
+#include <gflags/gflags.h>
+
+DECLARE_bool(velox_cudf_enabled);
+DECLARE_string(velox_cudf_memory_resource);
+
 namespace facebook::velox::cudf_velox {
 
 static const std::string kCudfAdapterName = "cuDF";
@@ -44,8 +49,16 @@ class CompileState {
   const std::vector<core::PlanNodePtr>& planNodes_;
 };
 
+struct CudfOptions {
+  bool cudfEnabled = FLAGS_velox_cudf_enabled;
+  std::string cudfMemoryResource = FLAGS_velox_cudf_memory_resource;
+  static CudfOptions defaultOptions() {
+    return CudfOptions();
+  }
+};
+
 /// Registers adapter to add cuDF operators to Drivers.
-void registerCudf();
+void registerCudf(const CudfOptions& options = CudfOptions::defaultOptions());
 void unregisterCudf();
 
 /// Returns true if cuDF is registered.

From 538398e6f522a3cee0384347fe309737ca16a491 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 26 Mar 2025 01:59:45 -0500
Subject: [PATCH 27/37] replace gpu batch size env variable with a QueryConfig
 entry

---
 velox/experimental/cudf/exec/CudfConversion.cpp | 13 ++++++-------
 velox/experimental/cudf/exec/CudfConversion.h   |  2 ++
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp
index f86589611cb..b84dc98f183 100644
--- a/velox/experimental/cudf/exec/CudfConversion.cpp
+++ b/velox/experimental/cudf/exec/CudfConversion.cpp
@@ -50,13 +50,11 @@ RowVectorPtr mergeRowVectors(
   return copy;
 }
 
-cudf::size_type preferredGpuBatchSizeRows() {
+cudf::size_type preferredGpuBatchSizeRows(
+    const facebook::velox::core::QueryConfig& queryConfig) {
   constexpr cudf::size_type kDefaultGpuBatchSizeRows = 100000;
-  const char* envCudfGpuBatchSizeRows =
-      std::getenv("VELOX_CUDF_GPU_BATCH_SIZE_ROWS");
-  const auto batchSize = envCudfGpuBatchSizeRows != nullptr
-      ? std::stoll(envCudfGpuBatchSizeRows)
-      : kDefaultGpuBatchSizeRows;
+  const auto batchSize = queryConfig.get<int32_t>(
+      CudfFromVelox::kGpuBatchSizeRows, kDefaultGpuBatchSizeRows);
   VELOX_CHECK_GT(batchSize, 0, "VELOX_CUDF_GPU_BATCH_SIZE_ROWS must be > 0");
   VELOX_CHECK_LE(
       batchSize,
@@ -96,7 +94,8 @@ void CudfFromVelox::addInput(RowVectorPtr input) {
 
 RowVectorPtr CudfFromVelox::getOutput() {
   VELOX_NVTX_OPERATOR_FUNC_RANGE();
-  const auto targetOutputSize = preferredGpuBatchSizeRows();
+  const auto targetOutputSize =
+      preferredGpuBatchSizeRows(operatorCtx_->driverCtx()->queryConfig());
 
   finished_ = noMoreInput_ && inputs_.empty();
 
diff --git a/velox/experimental/cudf/exec/CudfConversion.h b/velox/experimental/cudf/exec/CudfConversion.h
index 16259aa2f8f..8d649d77328 100644
--- a/velox/experimental/cudf/exec/CudfConversion.h
+++ b/velox/experimental/cudf/exec/CudfConversion.h
@@ -33,6 +33,8 @@ namespace facebook::velox::cudf_velox {
 
 class CudfFromVelox : public exec::Operator, public NvtxHelper {
  public:
+  static constexpr const char* kGpuBatchSizeRows = "velox.cudf.gpu_batch_size_rows";
+
   CudfFromVelox(
       int32_t operatorId,
       RowTypePtr outputType,

From 7570d56d06d70f7211de382161b7ae6d7f19be14 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Wed, 26 Mar 2025 08:05:35 +0000
Subject: [PATCH 28/37] Add back optional debug printing of plans

---
 velox/experimental/cudf/exec/ToCudf.cpp | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp
index 85cefa4afaf..8edc9dd6301 100644
--- a/velox/experimental/cudf/exec/ToCudf.cpp
+++ b/velox/experimental/cudf/exec/ToCudf.cpp
@@ -35,6 +35,7 @@
 
 DEFINE_bool(velox_cudf_enabled, true, "Enable cuDF-Velox acceleration");
 DEFINE_string(velox_cudf_memory_resource, "async", "Memory resource for cuDF");
+DEFINE_bool(velox_cudf_debug, false, "Enable debug printing");
 
 namespace facebook::velox::cudf_velox {
 
@@ -51,6 +52,15 @@ bool CompileState::compile() {
   auto operators = driver_.operators();
   auto& nodes = planNodes_;
 
+  if (FLAGS_velox_cudf_debug) {
+    std::cout << "Operators before adapting for cuDF:" << std::endl;
+    std::cout << "Number of operators: " << operators.size() << std::endl;
+    for (auto& op : operators) {
+      std::cout << "  Operator: ID " << op->operatorId() << ": "
+                << op->toString() << std::endl;
+    }
+  }
+
   // Make sure operator states are initialized.  We will need to inspect some of
   // them during the transformation.
   driver_.initializeOperators();
@@ -137,6 +147,16 @@ bool CompileState::compile() {
     }
   }
 
+  if (FLAGS_velox_cudf_debug) {
+    std::cout << "Operators after adapting for cuDF:" << std::endl;
+    operators = driver_.operators();
+    std::cout << "Number of new operators: " << operators.size() << std::endl;
+    for (auto& op : operators) {
+      std::cout << "  Operator: ID " << op->operatorId() << ": "
+                << op->toString() << std::endl;
+    }
+  }
+
   return replacementsMade;
 }
 

From e6333715e8a61ca31749715a91ac3288096823d0 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Wed, 26 Mar 2025 19:20:32 +0000
Subject: [PATCH 29/37] Misc review changes

---
 velox/experimental/cudf/exec/CudfConversion.h     | 3 ++-
 velox/experimental/cudf/exec/ToCudf.cpp           | 8 ++++----
 velox/experimental/cudf/exec/Utilities.cpp        | 2 +-
 velox/experimental/cudf/exec/VeloxCudfInterop.cpp | 5 ++---
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/velox/experimental/cudf/exec/CudfConversion.h b/velox/experimental/cudf/exec/CudfConversion.h
index 8d649d77328..16ca33d786c 100644
--- a/velox/experimental/cudf/exec/CudfConversion.h
+++ b/velox/experimental/cudf/exec/CudfConversion.h
@@ -33,7 +33,8 @@ namespace facebook::velox::cudf_velox {
 
 class CudfFromVelox : public exec::Operator, public NvtxHelper {
  public:
-  static constexpr const char* kGpuBatchSizeRows = "velox.cudf.gpu_batch_size_rows";
+  static constexpr const char* kGpuBatchSizeRows =
+      "velox.cudf.gpu_batch_size_rows";
 
   CudfFromVelox(
       int32_t operatorId,
diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp
index 8edc9dd6301..3768ae53be7 100644
--- a/velox/experimental/cudf/exec/ToCudf.cpp
+++ b/velox/experimental/cudf/exec/ToCudf.cpp
@@ -53,8 +53,8 @@ bool CompileState::compile() {
   auto& nodes = planNodes_;
 
   if (FLAGS_velox_cudf_debug) {
-    std::cout << "Operators before adapting for cuDF:" << std::endl;
-    std::cout << "Number of operators: " << operators.size() << std::endl;
+    std::cout << "Operators before adapting for cuDF: count ["
+              << operators.size() << "]" << std::endl;
     for (auto& op : operators) {
       std::cout << "  Operator: ID " << op->operatorId() << ": "
                 << op->toString() << std::endl;
@@ -148,9 +148,9 @@ bool CompileState::compile() {
   }
 
   if (FLAGS_velox_cudf_debug) {
-    std::cout << "Operators after adapting for cuDF:" << std::endl;
     operators = driver_.operators();
-    std::cout << "Number of new operators: " << operators.size() << std::endl;
+    std::cout << "Operators after adapting for cuDF: count ["
+              << operators.size() << "]" << std::endl;
     for (auto& op : operators) {
       std::cout << "  Operator: ID " << op->operatorId() << ": "
                 << op->toString() << std::endl;
diff --git a/velox/experimental/cudf/exec/Utilities.cpp b/velox/experimental/cudf/exec/Utilities.cpp
index 99a597f2e3f..f861724ee63 100644
--- a/velox/experimental/cudf/exec/Utilities.cpp
+++ b/velox/experimental/cudf/exec/Utilities.cpp
@@ -81,7 +81,7 @@ std::shared_ptr<rmm::mr::device_memory_resource> createMemoryResource(
     return makeManagedMr();
   if (mode == "managed_pool")
     return makeManagedPoolMr();
-  throw cudf::logic_error(
+  VELOX_FAIL(
       "Unknown memory resource mode: " + std::string(mode) +
       "\nExpecting: cuda, pool, async, arena, managed, or managed_pool");
 }
diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
index 8a2d6a3f46c..fe1dd8b25c1 100644
--- a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
+++ b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
@@ -105,9 +105,8 @@ void toSignedIntFormat(char* format) {
     default:
       return;
   }
-  printf(
-      "Warning: arrowSchema.format: %s, unsigned is treated as signed indices\n",
-      format);
+  LOG(WARNING) << "arrowSchema.format: " << format
+               << ", unsigned is treated as signed indices";
 }
 
 // Changes all unsigned indices to signed indices for dictionary columns from

From 86738ed23d0b6a3f64ace685e49d9c7358fedb32 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Mon, 31 Mar 2025 18:24:52 +0000
Subject: [PATCH 30/37] Add clang tidy

---
 velox/experimental/cudf/.clang-tidy | 54 +++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 velox/experimental/cudf/.clang-tidy

diff --git a/velox/experimental/cudf/.clang-tidy b/velox/experimental/cudf/.clang-tidy
new file mode 100644
index 00000000000..ec5b4e40784
--- /dev/null
+++ b/velox/experimental/cudf/.clang-tidy
@@ -0,0 +1,54 @@
+---
+Checks: >
+  readability-identifier-naming,
+  modernize-use-nullptr,
+  modernize-use-using
+
+HeaderFilterRegex: '.*'
+
+WarningsAsErrors: ''
+
+CheckOptions:
+  # Naming conventions as explicitly stated in CODING_STYLE.md
+  - key: readability-identifier-naming.ClassCase
+    value: CamelCase
+  - key: readability-identifier-naming.StructCase
+    value: CamelCase
+  - key: readability-identifier-naming.EnumCase
+    value: CamelCase
+  - key: readability-identifier-naming.TypeAliasCase
+    value: CamelCase
+  - key: readability-identifier-naming.TypeTemplateParameterCase
+    value: CamelCase
+  - key: readability-identifier-naming.FunctionCase
+    value: camelBack
+  - key: readability-identifier-naming.VariableCase
+    value: camelBack
+  - key: readability-identifier-naming.ParameterCase
+    value: camelBack
+  - key: readability-identifier-naming.PrivateMemberCase
+    value: camelBack
+  - key: readability-identifier-naming.PrivateMemberSuffix
+    value: _
+  - key: readability-identifier-naming.ProtectedMemberCase
+    value: camelBack
+  - key: readability-identifier-naming.ProtectedMemberSuffix
+    value: _
+  - key: readability-identifier-naming.MacroDefinitionCase
+    value: UPPER_CASE
+  - key: readability-identifier-naming.NamespaceCase
+    value: lower_case
+  - key: readability-identifier-naming.StaticConstantPrefix
+    value: k
+  - key: readability-identifier-naming.EnumConstantCase
+    value: CamelCase
+  - key: readability-identifier-naming.EnumConstantPrefix
+    value: k
+    
+  # Use nullptr instead of NULL or 0
+  - key: modernize-use-nullptr.NullMacros
+    value: 'NULL'
+    
+  # Prefer enum class over enum
+  - key: modernize-use-using.IgnoreUsingStdAllocator
+    value: 1 
\ No newline at end of file

From 7b18528e7eb71b9df087d23a7e125524d916ce5f Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Mon, 31 Mar 2025 18:46:47 +0000
Subject: [PATCH 31/37] remove aacidental flags added to all of velox

---
 CMake/resolve_dependency_modules/cudf.cmake | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/CMake/resolve_dependency_modules/cudf.cmake b/CMake/resolve_dependency_modules/cudf.cmake
index 4c9d015dbf1..e300a87adf3 100644
--- a/CMake/resolve_dependency_modules/cudf.cmake
+++ b/CMake/resolve_dependency_modules/cudf.cmake
@@ -53,13 +53,6 @@ set(BUILD_TESTS OFF)
 set(CUDF_BUILD_TESTUTIL OFF)
 set(BUILD_SHARED_LIBS ON)
 
-# cudf sets all warnings as errors, and therefore fails to compile with velox
-# expanded set of warnings. We selectively disable problematic warnings just for
-# cudf
-string(
-  APPEND CMAKE_CXX_FLAGS
-  " -Wno-non-virtual-dtor -Wno-missing-field-initializers -Wno-deprecated-copy")
-
 FetchContent_Declare(
   rapids-cmake
   URL ${VELOX_rapids_cmake_SOURCE_URL}
@@ -87,5 +80,13 @@ FetchContent_Declare(
   UPDATE_DISCONNECTED 1)
 
 FetchContent_MakeAvailable(cudf)
+
+# cudf sets all warnings as errors, and therefore fails to compile with velox
+# expanded set of warnings. We selectively disable problematic warnings just for
+# cudf
+target_compile_options(
+  cudf PRIVATE -Wno-non-virtual-dtor -Wno-missing-field-initializers
+               -Wno-deprecated-copy)
+
 unset(BUILD_SHARED_LIBS)
 endblock()

From 7f723dbedf4016dd53d033b9b264e68a4eb19a64 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Mon, 7 Apr 2025 11:21:22 +0000
Subject: [PATCH 32/37] cmake min required

---
 CMake/resolve_dependency_modules/cudf.cmake | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CMake/resolve_dependency_modules/cudf.cmake b/CMake/resolve_dependency_modules/cudf.cmake
index e300a87adf3..56f7b9bf1d1 100644
--- a/CMake/resolve_dependency_modules/cudf.cmake
+++ b/CMake/resolve_dependency_modules/cudf.cmake
@@ -14,6 +14,9 @@
 
 include_guard(GLOBAL)
 
+# 3.30.4 is the minimum version required by cudf
+cmake_minimum_required(VERSION 3.30.4)
+
 set(VELOX_rapids_cmake_VERSION 25.04)
 set(VELOX_rapids_cmake_BUILD_SHA256_CHECKSUM
     458c14eaff9000067b32d65c8c914f4521090ede7690e16eb57035ce731386db)

From bee2494bd8bbf4819f8282d0cf0888aec7d0a4dd Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Mon, 7 Apr 2025 14:45:06 +0000
Subject: [PATCH 33/37] Make sure last operator from task produces velox
 RowVector

---
 velox/experimental/cudf/exec/ToCudf.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp
index 3768ae53be7..98f91ab7cd3 100644
--- a/velox/experimental/cudf/exec/ToCudf.cpp
+++ b/velox/experimental/cudf/exec/ToCudf.cpp
@@ -111,6 +111,7 @@ bool CompileState::compile() {
     const bool nextOperatorIsNotGpu =
         (operatorIndex < operators.size() - 1 and
          !isSupportedGpuOperators[operatorIndex + 1]);
+    const bool isLastOperatorOfTask = oper->planNodeId() == nodes.back()->id();
 
     auto id = oper->operatorId();
     if (previousOperatorIsNotGpu and acceptsGpuInput(oper)) {
@@ -129,7 +130,8 @@ bool CompileState::compile() {
       replaceOp.back()->initialize();
     }
 
-    if (nextOperatorIsNotGpu and producesGpuOutput(oper)) {
+    if (producesGpuOutput(oper) and
+        (isLastOperatorOfTask or nextOperatorIsNotGpu)) {
       auto planNode = getPlanNode(oper->planNodeId());
       replaceOp.push_back(std::make_unique<CudfToVelox>(
           id, planNode->outputType(), ctx, planNode->id() + "-to-velox"));

From 55752056c9b9f1313c9328ef7c134ec2a00382ad Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Tue, 8 Apr 2025 12:56:29 +0000
Subject: [PATCH 34/37] Cudf driver adapter without storing plan nodes

---
 velox/experimental/cudf/exec/ToCudf.cpp | 42 ++++++-------------------
 velox/experimental/cudf/exec/ToCudf.h   |  9 ++----
 2 files changed, 13 insertions(+), 38 deletions(-)

diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp
index 98f91ab7cd3..46219012eb1 100644
--- a/velox/experimental/cudf/exec/ToCudf.cpp
+++ b/velox/experimental/cudf/exec/ToCudf.cpp
@@ -50,7 +50,6 @@ bool isAnyOf(const Base* p) {
 
 bool CompileState::compile() {
   auto operators = driver_.operators();
-  auto& nodes = planNodes_;
 
   if (FLAGS_velox_cudf_debug) {
     std::cout << "Operators before adapting for cuDF: count ["
@@ -70,12 +69,16 @@ bool CompileState::compile() {
 
   // Get plan node by id lookup.
   auto getPlanNode = [&](const core::PlanNodeId& id) {
+    auto& nodes = driverFactory_.planNodes;
     auto it =
         std::find_if(nodes.cbegin(), nodes.cend(), [&id](const auto& node) {
           return node->id() == id;
         });
-    VELOX_CHECK(it != nodes.end());
-    return *it;
+    if (it != nodes.end()) {
+      return *it;
+    }
+    VELOX_CHECK(driverFactory_.consumerNode->id() == id);
+    return driverFactory_.consumerNode;
   };
 
   auto isSupportedGpuOperator = [](const exec::Operator* op) {
@@ -111,7 +114,6 @@ bool CompileState::compile() {
     const bool nextOperatorIsNotGpu =
         (operatorIndex < operators.size() - 1 and
          !isSupportedGpuOperators[operatorIndex + 1]);
-    const bool isLastOperatorOfTask = oper->planNodeId() == nodes.back()->id();
 
     auto id = oper->operatorId();
     if (previousOperatorIsNotGpu and acceptsGpuInput(oper)) {
@@ -130,8 +132,7 @@ bool CompileState::compile() {
       replaceOp.back()->initialize();
     }
 
-    if (producesGpuOutput(oper) and
-        (isLastOperatorOfTask or nextOperatorIsNotGpu)) {
+    if (producesGpuOutput(oper) and nextOperatorIsNotGpu) {
       auto planNode = getPlanNode(oper->planNodeId());
       replaceOp.push_back(std::make_unique<CudfToVelox>(
           id, planNode->outputType(), ctx, planNode->id() + "-to-velox"));
@@ -164,39 +165,16 @@ bool CompileState::compile() {
 
 struct CudfDriverAdapter {
   std::shared_ptr<rmm::mr::device_memory_resource> mr_;
-  std::shared_ptr<std::vector<core::PlanNodePtr>> planNodes_;
 
   CudfDriverAdapter(std::shared_ptr<rmm::mr::device_memory_resource> mr)
-      : mr_(mr) {
-    planNodes_ = std::make_shared<std::vector<core::PlanNodePtr>>();
-  }
+      : mr_(mr) {}
 
   // Call operator needed by DriverAdapter
   bool operator()(const exec::DriverFactory& factory, exec::Driver& driver) {
-    auto state = CompileState(factory, driver, *planNodes_);
-    // Stored planNodes_ from inspect.
+    auto state = CompileState(factory, driver);
     auto res = state.compile();
     return res;
   }
-
-  // Iterate recursively and store them in the planNodes_.
-  void storePlanNodes(const core::PlanNodePtr& planNode) {
-    const auto& sources = planNode->sources();
-    for (int32_t i = 0; i < sources.size(); ++i) {
-      storePlanNodes(sources[i]);
-    }
-    planNodes_->push_back(planNode);
-  }
-
-  // Call operator needed by plan inspection
-  void operator()(const core::PlanFragment& planFragment) {
-    // signature: std::function<void(const core::PlanFragment&)> inspect;
-    // call: adapter.inspect(planFragment);
-    planNodes_->clear();
-    if (planNodes_) {
-      storePlanNodes(planFragment.planNode);
-    }
-  }
 };
 
 static bool isCudfRegistered = false;
@@ -216,7 +194,7 @@ void registerCudf(const CudfOptions& options) {
   auto mr = cudf_velox::createMemoryResource(mrMode);
   cudf::set_current_device_resource(mr.get());
   CudfDriverAdapter cda{mr};
-  exec::DriverAdapter cudfAdapter{kCudfAdapterName, cda, cda};
+  exec::DriverAdapter cudfAdapter{kCudfAdapterName, {}, cda};
   exec::DriverFactory::registerAdapter(cudfAdapter);
   isCudfRegistered = true;
 }
diff --git a/velox/experimental/cudf/exec/ToCudf.h b/velox/experimental/cudf/exec/ToCudf.h
index 0f45354d8fa..63fcf0d5dd7 100644
--- a/velox/experimental/cudf/exec/ToCudf.h
+++ b/velox/experimental/cudf/exec/ToCudf.h
@@ -23,6 +23,7 @@
 
 DECLARE_bool(velox_cudf_enabled);
 DECLARE_string(velox_cudf_memory_resource);
+DECLARE_bool(velox_cudf_debug);
 
 namespace facebook::velox::cudf_velox {
 
@@ -30,11 +31,8 @@ static const std::string kCudfAdapterName = "cuDF";
 
 class CompileState {
  public:
-  CompileState(
-      const exec::DriverFactory& driverFactory,
-      exec::Driver& driver,
-      std::vector<core::PlanNodePtr>& planNodes)
-      : driverFactory_(driverFactory), driver_(driver), planNodes_(planNodes) {}
+  CompileState(const exec::DriverFactory& driverFactory, exec::Driver& driver)
+      : driverFactory_(driverFactory), driver_(driver) {}
 
   exec::Driver& driver() {
     return driver_;
@@ -46,7 +44,6 @@ class CompileState {
 
   const exec::DriverFactory& driverFactory_;
   exec::Driver& driver_;
-  const std::vector<core::PlanNodePtr>& planNodes_;
 };
 
 struct CudfOptions {

From 01f1aaf76df6d384bc025d13fd7095b342843f5a Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Tue, 8 Apr 2025 20:01:48 +0000
Subject: [PATCH 35/37] re-fix conversion to RowVector in sink

---
 velox/experimental/cudf/exec/ToCudf.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp
index 46219012eb1..55fb8a27849 100644
--- a/velox/experimental/cudf/exec/ToCudf.cpp
+++ b/velox/experimental/cudf/exec/ToCudf.cpp
@@ -114,6 +114,8 @@ bool CompileState::compile() {
     const bool nextOperatorIsNotGpu =
         (operatorIndex < operators.size() - 1 and
          !isSupportedGpuOperators[operatorIndex + 1]);
+    const bool isLastOperatorOfTask =
+        driverFactory_.outputDriver and operatorIndex == operators.size() - 1;
 
     auto id = oper->operatorId();
     if (previousOperatorIsNotGpu and acceptsGpuInput(oper)) {
@@ -132,7 +134,8 @@ bool CompileState::compile() {
       replaceOp.back()->initialize();
     }
 
-    if (producesGpuOutput(oper) and nextOperatorIsNotGpu) {
+    if (producesGpuOutput(oper) and
+        (nextOperatorIsNotGpu or isLastOperatorOfTask)) {
       auto planNode = getPlanNode(oper->planNodeId());
       replaceOp.push_back(std::make_unique<CudfToVelox>(
           id, planNode->outputType(), ctx, planNode->id() + "-to-velox"));

From 652825fdc557367bbffb737ee4b4b68c899ab368 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 9 Apr 2025 18:00:21 -0500
Subject: [PATCH 36/37] remove fixDictionaryIndices

removing fixDictionaryIndices after cudf changed dictionary indices to
signed
---
 .../cudf/exec/VeloxCudfInterop.cpp            | 38 -------------------
 1 file changed, 38 deletions(-)

diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
index fe1dd8b25c1..7a2ad969397 100644
--- a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
+++ b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp
@@ -87,41 +87,6 @@ std::unique_ptr<cudf::table> toCudfTable(
 
 namespace {
 
-void toSignedIntFormat(char* format) {
-  VELOX_CHECK_NOT_NULL(format);
-  switch (format[0]) {
-    case 'C':
-      format[0] = 'c';
-      break;
-    case 'S':
-      format[0] = 's';
-      break;
-    case 'I':
-      format[0] = 'i';
-      break;
-    case 'L':
-      format[0] = 'l';
-      break;
-    default:
-      return;
-  }
-  LOG(WARNING) << "arrowSchema.format: " << format
-               << ", unsigned is treated as signed indices";
-}
-
-// Changes all unsigned indices to signed indices for dictionary columns from
-// cudf which uses unsigned indices, but velox uses signed indices.
-void fixDictionaryIndices(ArrowSchema& arrowSchema) {
-  if (arrowSchema.dictionary != nullptr) {
-    toSignedIntFormat(const_cast<char*>(arrowSchema.format));
-    fixDictionaryIndices(*arrowSchema.dictionary);
-  }
-  for (size_t i = 0; i < arrowSchema.n_children; ++i) {
-    VELOX_CHECK_NOT_NULL(arrowSchema.children[i]);
-    fixDictionaryIndices(*arrowSchema.children[i]);
-  }
-}
-
 RowVectorPtr toVeloxColumn(
     const cudf::table_view& table,
     memory::MemoryPool* pool,
@@ -131,9 +96,6 @@ RowVectorPtr toVeloxColumn(
   auto& arrowArray = arrowDeviceArray->array;
 
   auto arrowSchema = cudf::to_arrow_schema(table, metadata);
-  // Hack to convert unsigned indices to signed indices for dictionary columns
-  fixDictionaryIndices(*arrowSchema);
-
   auto veloxTable = importFromArrowAsOwner(*arrowSchema, arrowArray, pool);
   // BaseVector to RowVector
   auto castedPtr =

From 13a4a8769ca25ca8d148be606770fbd05ac5ac46 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Fri, 11 Apr 2025 07:50:26 +0000
Subject: [PATCH 37/37] Remove codeowners for now

---
 .github/CODEOWNERS | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 649e3694e27..c24be384421 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -32,9 +32,6 @@ scripts/ @assignUser @majetideepak
 # Breeze 
 velox/experimental/breeze @dreveman
 
-# cuDF
-velox/experimental/cudf @bdice @karthikeyann @devavret
-
 # Parquet
 velox/dwio/parquet/ @majetideepak