From cf55f951a06e7c9b181021bd2dc2e6d2b3f954f0 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Tue, 11 Mar 2025 14:58:08 +0000 Subject: [PATCH 01/37] Add minimal code that demonstrates cudf integration --- CMake/resolve_dependency_modules/cudf.cmake | 48 ++ CMakeLists.txt | 18 +- Makefile | 6 +- velox/CMakeLists.txt | 3 + velox/experimental/cudf/CMakeLists.txt | 21 + velox/experimental/cudf/exec/CMakeLists.txt | 34 ++ .../experimental/cudf/exec/CudfConversion.cpp | 199 +++++++ velox/experimental/cudf/exec/CudfConversion.h | 97 ++++ velox/experimental/cudf/exec/CudfOrderBy.cpp | 128 +++++ velox/experimental/cudf/exec/CudfOrderBy.h | 69 +++ velox/experimental/cudf/exec/NvtxHelper.h | 66 +++ velox/experimental/cudf/exec/ToCudf.cpp | 360 +++++++++++++ velox/experimental/cudf/exec/ToCudf.h | 52 ++ velox/experimental/cudf/exec/Utilities.cpp | 148 +++++ velox/experimental/cudf/exec/Utilities.h | 59 ++ .../cudf/exec/VeloxCudfInterop.cpp | 508 ++++++++++++++++++ .../experimental/cudf/exec/VeloxCudfInterop.h | 64 +++ velox/experimental/cudf/tests/CMakeLists.txt | 33 ++ velox/experimental/cudf/tests/OrderByTest.cpp | 417 ++++++++++++++ velox/experimental/cudf/vector/CMakeLists.txt | 26 + velox/experimental/cudf/vector/CudfVector.cpp | 21 + velox/experimental/cudf/vector/CudfVector.h | 73 +++ 22 files changed, 2443 insertions(+), 7 deletions(-) create mode 100644 CMake/resolve_dependency_modules/cudf.cmake create mode 100644 velox/experimental/cudf/CMakeLists.txt create mode 100644 velox/experimental/cudf/exec/CMakeLists.txt create mode 100644 velox/experimental/cudf/exec/CudfConversion.cpp create mode 100644 velox/experimental/cudf/exec/CudfConversion.h create mode 100644 velox/experimental/cudf/exec/CudfOrderBy.cpp create mode 100644 velox/experimental/cudf/exec/CudfOrderBy.h create mode 100644 velox/experimental/cudf/exec/NvtxHelper.h create mode 100644 velox/experimental/cudf/exec/ToCudf.cpp create mode 100644 velox/experimental/cudf/exec/ToCudf.h create mode 100644 velox/experimental/cudf/exec/Utilities.cpp create mode 100644 velox/experimental/cudf/exec/Utilities.h create mode 100644 velox/experimental/cudf/exec/VeloxCudfInterop.cpp create mode 100644 velox/experimental/cudf/exec/VeloxCudfInterop.h create mode 100644 velox/experimental/cudf/tests/CMakeLists.txt create mode 100644 velox/experimental/cudf/tests/OrderByTest.cpp create mode 100644 velox/experimental/cudf/vector/CMakeLists.txt create mode 100644 velox/experimental/cudf/vector/CudfVector.cpp create mode 100644 velox/experimental/cudf/vector/CudfVector.h diff --git a/CMake/resolve_dependency_modules/cudf.cmake b/CMake/resolve_dependency_modules/cudf.cmake new file mode 100644 index 00000000000..a6abdb61254 --- /dev/null +++ b/CMake/resolve_dependency_modules/cudf.cmake @@ -0,0 +1,48 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +include_guard(GLOBAL) + +set(VELOX_cudf_VERSION 25.04) +set(VELOX_cudf_BUILD_SHA256_CHECKSUM + e5a1900dfaf23dab2c5808afa17a2d04fa867d2892ecec1cb37908f3b73715c2) +set(VELOX_cudf_SOURCE_URL + "https://github.com/rapidsai/cudf/archive/4c1c99011da2c23856244e05adda78ba66697105.tar.gz" +) +velox_resolve_dependency_url(cudf) + +# Use block so we don't leak variables +block(SCOPE_FOR VARIABLES) +# Setup libcudf build to not have testing components +set(BUILD_TESTS OFF) +set(CUDF_BUILD_TESTUTIL OFF) +set(BUILD_SHARED_LIBS ON) + +# cudf sets all warnings as errors, and therefore fails to compile with velox +# expanded set of warnings. We selectively disable problematic warnings just for +# cudf +string( + APPEND CMAKE_CXX_FLAGS + " -Wno-non-virtual-dtor -Wno-missing-field-initializers -Wno-deprecated-copy") + +FetchContent_Declare( + cudf + URL ${VELOX_cudf_SOURCE_URL} + URL_HASH ${VELOX_cudf_BUILD_SHA256_CHECKSUM} + SOURCE_SUBDIR cpp + UPDATE_DISCONNECTED 1) + +FetchContent_MakeAvailable(cudf) +unset(BUILD_SHARED_LIBS) +endblock() diff --git a/CMakeLists.txt b/CMakeLists.txt index aae0cfff990..173a40fe1d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -235,6 +235,7 @@ if(VELOX_ENABLE_CCACHE message(STATUS "Using ccache: ${CCACHE_FOUND}") set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE_FOUND}) set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_FOUND}) + set(CMAKE_CUDA_COMPILER_LAUNCHER ${CCACHE_FOUND}) # keep comments as they might matter to the compiler set(ENV{CCACHE_COMMENTS} "1") endif() @@ -368,6 +369,8 @@ if(ENABLE_ALL_WARNINGS) -Wno-unused-parameter \ -Wno-sign-compare \ -Wno-ignored-qualifiers \ + -Wno-missing-field-initializers \ + -Wno-deprecated-copy \ ${KNOWN_COMPILER_SPECIFIC_WARNINGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra ${KNOWN_WARNINGS}") @@ -375,7 +378,12 @@ endif() message("FINAL CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}") -if(${VELOX_ENABLE_GPU}) +if(NOT TARGET fmt::fmt) + velox_set_source(fmt) + velox_resolve_dependency(fmt 9.0.0) +endif() + +if(VELOX_ENABLE_GPU) enable_language(CUDA) # Determine CUDA_ARCHITECTURES automatically. cmake_policy(SET CMP0104 NEW) @@ -387,6 +395,11 @@ if(${VELOX_ENABLE_GPU}) add_compile_options("$<$:-G>") endif() find_package(CUDAToolkit REQUIRED) + if(VELOX_ENABLE_CUDF) + set(VELOX_ENABLE_ARROW ON) + velox_set_source(cudf) + velox_resolve_dependency(cudf) + endif() endif() # Set after the test of the CUDA compiler. Otherwise, the test fails with @@ -458,9 +471,6 @@ else() endif() velox_resolve_dependency(glog) -velox_set_source(fmt) -velox_resolve_dependency(fmt 9.0.0) - if(${VELOX_BUILD_MINIMAL_WITH_DWIO} OR ${VELOX_ENABLE_HIVE_CONNECTOR}) # DWIO needs all sorts of stream compression libraries. # diff --git a/Makefile b/Makefile index bb6f9af3d19..7306089253b 100644 --- a/Makefile +++ b/Makefile @@ -99,7 +99,7 @@ cmake: #: Use CMake to create a Makefile build system ${EXTRA_CMAKE_FLAGS} cmake-gpu: - $(MAKE) EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON" cmake + $(MAKE) EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON -DVELOX_ENABLE_CUDF=ON" cmake build: #: Build the software based in BUILD_DIR and BUILD_TYPE variables cmake --build $(BUILD_BASE_DIR)/$(BUILD_DIR) -j $(NUM_THREADS) @@ -123,11 +123,11 @@ minimal: #: Minimal build $(MAKE) build BUILD_DIR=release gpu: #: Build with GPU support - $(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON" + $(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON -DVELOX_ENABLE_CUDF=ON" $(MAKE) build BUILD_DIR=release gpu_debug: #: Build with debugging symbols and GPU support - $(MAKE) cmake BUILD_DIR=debug BUILD_TYPE=debug EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON" + $(MAKE) cmake BUILD_DIR=debug BUILD_TYPE=debug EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON -DVELOX_ENABLE_CUDF=ON" $(MAKE) build BUILD_DIR=debug dwio: #: Minimal build with dwio enabled. diff --git a/velox/CMakeLists.txt b/velox/CMakeLists.txt index b51e2a77073..09a4e7c58cf 100644 --- a/velox/CMakeLists.txt +++ b/velox/CMakeLists.txt @@ -69,6 +69,9 @@ if(${VELOX_ENABLE_DUCKDB}) endif() if(${VELOX_ENABLE_GPU}) + if(${VELOX_ENABLE_CUDF}) + add_subdirectory(experimental/cudf) + endif() add_subdirectory(experimental/gpu) add_subdirectory(experimental/wave) add_subdirectory(external/jitify) diff --git a/velox/experimental/cudf/CMakeLists.txt b/velox/experimental/cudf/CMakeLists.txt new file mode 100644 index 00000000000..96fcdb0d557 --- /dev/null +++ b/velox/experimental/cudf/CMakeLists.txt @@ -0,0 +1,21 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_subdirectory(exec) +add_subdirectory(connectors) +add_subdirectory(vector) + +if(VELOX_BUILD_TESTING) + add_subdirectory(tests) +endif() diff --git a/velox/experimental/cudf/exec/CMakeLists.txt b/velox/experimental/cudf/exec/CMakeLists.txt new file mode 100644 index 00000000000..c47d3a5065a --- /dev/null +++ b/velox/experimental/cudf/exec/CMakeLists.txt @@ -0,0 +1,34 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_library( + velox_cudf_exec + CudfConversion.cpp + CudfOrderBy.cpp + ToCudf.cpp + Utilities.cpp + VeloxCudfInterop.cpp) + +set_target_properties( + velox_cudf_exec + PROPERTIES CUDA_ARCHITECTURES native) + +target_link_libraries( + velox_cudf_exec + cudf::cudf + arrow + velox_arrow_bridge + velox_exception + velox_common_base + velox_exec) diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp new file mode 100644 index 00000000000..68b3ac283de --- /dev/null +++ b/velox/experimental/cudf/exec/CudfConversion.cpp @@ -0,0 +1,199 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "velox/exec/Driver.h" +#include "velox/exec/Operator.h" +#include "velox/vector/ComplexVector.h" + +#include +#include +#include + +#include "velox/experimental/cudf/exec/CudfConversion.h" +#include "velox/experimental/cudf/exec/NvtxHelper.h" +#include "velox/experimental/cudf/exec/Utilities.h" +#include "velox/experimental/cudf/exec/VeloxCudfInterop.h" +#include "velox/experimental/cudf/vector/CudfVector.h" + +namespace facebook::velox::cudf_velox { + +namespace { +// Concatenate multiple RowVectors into a single RowVector. +// Copied from AggregationFuzzer.cpp. +RowVectorPtr mergeRowVectors( + const std::vector& results, + velox::memory::MemoryPool* pool) { + VELOX_NVTX_FUNC_RANGE(); + auto totalCount = 0; + for (const auto& result : results) { + totalCount += result->size(); + } + auto copy = + BaseVector::create(results[0]->type(), totalCount, pool); + auto copyCount = 0; + for (const auto& result : results) { + copy->copy(result.get(), copyCount, 0, result->size()); + copyCount += result->size(); + } + return copy; +} + +cudf::size_type preferred_gpu_batch_size_rows() { + constexpr cudf::size_type default_gpu_batch_size_rows = 100000; + const char* env_cudf_gpu_batch_size_rows = + std::getenv("VELOX_CUDF_GPU_BATCH_SIZE_ROWS"); + return env_cudf_gpu_batch_size_rows != nullptr + ? std::stoi(env_cudf_gpu_batch_size_rows) + : default_gpu_batch_size_rows; +} +} // namespace + +CudfFromVelox::CudfFromVelox( + int32_t operatorId, + RowTypePtr outputType, + exec::DriverCtx* driverCtx, + std::string planNodeId) + : exec::Operator( + driverCtx, + outputType, + operatorId, + planNodeId, + "CudfFromVelox"), + NvtxHelper(nvtx3::rgb{255, 140, 0}, operatorId) {} // Orange + +void CudfFromVelox::addInput(RowVectorPtr input) { + VELOX_NVTX_OPERATOR_FUNC_RANGE(); + if (input != nullptr) { + if (input->size() > 0) { + // Materialize lazy vectors + for (auto& child : input->children()) { + child->loadedVector(); + } + input->loadedVector(); + + // Accumulate inputs + inputs_.push_back(input); + current_output_size_ += input->size(); + } + } +} + +RowVectorPtr CudfFromVelox::getOutput() { + VELOX_NVTX_OPERATOR_FUNC_RANGE(); + auto const target_output_size = preferred_gpu_batch_size_rows(); + auto const exit_early = finished_ or + (current_output_size_ < target_output_size and not noMoreInput_) or + inputs_.empty(); + finished_ = noMoreInput_; + if (exit_early) { + return nullptr; + } + + // Combine all input RowVectors into a single RowVector and clear inputs + auto input = mergeRowVectors(inputs_, inputs_[0]->pool()); + inputs_.clear(); + current_output_size_ = 0; + + // Early return if no input + if (input->size() == 0) { + return nullptr; + } + + // Get a stream from the global stream pool + auto stream = cudfGlobalStreamPool().get_stream(); + + // Convert RowVector to cudf table + auto tbl = with_arrow::to_cudf_table(input, input->pool(), stream); + + stream.synchronize(); + + VELOX_CHECK_NOT_NULL(tbl); + + if (cudfDebugEnabled()) { + std::cout << "CudfFromVelox table number of columns: " << tbl->num_columns() + << std::endl; + std::cout << "CudfFromVelox table number of rows: " << tbl->num_rows() + << std::endl; + } + + // Return a CudfVector that owns the cudf table + auto const size = tbl->num_rows(); + return std::make_shared( + input->pool(), outputType_, size, std::move(tbl), stream); +} + +void CudfFromVelox::close() { + cudf::get_default_stream().synchronize(); + exec::Operator::close(); +} + +CudfToVelox::CudfToVelox( + int32_t operatorId, + RowTypePtr outputType, + exec::DriverCtx* driverCtx, + std::string planNodeId) + : exec::Operator( + driverCtx, + outputType, + operatorId, + planNodeId, + "CudfToVelox"), + NvtxHelper(nvtx3::rgb{148, 0, 211}, operatorId) {} // Purple + +void CudfToVelox::addInput(RowVectorPtr input) { + // Accumulate inputs + if (input->size() > 0) { + auto cudf_input = std::dynamic_pointer_cast(input); + VELOX_CHECK_NOT_NULL(cudf_input); + inputs_.push_back(std::move(cudf_input)); + } +} + +RowVectorPtr CudfToVelox::getOutput() { + VELOX_NVTX_OPERATOR_FUNC_RANGE(); + if (finished_ || inputs_.empty()) { + finished_ = noMoreInput_ && inputs_.empty(); + return nullptr; + } + + auto stream = inputs_.front()->stream(); + std::unique_ptr tbl = inputs_.front()->release(); + inputs_.pop_front(); + + VELOX_CHECK_NOT_NULL(tbl); + if (cudfDebugEnabled()) { + std::cout << "CudfToVelox table number of columns: " << tbl->num_columns() + << std::endl; + std::cout << "CudfToVelox table number of rows: " << tbl->num_rows() + << std::endl; + } + if (tbl->num_rows() == 0) { + return nullptr; + } + RowVectorPtr output = + with_arrow::to_velox_column(tbl->view(), pool(), "", stream); + stream.synchronize(); + finished_ = noMoreInput_ && inputs_.empty(); + output->setType(outputType_); + return output; +} + +void CudfToVelox::close() { + exec::Operator::close(); + // TODO: Release stored inputs if needed + // TODO: Release cudf memory resources +} + +} // namespace facebook::velox::cudf_velox diff --git a/velox/experimental/cudf/exec/CudfConversion.h b/velox/experimental/cudf/exec/CudfConversion.h new file mode 100644 index 00000000000..c75f8464b36 --- /dev/null +++ b/velox/experimental/cudf/exec/CudfConversion.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "velox/exec/Driver.h" +#include "velox/exec/Operator.h" +#include "velox/vector/ComplexVector.h" + +#include + +#include "velox/experimental/cudf/exec/NvtxHelper.h" +#include "velox/experimental/cudf/vector/CudfVector.h" + +#include +#include +#include + +namespace facebook::velox::cudf_velox { + +class CudfFromVelox : public exec::Operator, public NvtxHelper { + public: + CudfFromVelox( + int32_t operatorId, + RowTypePtr outputType, + exec::DriverCtx* driverCtx, + std::string planNodeId); + + bool needsInput() const override { + return !finished_; + } + + void addInput(RowVectorPtr input) override; + + RowVectorPtr getOutput() override; + + exec::BlockingReason isBlocked(ContinueFuture* /*future*/) override { + return exec::BlockingReason::kNotBlocked; + } + + bool isFinished() override { + return finished_; + } + + void close() override; + + private: + std::vector inputs_; + std::size_t current_output_size_ = 0; + bool finished_ = false; +}; + +class CudfToVelox : public exec::Operator, public NvtxHelper { + public: + CudfToVelox( + int32_t operatorId, + RowTypePtr outputType, + exec::DriverCtx* driverCtx, + std::string planNodeId); + + bool needsInput() const override { + return !finished_; + } + + void addInput(RowVectorPtr input) override; + + RowVectorPtr getOutput() override; + + exec::BlockingReason isBlocked(ContinueFuture* /*future*/) override { + return exec::BlockingReason::kNotBlocked; + } + + bool isFinished() override { + return finished_; + } + + void close() override; + + private: + std::deque inputs_; + bool finished_ = false; +}; + +} // namespace facebook::velox::cudf_velox diff --git a/velox/experimental/cudf/exec/CudfOrderBy.cpp b/velox/experimental/cudf/exec/CudfOrderBy.cpp new file mode 100644 index 00000000000..ac32cc8e23f --- /dev/null +++ b/velox/experimental/cudf/exec/CudfOrderBy.cpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "velox/exec/Driver.h" +#include "velox/exec/Operator.h" +#include "velox/vector/ComplexVector.h" + +#include +#include +#include +#include + +#include "velox/experimental/cudf/exec/CudfOrderBy.h" +#include "velox/experimental/cudf/exec/NvtxHelper.h" +#include "velox/experimental/cudf/exec/Utilities.h" +#include "velox/experimental/cudf/exec/VeloxCudfInterop.h" + +namespace facebook::velox::cudf_velox { + +CudfOrderBy::CudfOrderBy( + int32_t operatorId, + exec::DriverCtx* driverCtx, + const std::shared_ptr& orderByNode) + : exec::Operator( + driverCtx, + orderByNode->outputType(), + operatorId, + orderByNode->id(), + "CudfOrderBy"), + NvtxHelper(nvtx3::rgb{64, 224, 208}, operatorId), // Turquoise + orderByNode_(orderByNode) { + maxOutputRows_ = outputBatchRows(std::nullopt); + sort_keys_.reserve(orderByNode->sortingKeys().size()); + column_order_.reserve(orderByNode->sortingKeys().size()); + null_order_.reserve(orderByNode->sortingKeys().size()); + for (int i = 0; i < orderByNode->sortingKeys().size(); ++i) { + const auto channel = + exec::exprToChannel(orderByNode->sortingKeys()[i].get(), outputType_); + VELOX_CHECK( + channel != kConstantChannel, + "OrderBy doesn't allow constant sorting keys"); + sort_keys_.push_back(channel); + auto const& sorting_order = orderByNode->sortingOrders()[i]; + column_order_.push_back( + sorting_order.isAscending() ? cudf::order::ASCENDING + : cudf::order::DESCENDING); + null_order_.push_back( + (sorting_order.isNullsFirst() ^ !sorting_order.isAscending()) + ? cudf::null_order::BEFORE + : cudf::null_order::AFTER); + } + if (cudfDebugEnabled()) { + std::cout << "Number of Sort keys: " << sort_keys_.size() << std::endl; + } +} + +void CudfOrderBy::addInput(RowVectorPtr input) { + // Accumulate inputs + if (input->size() > 0) { + auto cudf_input = std::dynamic_pointer_cast(input); + VELOX_CHECK_NOT_NULL(cudf_input); + inputs_.push_back(std::move(cudf_input)); + } +} + +void CudfOrderBy::noMoreInput() { + exec::Operator::noMoreInput(); + // TODO: Get total row count, batch output + // maxOutputRows_ = outputBatchRows(total_row_count); + + VELOX_NVTX_OPERATOR_FUNC_RANGE(); + + if (inputs_.empty()) { + return; + } + + auto stream = cudfGlobalStreamPool().get_stream(); + auto tbl = getConcatenatedTable(inputs_, stream); + + // Release input data after synchronizing + stream.synchronize(); + inputs_.clear(); + + VELOX_CHECK_NOT_NULL(tbl); + if (cudfDebugEnabled()) { + std::cout << "Sort input table number of columns: " << tbl->num_columns() + << std::endl; + std::cout << "Sort input table number of rows: " << tbl->num_rows() + << std::endl; + } + + auto keys = tbl->view().select(sort_keys_); + auto values = tbl->view(); + auto result = + cudf::sort_by_key(values, keys, column_order_, null_order_, stream); + auto const size = result->num_rows(); + outputTable_ = std::make_shared( + pool(), outputType_, size, std::move(result), stream); +} + +RowVectorPtr CudfOrderBy::getOutput() { + if (finished_ || !noMoreInput_) { + return nullptr; + } + finished_ = noMoreInput_; + return outputTable_; +} + +void CudfOrderBy::close() { + exec::Operator::close(); + // Release stored inputs + // Release cudf memory resources + inputs_.clear(); + outputTable_.reset(); +} +} // namespace facebook::velox::cudf_velox diff --git a/velox/experimental/cudf/exec/CudfOrderBy.h b/velox/experimental/cudf/exec/CudfOrderBy.h new file mode 100644 index 00000000000..28c89cec8e4 --- /dev/null +++ b/velox/experimental/cudf/exec/CudfOrderBy.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "velox/core/Expressions.h" +#include "velox/core/PlanNode.h" +#include "velox/exec/Driver.h" +#include "velox/exec/Operator.h" +#include "velox/experimental/cudf/exec/NvtxHelper.h" +#include "velox/experimental/cudf/vector/CudfVector.h" +#include "velox/vector/ComplexVector.h" + +#include + +namespace facebook::velox::cudf_velox { + +class CudfOrderBy : public exec::Operator, public NvtxHelper { + public: + CudfOrderBy( + int32_t operatorId, + exec::DriverCtx* driverCtx, + const std::shared_ptr& orderByNode); + + bool needsInput() const override { + return !finished_; + } + + void addInput(RowVectorPtr input) override; + + void noMoreInput() override; + + RowVectorPtr getOutput() override; + + exec::BlockingReason isBlocked(ContinueFuture* /*future*/) override { + return exec::BlockingReason::kNotBlocked; + } + + bool isFinished() override { + return finished_; + } + + void close() override; + + private: + CudfVectorPtr outputTable_; + std::shared_ptr orderByNode_; + std::vector inputs_; + std::vector sort_keys_; + std::vector column_order_; + std::vector null_order_; + bool finished_{false}; + uint32_t maxOutputRows_; +}; + +} // namespace facebook::velox::cudf_velox diff --git a/velox/experimental/cudf/exec/NvtxHelper.h b/velox/experimental/cudf/exec/NvtxHelper.h new file mode 100644 index 00000000000..4e4efca7a08 --- /dev/null +++ b/velox/experimental/cudf/exec/NvtxHelper.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace facebook::velox::cudf_velox { + +class NvtxHelper { + public: + NvtxHelper(); + NvtxHelper(nvtx3::color color, std::optional payload = std::nullopt) + : color_(color), payload_(payload) {} + + nvtx3::color color_{nvtx3::rgb{125, 125, 125}}; // Gray + std::optional payload_{}; +}; + +/** + * @brief Tag type for Velox's NVTX domain. + */ +struct velox_domain { + static constexpr char const* name{"velox"}; +}; + +using nvtx_registered_string_t = nvtx3::registered_string_in; + +#define VELOX_NVTX_OPERATOR_FUNC_RANGE() \ + static_assert( \ + std::is_base_of::type>:: \ + value, \ + "VELOX_NVTX_OPERATOR_FUNC_RANGE can only be used" \ + " in Operators derived from NvtxHelper"); \ + static nvtx_registered_string_t const nvtx3_func_name__{ \ + std::string(__func__) + " " + std::string(__PRETTY_FUNCTION__)}; \ + static ::nvtx3::event_attributes const nvtx3_func_attr__{ \ + this->payload_.has_value() ? \ + ::nvtx3::event_attributes{nvtx3_func_name__, this->color_, \ + nvtx3::payload{this->payload_.value()}} : \ + ::nvtx3::event_attributes{nvtx3_func_name__, this->color_}}; \ + ::nvtx3::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; + +#define VELOX_NVTX_PRETTY_FUNC_RANGE() \ + static nvtx_registered_string_t const nvtx3_func_name__{ \ + std::string(__func__) + " " + std::string(__PRETTY_FUNCTION__)}; \ + static ::nvtx3::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ + ::nvtx3::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; + +#define VELOX_NVTX_FUNC_RANGE() NVTX3_FUNC_RANGE_IN(velox_domain) + +} // namespace facebook::velox::cudf_velox diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp new file mode 100644 index 00000000000..03ec29b2c95 --- /dev/null +++ b/velox/experimental/cudf/exec/ToCudf.cpp @@ -0,0 +1,360 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/experimental/cudf/exec/ToCudf.h" +#include "velox/exec/Driver.h" +#include "velox/exec/FilterProject.h" +#include "velox/exec/HashAggregation.h" +#include "velox/exec/HashBuild.h" +#include "velox/exec/HashProbe.h" +#include "velox/exec/Operator.h" +#include "velox/exec/OrderBy.h" +#include "velox/experimental/cudf/exec/CudfConversion.h" +#include "velox/experimental/cudf/exec/CudfFilterProject.h" +#include "velox/experimental/cudf/exec/CudfHashAggregation.h" +#include "velox/experimental/cudf/exec/CudfHashJoin.h" +#include "velox/experimental/cudf/exec/CudfLocalPartition.h" +#include "velox/experimental/cudf/exec/CudfOrderBy.h" +#include "velox/experimental/cudf/exec/ExpressionEvaluator.h" +#include "velox/experimental/cudf/exec/Utilities.h" + +#include + +#include + +#include + +namespace facebook::velox::cudf_velox { + +template +bool is_any_of(const Base* p) { + return ((dynamic_cast(p) != nullptr) || ...); +} + +static bool _cudfIsRegistered = false; + +bool CompileState::compile() { + if (cudfDebugEnabled()) { + std::cout << "Calling cudfDriverAdapter" << std::endl; + } + + auto operators = driver_.operators(); + auto& nodes = planNodes_; + + if (cudfDebugEnabled()) { + std::cout << "Number of operators: " << operators.size() << std::endl; + for (auto& op : operators) { + std::cout << " Operator: ID " << op->operatorId() << ": " + << op->toString() << std::endl; + } + std::cout << "Number of plan nodes: " << nodes.size() << std::endl; + for (auto& node : nodes) { + std::cout << " Plan node: ID " << node->id() << ": " << node->toString(); + } + } + + // Make sure operator states are initialized. We will need to inspect some of + // them during the transformation. + driver_.initializeOperators(); + + bool replacements_made = false; + auto ctx = driver_.driverCtx(); + + // Get plan node by id lookup. + auto get_plan_node = [&](const core::PlanNodeId& id) { + auto it = + std::find_if(nodes.cbegin(), nodes.cend(), [&id](const auto& node) { + return node->id() == id; + }); + VELOX_CHECK(it != nodes.end()); + return *it; + }; + + auto is_filter_project_supported = [](const exec::Operator* op) { + if (auto filter_project_op = dynamic_cast(op)) { + auto info = filter_project_op->exprsAndProjection(); + return !info.hasFilter && + ExpressionEvaluator::can_be_evaluated(info.exprs->exprs()); + } + return false; + }; + + auto is_join_supported = [get_plan_node](const exec::Operator* op) { + if (!is_any_of(op)) { + return false; + } + auto plan_node = std::dynamic_pointer_cast( + get_plan_node(op->planNodeId())); + if (!plan_node) { + return false; + } + if (!plan_node->isInnerJoin()) { + return false; + } + if (plan_node->filter() != nullptr) { + return false; + } + return true; + }; + + auto is_supported_gpu_operator = + [is_filter_project_supported, + is_join_supported](const exec::Operator* op) { + return is_any_of< + exec::OrderBy, + exec::HashAggregation, + exec::LocalPartition, + exec::LocalExchange>(op) || + is_filter_project_supported(op) || is_join_supported(op); + }; + + std::vector is_supported_gpu_operators(operators.size()); + std::transform( + operators.begin(), + operators.end(), + is_supported_gpu_operators.begin(), + is_supported_gpu_operator); + auto accepts_gpu_input = [is_filter_project_supported, + is_join_supported](const exec::Operator* op) { + return is_any_of< + exec::OrderBy, + exec::HashAggregation, + exec::LocalPartition>(op) || + is_filter_project_supported(op) || is_join_supported(op); + }; + auto produces_gpu_output = [is_filter_project_supported, + is_join_supported](const exec::Operator* op) { + return is_any_of( + op) || + is_filter_project_supported(op) || + (is_any_of(op) && is_join_supported(op)); + }; + + int32_t operatorsOffset = 0; + for (int32_t operatorIndex = 0; operatorIndex < operators.size(); + ++operatorIndex) { + std::vector> replace_op; + + exec::Operator* oper = operators[operatorIndex]; + auto replacingOperatorIndex = operatorIndex + operatorsOffset; + VELOX_CHECK(oper); + + bool const previous_operator_is_not_gpu = + (operatorIndex > 0 and !is_supported_gpu_operators[operatorIndex - 1]); + bool const next_operator_is_not_gpu = + (operatorIndex < operators.size() - 1 and + !is_supported_gpu_operators[operatorIndex + 1]); + + auto id = oper->operatorId(); + if (previous_operator_is_not_gpu and accepts_gpu_input(oper)) { + auto plan_node = get_plan_node(oper->planNodeId()); + replace_op.push_back(std::make_unique( + id, plan_node->outputType(), ctx, plan_node->id() + "-from-velox")); + replace_op.back()->initialize(); + } + + // This is used to denote if the current operator is kept or replaced. + auto keep_operator = 0; + if (is_join_supported(oper)) { + if (auto joinBuildOp = dynamic_cast(oper)) { + auto plan_node = std::dynamic_pointer_cast( + get_plan_node(joinBuildOp->planNodeId())); + VELOX_CHECK(plan_node != nullptr); + // From-Velox (optional) + replace_op.push_back( + std::make_unique(id, ctx, plan_node)); + replace_op.back()->initialize(); + } else if (auto joinProbeOp = dynamic_cast(oper)) { + auto plan_node = std::dynamic_pointer_cast( + get_plan_node(joinProbeOp->planNodeId())); + VELOX_CHECK(plan_node != nullptr); + // From-Velox (optional) + replace_op.push_back( + std::make_unique(id, ctx, plan_node)); + replace_op.back()->initialize(); + // To-Velox (optional) + } + } else if (auto orderByOp = dynamic_cast(oper)) { + auto id = orderByOp->operatorId(); + auto plan_node = std::dynamic_pointer_cast( + get_plan_node(orderByOp->planNodeId())); + VELOX_CHECK(plan_node != nullptr); + // From-velox (optional) + replace_op.push_back(std::make_unique(id, ctx, plan_node)); + replace_op.back()->initialize(); + // To-velox (optional) + } else if (auto hashAggOp = dynamic_cast(oper)) { + auto plan_node = std::dynamic_pointer_cast( + get_plan_node(hashAggOp->planNodeId())); + VELOX_CHECK(plan_node != nullptr); + replace_op.push_back( + std::make_unique(id, ctx, plan_node)); + replace_op.back()->initialize(); + } else if (is_filter_project_supported(oper)) { + auto filterProjectOp = dynamic_cast(oper); + auto info = filterProjectOp->exprsAndProjection(); + auto& id_projections = filterProjectOp->identityProjections(); + auto plan_node = std::dynamic_pointer_cast( + get_plan_node(filterProjectOp->planNodeId())); + // If filter doesn't exist then project should definitely exist so this + // should never hit + VELOX_CHECK(plan_node != nullptr); + replace_op.push_back(std::make_unique( + id, ctx, info, id_projections, nullptr, plan_node)); + replace_op.back()->initialize(); + } else if ( + auto localPartitionOp = dynamic_cast(oper)) { + auto plan_node = + std::dynamic_pointer_cast( + get_plan_node(localPartitionOp->planNodeId())); + VELOX_CHECK(plan_node != nullptr); + replace_op.push_back( + std::make_unique(id, ctx, plan_node)); + replace_op.back()->initialize(); + } + + if (next_operator_is_not_gpu and produces_gpu_output(oper)) { + auto plan_node = get_plan_node(oper->planNodeId()); + replace_op.push_back(std::make_unique( + id, plan_node->outputType(), ctx, plan_node->id() + "-to-velox")); + replace_op.back()->initialize(); + } + + if (not replace_op.empty()) { + operatorsOffset += + replace_op.size() - 1 + keep_operator; // Check this "- 1" + [[maybe_unused]] auto replaced = driverFactory_.replaceOperators( + driver_, + replacingOperatorIndex + keep_operator, + replacingOperatorIndex + 1, + std::move(replace_op)); + replacements_made = true; + } + } + + if (cudfDebugEnabled()) { + operators = driver_.operators(); + std::cout << "Number of new operators: " << operators.size() << std::endl; + for (auto& op : operators) { + std::cout << " Operator: ID " << op->operatorId() << ": " + << op->toString() << std::endl; + } + } + return replacements_made; +} + +struct cudfDriverAdapter { + std::shared_ptr mr_; + std::shared_ptr>> + planNodes_; + + cudfDriverAdapter(std::shared_ptr mr) + : mr_(mr) { + if (cudfDebugEnabled()) { + std::cout << "cudfDriverAdapter constructor" << std::endl; + } + planNodes_ = + std::make_shared>>(); + } + + ~cudfDriverAdapter() { + if (cudfDebugEnabled()) { + std::cout << "cudfDriverAdapter destructor" << std::endl; + printf( + "cached planNodes_ %p, %ld\n", + planNodes_.get(), + planNodes_.use_count()); + } + } + + // Call operator needed by DriverAdapter + bool operator()(const exec::DriverFactory& factory, exec::Driver& driver) { + auto state = CompileState(factory, driver, *planNodes_); + // Stored planNodes_ from inspect. + if (cudfDebugEnabled()) { + printf("driver.planNodes_=%p\n", planNodes_.get()); + } + auto res = state.compile(); + return res; + } + + // Iterate recursively and store them in the planNodes_. + void storePlanNodes(const std::shared_ptr& planNode) { + const auto& sources = planNode->sources(); + for (int32_t i = 0; i < sources.size(); ++i) { + storePlanNodes(sources[i]); + } + planNodes_->push_back(planNode); + } + + // Call operator needed by plan inspection + void operator()(const core::PlanFragment& planFragment) { + // signature: std::function inspect; + // call: adapter.inspect(planFragment); + planNodes_->clear(); + if (cudfDebugEnabled()) { + std::cout << "Inspecting PlanFragment" << std::endl; + } + if (planNodes_) { + storePlanNodes(planFragment.planNode); + } + } +}; + +void registerCudf() { + const char* env_cudf_disabled = std::getenv("VELOX_CUDF_DISABLED"); + if (env_cudf_disabled != nullptr && std::stoi(env_cudf_disabled)) { + return; + } + + CUDF_FUNC_RANGE(); + cudaFree(0); // to init context. + + if (cudfDebugEnabled()) { + std::cout << "Registering CudfHashJoinBridgeTranslator" << std::endl; + } + exec::Operator::registerOperator( + std::make_unique()); + if (cudfDebugEnabled()) { + std::cout << "Registering cudfDriverAdapter" << std::endl; + } + + const char* env_cudf_mr = std::getenv("VELOX_CUDF_MEMORY_RESOURCE"); + auto mr_mode = env_cudf_mr != nullptr ? env_cudf_mr : "async"; + if (cudfDebugEnabled()) { + std::cout << "Setting cuDF memory resource to " << mr_mode << std::endl; + } + auto mr = cudf_velox::create_memory_resource(mr_mode); + cudf::set_current_device_resource(mr.get()); + cudfDriverAdapter cda{mr}; + exec::DriverAdapter cudfAdapter{"cuDF", cda, cda}; + exec::DriverFactory::registerAdapter(cudfAdapter); + _cudfIsRegistered = true; +} + +void unregisterCudf() { + if (cudfDebugEnabled()) { + std::cout << "Unregistering cudfDriverAdapter" << std::endl; + } + exec::DriverFactory::adapters.clear(); + _cudfIsRegistered = false; +} + +bool cudfIsRegistered() { + return _cudfIsRegistered; +} + +} // namespace facebook::velox::cudf_velox diff --git a/velox/experimental/cudf/exec/ToCudf.h b/velox/experimental/cudf/exec/ToCudf.h new file mode 100644 index 00000000000..8da0eba26ae --- /dev/null +++ b/velox/experimental/cudf/exec/ToCudf.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "velox/exec/Driver.h" +#include "velox/exec/Operator.h" + +namespace facebook::velox::cudf_velox { + +class CompileState { + public: + CompileState( + const exec::DriverFactory& driverFactory, + exec::Driver& driver, + std::vector>& planNodes) + : driverFactory_(driverFactory), driver_(driver), planNodes_(planNodes) {} + + exec::Driver& driver() { + return driver_; + } + + // Replaces sequences of Operators in the Driver given at construction with + // cuDF equivalents. Returns true if the Driver was changed. + bool compile(); + + const exec::DriverFactory& driverFactory_; + exec::Driver& driver_; + const std::vector>& planNodes_; +}; + +/// Registers adapter to add cuDF operators to Drivers. +void registerCudf(); +void unregisterCudf(); + +/// Returns true if cuDF is registered. +bool cudfIsRegistered(); + +} // namespace facebook::velox::cudf_velox diff --git a/velox/experimental/cudf/exec/Utilities.cpp b/velox/experimental/cudf/exec/Utilities.cpp new file mode 100644 index 00000000000..f12b08a341c --- /dev/null +++ b/velox/experimental/cudf/exec/Utilities.cpp @@ -0,0 +1,148 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "velox/experimental/cudf/exec/Utilities.h" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace facebook::velox::cudf_velox { + +namespace { +auto make_cuda_mr() { + return std::make_shared(); +} + +auto make_pool_mr() { + return rmm::mr::make_owning_wrapper( + make_cuda_mr(), rmm::percent_of_free_device_memory(50)); +} + +auto make_async_mr() { + return std::make_shared(); +} + +auto make_managed_mr() { + return std::make_shared(); +} + +auto make_arena_mr() { + return rmm::mr::make_owning_wrapper( + make_cuda_mr()); +} + +auto make_managed_pool_mr() { + return rmm::mr::make_owning_wrapper( + make_managed_mr(), rmm::percent_of_free_device_memory(50)); +} +} // namespace + +std::shared_ptr create_memory_resource( + std::string_view mode) { + if (mode == "cuda") + return make_cuda_mr(); + if (mode == "pool") + return make_pool_mr(); + if (mode == "async") + return make_async_mr(); + if (mode == "arena") + return make_arena_mr(); + if (mode == "managed") + return make_managed_mr(); + if (mode == "managed_pool") + return make_managed_pool_mr(); + throw cudf::logic_error( + "Unknown memory resource mode: " + std::string(mode) + + "\nExpecting: cuda, pool, async, arena, managed, or managed_pool"); +} + +cudf::detail::cuda_stream_pool& cudfGlobalStreamPool() { + return cudf::detail::global_cuda_stream_pool(); +}; + +bool cudfDebugEnabled() { + const char* env_cudf_debug = std::getenv("VELOX_CUDF_DEBUG"); + return env_cudf_debug != nullptr && std::stoi(env_cudf_debug); +} + +std::unique_ptr concatenateTables( + std::vector> tables, + rmm::cuda_stream_view stream) { + // Check for empty vector + VELOX_CHECK_GT(tables.size(), 0); + + if (tables.size() == 1) { + return std::move(tables[0]); + } + std::vector tableViews; + tableViews.reserve(tables.size()); + std::transform( + tables.begin(), + tables.end(), + std::back_inserter(tableViews), + [&](auto const& tbl) { return tbl->view(); }); + return cudf::concatenate( + tableViews, stream, cudf::get_current_device_resource_ref()); +} + +std::unique_ptr getConcatenatedTable( + std::vector& tables, + rmm::cuda_stream_view stream) { + // Check for empty vector + VELOX_CHECK_GT(tables.size(), 0); + + auto inputStreams = std::vector(); + auto tableViews = std::vector(); + + inputStreams.reserve(tables.size()); + tableViews.reserve(tables.size()); + + for (auto const& table : tables) { + VELOX_CHECK_NOT_NULL(table); + tableViews.push_back(table->getTableView()); + inputStreams.push_back(table->stream()); + } + + cudf::detail::join_streams(inputStreams, stream); + + if (tables.size() == 1) { + return tables[0]->release(); + } + + auto output = cudf::concatenate( + tableViews, stream, cudf::get_current_device_resource_ref()); + stream.synchronize(); + return output; +} + +} // namespace facebook::velox::cudf_velox diff --git a/velox/experimental/cudf/exec/Utilities.h b/velox/experimental/cudf/exec/Utilities.h new file mode 100644 index 00000000000..87a496a8cff --- /dev/null +++ b/velox/experimental/cudf/exec/Utilities.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include "velox/experimental/cudf/vector/CudfVector.h" + +#include +#include +#include + +namespace facebook::velox::cudf_velox { + +/** + * @brief Creates a memory resource based on the given mode. + */ +[[nodiscard]] std::shared_ptr +create_memory_resource(std::string_view mode); + +/** + * @brief Returns the global CUDA stream pool used by cudf. + */ +[[nodiscard]] cudf::detail::cuda_stream_pool& cudfGlobalStreamPool(); + +/** + * @brief Returns true if the VELOX_CUDF_DEBUG environment variable is set to a + * nonzero value. + */ +bool cudfDebugEnabled(); + +// Concatenate a vector of cuDF tables into a single table +std::unique_ptr concatenateTables( + std::vector> tables, + rmm::cuda_stream_view stream); + +// Concatenate a vector of cuDF tables into a single table. +// This function joins the streams owned by individual tables on the passed +// stream. Inputs are not safe to use after calling this function. +std::unique_ptr getConcatenatedTable( + std::vector& tables, + rmm::cuda_stream_view stream); + +} // namespace facebook::velox::cudf_velox diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp new file mode 100644 index 00000000000..3fd2b046787 --- /dev/null +++ b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp @@ -0,0 +1,508 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/memory/Memory.h" +#include "velox/type/Type.h" +#include "velox/vector/BaseVector.h" +#include "velox/vector/ComplexVector.h" +#include "velox/vector/DictionaryVector.h" +#include "velox/vector/FlatVector.h" +#include "velox/vector/arrow/Bridge.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include "velox/experimental/cudf/exec/NvtxHelper.h" +#include "velox/experimental/cudf/exec/Utilities.h" +#include "velox/experimental/cudf/exec/VeloxCudfInterop.h" + +#include +#include + +#include +#include +#include + +namespace facebook::velox::cudf_velox { + +namespace { + +template +constexpr decltype(auto) +vector_encoding_dispatcher(VectorPtr vec, Functor f, Ts&&... args) { + using facebook::velox::VectorEncoding::Simple; + switch (vec->encoding()) { + case Simple::FLAT: + return f(vec->as>(), std::forward(args)...); + case Simple::DICTIONARY: + return f(vec->as>(), std::forward(args)...); + default: { + if (cudfDebugEnabled()) { + std::cout << "Unsupported Velox encoding: " << vec->encoding() + << std::endl; + } + CUDF_FAIL("Unsupported Velox encoding"); + } + } +} + +// TODO: dispatch other duration/timestamp types! +template +using cudf_storage_type_t = std::conditional_t< + std::is_same_v, + cudf::timestamp_D::rep, + cudf::device_storage_type_t>; + +} // namespace + +cudf::type_id velox_to_cudf_type_id(const TypePtr& type) { + if (cudfDebugEnabled()) { + std::cout << "Converting Velox type " << type->toString() << " to cudf" + << std::endl; + } + switch (type->kind()) { + case TypeKind::BOOLEAN: + return cudf::type_id::BOOL8; + case TypeKind::TINYINT: + return cudf::type_id::INT8; + case TypeKind::SMALLINT: + return cudf::type_id::INT16; + case TypeKind::INTEGER: + // TODO: handle interval types (durations?) + // if (type->isIntervalYearMonth()) { + // return cudf::type_id::...; + // } + if (type->isDate()) { + return cudf::type_id::TIMESTAMP_DAYS; + } + return cudf::type_id::INT32; + case TypeKind::BIGINT: + return cudf::type_id::INT64; + case TypeKind::REAL: + return cudf::type_id::FLOAT32; + case TypeKind::DOUBLE: + return cudf::type_id::FLOAT64; + case TypeKind::VARCHAR: + return cudf::type_id::STRING; + case TypeKind::VARBINARY: + return cudf::type_id::STRING; + case TypeKind::TIMESTAMP: + return cudf::type_id::TIMESTAMP_NANOSECONDS; + // case TypeKind::HUGEINT: return cudf::type_id::DURATION_DAYS; + // TODO: DATE was converted to a logical type: + // https://github.com/facebookincubator/velox/commit/e480f5c03a6c47897ef4488bd56918a89719f908 + // case TypeKind::DATE: return cudf::type_id::DURATION_DAYS; + // case TypeKind::INTERVAL_DAY_TIME: return cudf::type_id::EMPTY; + // TODO: Decimals are now logical types: + // https://github.com/facebookincubator/velox/commit/73d2f935b55f084d30557c7be94b9768efb8e56f + // case TypeKind::SHORT_DECIMAL: return cudf::type_id::DECIMAL64; + // case TypeKind::LONG_DECIMAL: return cudf::type_id::DECIMAL128; + // case TypeKind::ARRAY: return cudf::type_id::EMPTY; + // case TypeKind::MAP: return cudf::type_id::EMPTY; + case TypeKind::ROW: + return cudf::type_id::STRUCT; + // case TypeKind::UNKNOWN: return cudf::type_id::EMPTY; + // case TypeKind::FUNCTION: return cudf::type_id::EMPTY; + // case TypeKind::OPAQUE: return cudf::type_id::EMPTY; + // case TypeKind::INVALID: return cudf::type_id::EMPTY; + default: + CUDF_FAIL("Unsupported Velox type"); + return cudf::type_id::EMPTY; + } +} + +TypePtr cudf_type_id_to_velox_type(cudf::type_id type_id) { + switch (type_id) { + case cudf::type_id::BOOL8: + return BOOLEAN(); + case cudf::type_id::INT8: + return TINYINT(); + case cudf::type_id::INT16: + return SMALLINT(); + case cudf::type_id::INT32: + return INTEGER(); + case cudf::type_id::INT64: + return BIGINT(); + case cudf::type_id::FLOAT32: + return REAL(); + case cudf::type_id::FLOAT64: + return DOUBLE(); + case cudf::type_id::STRING: + return VARCHAR(); + case cudf::type_id::TIMESTAMP_DAYS: + return DATE(); + case cudf::type_id::TIMESTAMP_NANOSECONDS: + return TIMESTAMP(); + // TODO: DATE is now a logical type + // case cudf::type_id::DURATION_DAYS: return ???; + // case cudf::type_id::EMPTY: return TypeKind::INTERVAL_DAY_TIME; + // TODO: DECIMAL is now a logical type + // case cudf::type_id::DECIMAL64: return TypeKind::SHORT_DECIMAL; + // case cudf::type_id::DECIMAL128: return TypeKind::LONG_DECIMAL; + // case cudf::type_id::EMPTY: return TypeKind::ARRAY; + // case cudf::type_id::EMPTY: return TypeKind::MAP; + // case cudf::type_id::STRUCT: + // // TODO: Need parametric type support? + // return ROW(); + // case cudf::type_id::EMPTY: return TypeKind::OPAQUE; + // case cudf::type_id::EMPTY: return TypeKind::UNKNOWN; + default: + return UNKNOWN(); + } +} + +// Convert a Velox vector to a CUDF column +struct copy_to_device { + rmm::cuda_stream_view stream; + + // Fixed width types + template < + typename T, + std::enable_if_t()>* = nullptr> + std::unique_ptr operator()(VectorPtr const& h_vec) const { + VELOX_CHECK_NOT_NULL(h_vec); + using velox_T = cudf_storage_type_t; + if (cudfDebugEnabled()) { + std::cout << "Converting fixed width column" << std::endl; + std::cout << "Encoding: " << h_vec->encoding() << std::endl; + std::cout << "Type: " << h_vec->type()->toString() << std::endl; + std::cout << "velox_T: " << typeid(velox_T{}).name() << std::endl; + } + auto velox_data = h_vec->as>(); + VELOX_CHECK_NOT_NULL(velox_data); + auto velox_data_ptr = velox_data->rawValues(); + cudf::host_span velox_host_span( + velox_data_ptr, int{h_vec->size()}); + auto d_v = cudf::detail::make_device_uvector_sync( + velox_host_span, stream, rmm::mr::get_current_device_resource()); + return std::make_unique( + std::move(d_v), rmm::device_buffer{}, 0); + } + + // Strings + template < + typename T, + std::enable_if_t>* = nullptr> + std::unique_ptr operator()(VectorPtr const& h_vec) const { + if (cudfDebugEnabled()) { + std::cout << "Converting string column" << std::endl; + } + + auto const num_rows = h_vec->size(); + auto h_offsets = std::vector(num_rows + 1); + h_offsets[0] = 0; + auto make_offsets = [&](auto const& vec) { + VELOX_CHECK_NOT_NULL(vec); + if (cudfDebugEnabled()) { + std::cout << "Starting offset calculation" << std::endl; + } + for (auto i = 0; i < num_rows; i++) { + h_offsets[i + 1] = h_offsets[i] + vec->valueAt(i).size(); + } + }; + vector_encoding_dispatcher(h_vec, make_offsets); + + auto d_offsets = cudf::detail::make_device_uvector_sync( + h_offsets, stream, rmm::mr::get_current_device_resource()); + + auto chars_size = h_offsets[num_rows]; + auto h_chars = std::vector(chars_size); + + auto make_chars = [&](auto vec) { + VELOX_CHECK_NOT_NULL(vec); + for (auto i = 0; i < num_rows; i++) { + auto const string_view = vec->valueAt(i); + auto const size = string_view.size(); + auto const offset = h_offsets[i]; + std::copy( + string_view.data(), + string_view.data() + size, + h_chars.begin() + offset); + } + }; + vector_encoding_dispatcher(h_vec, make_chars); + + auto d_chars = cudf::detail::make_device_uvector_sync( + h_chars, stream, rmm::mr::get_current_device_resource()); + + return cudf::make_strings_column( + num_rows, + std::make_unique( + std::move(d_offsets), rmm::device_buffer{}, 0), + d_chars.release(), + 0, + rmm::device_buffer{}); + } + + template < + typename T, + typename... Args, + std::enable_if_t< + not(cudf::is_rep_layout_compatible() or + std::is_same_v)>* = nullptr> + std::unique_ptr operator()(VectorPtr const& h_vec) const { + if (cudfDebugEnabled()) { + std::string error_message = "Unsupported type for to_cudf conversion: "; + error_message += h_vec->type()->toString(); + std::cout << error_message << std::endl; + } + CUDF_FAIL("Unsupported type for to_cudf conversion"); + } +}; + +// Row vector to table +// Vector to column +// template +std::unique_ptr to_cudf_table(const RowVectorPtr& leftBatch) { + VELOX_NVTX_FUNC_RANGE(); + // cudf type dispatcher to copy data from velox vector to cudf column + using cudf_col_ptr = std::unique_ptr; + std::vector cudf_columns; + auto copier = copy_to_device{cudf::get_default_stream()}; + for (auto const& h_vec : leftBatch->children()) { + auto cudf_kind = cudf::data_type{velox_to_cudf_type_id(h_vec->type())}; + auto cudf_column = cudf::type_dispatcher(cudf_kind, copier, h_vec); + cudf_columns.push_back(std::move(cudf_column)); + } + return std::make_unique(std::move(cudf_columns)); +} + +// Convert a CUDF column to a Velox vector +struct copy_to_host { + rmm::cuda_stream_view stream; + memory::MemoryPool* pool_; + + template + static constexpr bool is_supported() { + // return cudf::is_rep_layout_compatible(); + return cudf::is_numeric() and not std::is_same::value; + } + + // Fixed width types + template ()>* = nullptr> + VectorPtr operator()(TypePtr velox_type, cudf::column_view const& col) const { + auto velox_buffer = AlignedBuffer::allocate(col.size(), pool_); + auto velox_col = std::make_shared>( + pool_, + velox_type, + nullptr, + col.size(), + velox_buffer, + std::vector{}); + auto velox_data_ptr = velox_col->mutableRawValues(); + CUDF_CUDA_TRY(cudaMemcpyAsync( + velox_data_ptr, + col.data(), + col.size() * sizeof(T), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + return velox_col; + } + + template < + typename T, + typename... Args, + std::enable_if_t()>* = nullptr> + VectorPtr operator()(Args... args) const { + CUDF_FAIL("Unsupported type for to_velox conversion"); + } +}; + +VectorPtr to_velox_column( + const cudf::column_view& col, + memory::MemoryPool* pool) { + VELOX_NVTX_PRETTY_FUNC_RANGE(); + auto velox_type = cudf_type_id_to_velox_type(col.type().id()); + if (cudfDebugEnabled()) { + std::cout << "Converting to_velox_column: " << velox_type->toString() + << std::endl; + } + // cudf type dispatcher to copy data from cudf column to velox vector + auto copier = copy_to_host{cudf::get_default_stream(), pool}; + return cudf::type_dispatcher(col.type(), copier, velox_type, col); +} + +RowVectorPtr to_velox_column( + const cudf::table_view& table, + memory::MemoryPool* pool, + std::string name_prefix) { + VELOX_NVTX_PRETTY_FUNC_RANGE(); + std::vector children; + std::vector childNames; + std::vector> childTypes; + children.reserve(table.num_columns()); + childNames.reserve(table.num_columns()); + for (auto& col : table) { + children.push_back(to_velox_column(col, pool)); + childNames.push_back(name_prefix + std::to_string(childNames.size())); + } + + childTypes.reserve(children.size()); + for (const auto& child : children) { + childTypes.push_back(child->type()); + } + auto rowType = ROW(std::move(childNames), std::move(childTypes)); + const size_t vectorSize = children.empty() ? 0 : children.front()->size(); + + return std::make_shared( + pool, rowType, BufferPtr(nullptr), vectorSize, children); +} + +namespace with_arrow { + +std::unique_ptr to_cudf_table( + const facebook::velox::RowVectorPtr& veloxTable, // BaseVector or RowVector? + facebook::velox::memory::MemoryPool* pool, + rmm::cuda_stream_view stream) { + // Need to flattenDictionary and flattenConstant, otherwise we observe issues + // in the null mask. + ArrowOptions arrowOptions{true, true}; + ArrowArray arrowArray; + exportToArrow( + std::dynamic_pointer_cast(veloxTable), + arrowArray, + pool, + arrowOptions); + ArrowSchema arrowSchema; + exportToArrow( + std::dynamic_pointer_cast(veloxTable), + arrowSchema, + arrowOptions); + auto tbl = cudf::from_arrow(&arrowSchema, &arrowArray, stream); + + // Release Arrow resources + if (arrowArray.release) { + arrowArray.release(&arrowArray); + } + if (arrowSchema.release) { + arrowSchema.release(&arrowSchema); + } + return tbl; +} + +namespace { + +void to_signed_int_format(char* format) { + VELOX_CHECK_NOT_NULL(format); + switch (format[0]) { + case 'C': + format[0] = 'c'; + break; + case 'S': + format[0] = 's'; + break; + case 'I': + format[0] = 'i'; + break; + case 'L': + format[0] = 'l'; + break; + default: + return; + } + printf( + "Warning: arrowSchema.format: %s, unsigned is treated as signed indices\n", + format); +} + +// Changes all unsigned indices to signed indices for dictionary columns from +// cudf which uses unsigned indices, but velox uses signed indices. +void fix_dictionary_indices(ArrowSchema& arrowSchema) { + if (arrowSchema.dictionary != nullptr) { + to_signed_int_format(const_cast(arrowSchema.format)); + fix_dictionary_indices(*arrowSchema.dictionary); + } + for (size_t i = 0; i < arrowSchema.n_children; ++i) { + VELOX_CHECK_NOT_NULL(arrowSchema.children[i]); + fix_dictionary_indices(*arrowSchema.children[i]); + } +} + +RowVectorPtr to_velox_column( + const cudf::table_view& table, + memory::MemoryPool* pool, + const std::vector& metadata, + rmm::cuda_stream_view stream) { + auto arrowDeviceArray = cudf::to_arrow_host(table, stream); + auto& arrowArray = arrowDeviceArray->array; + + auto arrowSchema = cudf::to_arrow_schema(table, metadata); + // Hack to convert unsigned indices to signed indices for dictionary columns + fix_dictionary_indices(*arrowSchema); + + auto veloxTable = importFromArrowAsOwner(*arrowSchema, arrowArray, pool); + // BaseVector to RowVector + auto casted_ptr = + std::dynamic_pointer_cast(veloxTable); + VELOX_CHECK_NOT_NULL(casted_ptr); + return casted_ptr; +} + +template +std::vector +get_metadata(Iterator begin, Iterator end, const std::string& name_prefix) { + std::vector metadata; + int i = 0; + for (auto c = begin; c < end; c++) { + metadata.push_back(cudf::column_metadata(name_prefix + std::to_string(i))); + metadata.back().children_meta = get_metadata( + c->child_begin(), c->child_end(), name_prefix + std::to_string(i)); + i++; + } + return metadata; +} + +} // namespace + +facebook::velox::RowVectorPtr to_velox_column( + const cudf::table_view& table, + facebook::velox::memory::MemoryPool* pool, + std::string name_prefix, + rmm::cuda_stream_view stream) { + auto metadata = get_metadata(table.begin(), table.end(), name_prefix); + return to_velox_column(table, pool, metadata, stream); +} + +RowVectorPtr to_velox_column( + const cudf::table_view& table, + memory::MemoryPool* pool, + const std::vector& columnNames, + rmm::cuda_stream_view stream) { + std::vector metadata; + for (auto name : columnNames) { + metadata.emplace_back(cudf::column_metadata(name)); + } + return to_velox_column(table, pool, metadata, stream); +} + +} // namespace with_arrow +} // namespace facebook::velox::cudf_velox diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.h b/velox/experimental/cudf/exec/VeloxCudfInterop.h new file mode 100644 index 00000000000..4bd88c1a125 --- /dev/null +++ b/velox/experimental/cudf/exec/VeloxCudfInterop.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "velox/common/memory/Memory.h" +#include "velox/vector/BaseVector.h" +#include "velox/vector/ComplexVector.h" + +#include +#include +#include + +namespace facebook::velox::cudf_velox { + +cudf::type_id velox_to_cudf_type_id(const TypePtr& type); +TypePtr cudf_type_id_to_velox_type(cudf::type_id type_id); + +[[deprecated( + "Use with_arrow::to_cudf_table instead")]] std::unique_ptr +to_cudf_table(const facebook::velox::RowVectorPtr& leftBatch); +facebook::velox::VectorPtr to_velox_column( + const cudf::column_view& col, + facebook::velox::memory::MemoryPool* pool); +[[deprecated( + "Use with_arrow::to_velox_column instead")]] facebook::velox::RowVectorPtr +to_velox_column( + const cudf::table_view& table, + facebook::velox::memory::MemoryPool* pool, + std::string name_prefix = "c"); + +namespace with_arrow { +std::unique_ptr to_cudf_table( + const facebook::velox::RowVectorPtr& veloxTable, + facebook::velox::memory::MemoryPool* pool, + rmm::cuda_stream_view stream); + +facebook::velox::RowVectorPtr to_velox_column( + const cudf::table_view& table, + facebook::velox::memory::MemoryPool* pool, + std::string name_prefix, + rmm::cuda_stream_view stream); + +facebook::velox::RowVectorPtr to_velox_column( + const cudf::table_view& table, + facebook::velox::memory::MemoryPool* pool, + const std::vector& columnNames, + rmm::cuda_stream_view stream); +} // namespace with_arrow + +} // namespace facebook::velox::cudf_velox diff --git a/velox/experimental/cudf/tests/CMakeLists.txt b/velox/experimental/cudf/tests/CMakeLists.txt new file mode 100644 index 00000000000..1847f00cd6d --- /dev/null +++ b/velox/experimental/cudf/tests/CMakeLists.txt @@ -0,0 +1,33 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_executable(velox_cudf_order_by_test Main.cpp OrderByTest.cpp) + +add_test( + NAME velox_cudf_order_by_test + COMMAND velox_cudf_order_by_test + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + +set_tests_properties(velox_cudf_order_by_test PROPERTIES LABELS cuda_driver + TIMEOUT 3000) + +target_link_libraries( + velox_cudf_order_by_test + velox_cudf_exec + velox_exec + velox_exec_test_lib + velox_test_util + gtest + gtest_main + fmt::fmt) diff --git a/velox/experimental/cudf/tests/OrderByTest.cpp b/velox/experimental/cudf/tests/OrderByTest.cpp new file mode 100644 index 00000000000..ff47198a3a8 --- /dev/null +++ b/velox/experimental/cudf/tests/OrderByTest.cpp @@ -0,0 +1,417 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/core/QueryConfig.h" +#include "velox/dwio/common/tests/utils/BatchMaker.h" +#include "velox/exec/PlanNodeStats.h" +#include "velox/exec/tests/utils/AssertQueryBuilder.h" +#include "velox/exec/tests/utils/OperatorTestBase.h" +#include "velox/exec/tests/utils/PlanBuilder.h" +#include "velox/experimental/cudf/exec/ToCudf.h" +#include "velox/experimental/cudf/exec/Utilities.h" + +using namespace facebook::velox; +using namespace facebook::velox::exec; +using namespace facebook::velox::exec::test; +using namespace facebook::velox::common::testutil; + +using facebook::velox::test::BatchMaker; +namespace { + +class OrderByTest : public OperatorTestBase { + protected: + void SetUp() override { + OperatorTestBase::SetUp(); + filesystems::registerLocalFileSystem(); + cudf_velox::registerCudf(); + rng_.seed(123); + + rowType_ = ROW( + {{"c0", INTEGER()}, + {"c1", INTEGER()}, + {"c2", VARCHAR()}, + {"c3", VARCHAR()}}); + } + + void TearDown() override { + cudf_velox::unregisterCudf(); + OperatorTestBase::TearDown(); + } + + void testSingleKey( + const std::vector& input, + const std::string& key) { + core::PlanNodeId orderById; + auto keyIndex = input[0]->type()->asRow().getChildIdx(key); + auto plan = PlanBuilder() + .values(input) + .orderBy({fmt::format("{} ASC NULLS LAST", key)}, false) + .capturePlanNodeId(orderById) + .planNode(); + runTest( + plan, + orderById, + fmt::format("SELECT * FROM tmp ORDER BY {} NULLS LAST", key), + {keyIndex}); + + plan = PlanBuilder() + .values(input) + .orderBy({fmt::format("{} DESC NULLS FIRST", key)}, false) + .planNode(); + runTest( + plan, + orderById, + fmt::format("SELECT * FROM tmp ORDER BY {} DESC NULLS FIRST", key), + {keyIndex}); + } + + void testSingleKey( + const std::vector& input, + const std::string& key, + const std::string& filter) { + core::PlanNodeId orderById; + auto keyIndex = input[0]->type()->asRow().getChildIdx(key); + auto plan = PlanBuilder() + .values(input) + .filter(filter) + .orderBy({fmt::format("{} ASC NULLS LAST", key)}, false) + .capturePlanNodeId(orderById) + .planNode(); + runTest( + plan, + orderById, + fmt::format( + "SELECT * FROM tmp WHERE {} ORDER BY {} NULLS LAST", filter, key), + {keyIndex}); + + plan = PlanBuilder() + .values(input) + .filter(filter) + .orderBy({fmt::format("{} DESC NULLS FIRST", key)}, false) + .capturePlanNodeId(orderById) + .planNode(); + runTest( + plan, + orderById, + fmt::format( + "SELECT * FROM tmp WHERE {} ORDER BY {} DESC NULLS FIRST", + filter, + key), + {keyIndex}); + } + + void testTwoKeys( + const std::vector& input, + const std::string& key1, + const std::string& key2) { + auto& rowType = input[0]->type()->asRow(); + auto keyIndices = {rowType.getChildIdx(key1), rowType.getChildIdx(key2)}; + + std::vector sortOrders = { + core::kAscNullsLast, core::kDescNullsFirst}; + std::vector sortOrderSqls = {"NULLS LAST", "DESC NULLS FIRST"}; + + for (int i = 0; i < sortOrders.size(); i++) { + for (int j = 0; j < sortOrders.size(); j++) { + core::PlanNodeId orderById; + auto plan = PlanBuilder() + .values(input) + .orderBy( + {fmt::format("{} {}", key1, sortOrderSqls[i]), + fmt::format("{} {}", key2, sortOrderSqls[j])}, + false) + .capturePlanNodeId(orderById) + .planNode(); + runTest( + plan, + orderById, + fmt::format( + "SELECT * FROM tmp ORDER BY {} {}, {} {}", + key1, + sortOrderSqls[i], + key2, + sortOrderSqls[j]), + keyIndices); + } + } + } + + void runTest( + core::PlanNodePtr planNode, + const core::PlanNodeId& orderById, + const std::string& duckDbSql, + const std::vector& sortingKeys) { + { + SCOPED_TRACE("run without spilling"); + assertQueryOrdered(planNode, duckDbSql, sortingKeys); + } + } + + std::vector makeVectors( + const RowTypePtr& rowType, + int32_t numVectors, + int32_t rowsPerVector) { + std::vector vectors; + for (int32_t i = 0; i < numVectors; ++i) { + auto vector = std::dynamic_pointer_cast( + facebook::velox::test::BatchMaker::createBatch( + rowType, rowsPerVector, *pool_)); + vectors.push_back(vector); + } + return vectors; + } + + folly::Random::DefaultGenerator rng_; + RowTypePtr rowType_; +}; + +TEST_F(OrderByTest, selectiveFilter) { + vector_size_t batchSize = 1000; + std::vector vectors; + for (int32_t i = 0; i < 3; ++i) { + auto c0 = makeFlatVector( + batchSize, + [&](vector_size_t row) { return batchSize * i + row; }, + nullEvery(5)); + auto c1 = makeFlatVector( + batchSize, [&](vector_size_t row) { return row; }, nullEvery(5)); + auto c2 = makeFlatVector( + batchSize, [](vector_size_t row) { return row * 0.1; }, nullEvery(11)); + vectors.push_back(makeRowVector({c0, c1, c2})); + } + createDuckDbTable(vectors); + + // c0 values are unique across batches + testSingleKey(vectors, "c0", "c0 % 333 = 0"); + + // c1 values are unique only within a batch + testSingleKey(vectors, "c1", "c1 % 333 = 0"); +} + +TEST_F(OrderByTest, singleKey) { + vector_size_t batchSize = 1000; + std::vector vectors; + for (int32_t i = 0; i < 2; ++i) { + auto c0 = makeFlatVector( + batchSize, [&](vector_size_t row) { return row; }, nullEvery(5)); + auto c1 = makeFlatVector( + batchSize, [](vector_size_t row) { return row * 0.1; }, nullEvery(11)); + vectors.push_back(makeRowVector({c0, c1})); + } + createDuckDbTable(vectors); + + testSingleKey(vectors, "c0"); + + // parser doesn't support "is not null" expression, hence, using c0 % 2 >= 0 + testSingleKey(vectors, "c0", "c0 % 2 >= 0"); + + core::PlanNodeId orderById; + auto plan = PlanBuilder() + .values(vectors) + .orderBy({"c0 DESC NULLS LAST"}, false) + .capturePlanNodeId(orderById) + .planNode(); + runTest( + plan, orderById, "SELECT * FROM tmp ORDER BY c0 DESC NULLS LAST", {0}); + + plan = PlanBuilder() + .values(vectors) + .orderBy({"c0 ASC NULLS FIRST"}, false) + .capturePlanNodeId(orderById) + .planNode(); + runTest(plan, orderById, "SELECT * FROM tmp ORDER BY c0 NULLS FIRST", {0}); +} + +TEST_F(OrderByTest, multipleKeys) { + vector_size_t batchSize = 1000; + std::vector vectors; + for (int32_t i = 0; i < 2; ++i) { + // c0: half of rows are null, a quarter is 0 and remaining quarter is 1 + auto c0 = makeFlatVector( + batchSize, [](vector_size_t row) { return row % 4; }, nullEvery(2, 1)); + auto c1 = makeFlatVector( + batchSize, [](vector_size_t row) { return row; }, nullEvery(7)); + auto c2 = makeFlatVector( + batchSize, [](vector_size_t row) { return row * 0.1; }, nullEvery(11)); + vectors.push_back(makeRowVector({c0, c1, c2})); + } + createDuckDbTable(vectors); + + testTwoKeys(vectors, "c0", "c1"); + + core::PlanNodeId orderById; + auto plan = PlanBuilder() + .values(vectors) + .orderBy({"c0 ASC NULLS FIRST", "c1 ASC NULLS LAST"}, false) + .capturePlanNodeId(orderById) + .planNode(); + runTest( + plan, + orderById, + "SELECT * FROM tmp ORDER BY c0 NULLS FIRST, c1 NULLS LAST", + {0, 1}); + + plan = PlanBuilder() + .values(vectors) + .orderBy({"c0 DESC NULLS LAST", "c1 DESC NULLS FIRST"}, false) + .capturePlanNodeId(orderById) + .planNode(); + runTest( + plan, + orderById, + "SELECT * FROM tmp ORDER BY c0 DESC NULLS LAST, c1 DESC NULLS FIRST", + {0, 1}); +} + +TEST_F(OrderByTest, multiBatchResult) { + vector_size_t batchSize = 5000; + std::vector vectors; + for (int32_t i = 0; i < 10; ++i) { + auto c0 = makeFlatVector( + batchSize, + [&](vector_size_t row) { return batchSize * i + row; }, + nullEvery(5)); + auto c1 = makeFlatVector( + batchSize, [](vector_size_t row) { return row * 0.1; }, nullEvery(11)); + vectors.push_back(makeRowVector({c0, c1, c1, c1, c1, c1})); + } + createDuckDbTable(vectors); + + testSingleKey(vectors, "c0"); +} + +TEST_F(OrderByTest, varfields) { + vector_size_t batchSize = 1000; + std::vector vectors; + for (int32_t i = 0; i < 5; ++i) { + auto c0 = makeFlatVector( + batchSize, + [&](vector_size_t row) { return batchSize * i + row; }, + nullEvery(5)); + auto c1 = makeFlatVector( + batchSize, [](vector_size_t row) { return row * 0.1; }, nullEvery(11)); + auto c2 = makeFlatVector( + batchSize, + [](vector_size_t row) { + return StringView::makeInline(std::to_string(row)); + }, + nullEvery(17)); + // TODO: Add support for array/map in createDuckDbTable and verify + // that we can sort by array/map as well. + vectors.push_back(makeRowVector({c0, c1, c2})); + } + createDuckDbTable(vectors); + + testSingleKey(vectors, "c2"); +} + +#if 0 +// flattening for scalar types unsupported in arrow! +TEST_F(OrderByTest, unknown) { + vector_size_t size = 1'000; + auto vector = makeRowVector({ + makeFlatVector(size, [](auto row) { return row % 7; }), + BaseVector::createNullConstant(UNKNOWN(), size, pool()), + }); + + // Exclude "UNKNOWN" column as DuckDB doesn't understand UNKNOWN type + createDuckDbTable( + {makeRowVector({vector->childAt(0)}), + makeRowVector({vector->childAt(0)})}); + + core::PlanNodeId orderById; + auto plan = PlanBuilder() + .values({vector, vector}) + .orderBy({"c0 DESC NULLS LAST"}, false) + .capturePlanNodeId(orderById) + .planNode(); + runTest( + plan, + orderById, + "SELECT *, null FROM tmp ORDER BY c0 DESC NULLS LAST", + {0}); +} + +/// Verifies output batch rows of OrderBy +TEST_F(OrderByTest, outputBatchRows) { + struct { + int numRowsPerBatch; + int preferredOutBatchBytes; + int maxOutBatchRows; + int expectedOutputVectors; + + // TODO: add output size check with spilling enabled + std::string debugString() const { + return fmt::format( + "numRowsPerBatch:{}, preferredOutBatchBytes:{}, maxOutBatchRows:{}, expectedOutputVectors:{}", + numRowsPerBatch, + preferredOutBatchBytes, + maxOutBatchRows, + expectedOutputVectors); + } + } testSettings[] = { + {1024, 1, 100, 1024}, + // estimated size per row is ~2092, set preferredOutBatchBytes to 20920, + // so each batch has 10 rows, so it would return 100 batches + {1000, 20920, 100, 100}, + // same as above, but maxOutBatchRows is 1, so it would return 1000 + // batches + {1000, 20920, 1, 1000}}; + + for (const auto& testData : testSettings) { + SCOPED_TRACE(testData.debugString()); + const vector_size_t batchSize = testData.numRowsPerBatch; + std::vector rowVectors; + auto c0 = makeFlatVector( + batchSize, [&](vector_size_t row) { return row; }, nullEvery(5)); + auto c1 = makeFlatVector( + batchSize, [&](vector_size_t row) { return row; }, nullEvery(11)); + std::vector vectors; + vectors.push_back(c0); + for (int i = 0; i < 256; ++i) { + vectors.push_back(c1); + } + rowVectors.push_back(makeRowVector(vectors)); + createDuckDbTable(rowVectors); + + core::PlanNodeId orderById; + auto plan = PlanBuilder() + .values(rowVectors) + .orderBy({fmt::format("{} ASC NULLS LAST", "c0")}, false) + .capturePlanNodeId(orderById) + .planNode(); + auto queryCtx = core::QueryCtx::create(executor_.get()); + queryCtx->testingOverrideConfigUnsafe( + {{core::QueryConfig::kPreferredOutputBatchBytes, + std::to_string(testData.preferredOutBatchBytes)}, + {core::QueryConfig::kMaxOutputBatchRows, + std::to_string(testData.maxOutBatchRows)}}); + CursorParameters params; + params.planNode = plan; + params.queryCtx = queryCtx; + auto task = assertQueryOrdered( + params, "SELECT * FROM tmp ORDER BY c0 ASC NULLS LAST", {0}); + EXPECT_EQ( + testData.expectedOutputVectors, + toPlanStats(task->taskStats()).at(orderById).outputVectors); + } +} +#endif + +} // namespace diff --git a/velox/experimental/cudf/vector/CMakeLists.txt b/velox/experimental/cudf/vector/CMakeLists.txt new file mode 100644 index 00000000000..d26f0b4c7dc --- /dev/null +++ b/velox/experimental/cudf/vector/CMakeLists.txt @@ -0,0 +1,26 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_library(velox_cudf_vector CudfVector.cpp) + +set_target_properties( + velox_cudf_vector + PROPERTIES CUDA_ARCHITECTURES native) + +target_link_libraries( + velox_cudf_vector + cudf::cudf + velox_exception + velox_common_base + velox_vector) diff --git a/velox/experimental/cudf/vector/CudfVector.cpp b/velox/experimental/cudf/vector/CudfVector.cpp new file mode 100644 index 00000000000..e4a9e845232 --- /dev/null +++ b/velox/experimental/cudf/vector/CudfVector.cpp @@ -0,0 +1,21 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/experimental/cudf/vector/CudfVector.h" + +namespace facebook::velox::cudf_velox { + +} // namespace facebook::velox::cudf_velox diff --git a/velox/experimental/cudf/vector/CudfVector.h b/velox/experimental/cudf/vector/CudfVector.h new file mode 100644 index 00000000000..8ac20f4a3e1 --- /dev/null +++ b/velox/experimental/cudf/vector/CudfVector.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "velox/buffer/Buffer.h" +#include "velox/common/memory/MemoryPool.h" +#include "velox/vector/ComplexVector.h" +#include "velox/vector/TypeAliases.h" + +#include +#include + +#include +#include + +namespace facebook::velox::cudf_velox { + +// Vector class which holds GPU data from cuDF. +class CudfVector : public RowVector { + public: + CudfVector( + velox::memory::MemoryPool* pool, + TypePtr type, + vector_size_t size, + std::unique_ptr&& table, + rmm::cuda_stream_view stream) + : RowVector( + pool, + std::move(type), + BufferPtr(nullptr), + size, + std::vector(), + std::nullopt), + table_{std::move(table)}, + stream_{stream} {} + + rmm::cuda_stream_view stream() const { + return stream_; + } + + cudf::table_view getTableView() const { + return table_->view(); + } + + std::unique_ptr&& release() { + return std::move(table_); + } + + cudf::table_view getTableView() const { + return table_->view(); + } + + private: + std::unique_ptr table_; + rmm::cuda_stream_view stream_; +}; + +using CudfVectorPtr = std::shared_ptr; + +} // namespace facebook::velox::cudf_velox From 2c03cbf7908227e25b5785e06705fee4ac0d2a63 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Tue, 11 Mar 2025 15:04:48 +0000 Subject: [PATCH 02/37] Remove all other operators from ToCudf --- velox/experimental/cudf/exec/ToCudf.cpp | 113 +++--------------------- 1 file changed, 11 insertions(+), 102 deletions(-) diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp index 03ec29b2c95..cfd8f691cae 100644 --- a/velox/experimental/cudf/exec/ToCudf.cpp +++ b/velox/experimental/cudf/exec/ToCudf.cpp @@ -83,64 +83,23 @@ bool CompileState::compile() { return *it; }; - auto is_filter_project_supported = [](const exec::Operator* op) { - if (auto filter_project_op = dynamic_cast(op)) { - auto info = filter_project_op->exprsAndProjection(); - return !info.hasFilter && - ExpressionEvaluator::can_be_evaluated(info.exprs->exprs()); - } - return false; - }; - - auto is_join_supported = [get_plan_node](const exec::Operator* op) { - if (!is_any_of(op)) { - return false; - } - auto plan_node = std::dynamic_pointer_cast( - get_plan_node(op->planNodeId())); - if (!plan_node) { - return false; - } - if (!plan_node->isInnerJoin()) { - return false; - } - if (plan_node->filter() != nullptr) { - return false; - } - return true; + auto is_supported_gpu_operator = [](const exec::Operator* op) { + return is_any_of(op); }; - auto is_supported_gpu_operator = - [is_filter_project_supported, - is_join_supported](const exec::Operator* op) { - return is_any_of< - exec::OrderBy, - exec::HashAggregation, - exec::LocalPartition, - exec::LocalExchange>(op) || - is_filter_project_supported(op) || is_join_supported(op); - }; - std::vector is_supported_gpu_operators(operators.size()); std::transform( operators.begin(), operators.end(), is_supported_gpu_operators.begin(), is_supported_gpu_operator); - auto accepts_gpu_input = [is_filter_project_supported, - is_join_supported](const exec::Operator* op) { - return is_any_of< - exec::OrderBy, - exec::HashAggregation, - exec::LocalPartition>(op) || - is_filter_project_supported(op) || is_join_supported(op); + + auto accepts_gpu_input = [](const exec::Operator* op) { + return is_any_of(op); }; - auto produces_gpu_output = [is_filter_project_supported, - is_join_supported](const exec::Operator* op) { - return is_any_of( - op) || - is_filter_project_supported(op) || - (is_any_of(op) && is_join_supported(op)); + + auto produces_gpu_output = [](const exec::Operator* op) { + return is_any_of(op); }; int32_t operatorsOffset = 0; @@ -166,28 +125,7 @@ bool CompileState::compile() { replace_op.back()->initialize(); } - // This is used to denote if the current operator is kept or replaced. - auto keep_operator = 0; - if (is_join_supported(oper)) { - if (auto joinBuildOp = dynamic_cast(oper)) { - auto plan_node = std::dynamic_pointer_cast( - get_plan_node(joinBuildOp->planNodeId())); - VELOX_CHECK(plan_node != nullptr); - // From-Velox (optional) - replace_op.push_back( - std::make_unique(id, ctx, plan_node)); - replace_op.back()->initialize(); - } else if (auto joinProbeOp = dynamic_cast(oper)) { - auto plan_node = std::dynamic_pointer_cast( - get_plan_node(joinProbeOp->planNodeId())); - VELOX_CHECK(plan_node != nullptr); - // From-Velox (optional) - replace_op.push_back( - std::make_unique(id, ctx, plan_node)); - replace_op.back()->initialize(); - // To-Velox (optional) - } - } else if (auto orderByOp = dynamic_cast(oper)) { + if (auto orderByOp = dynamic_cast(oper)) { auto id = orderByOp->operatorId(); auto plan_node = std::dynamic_pointer_cast( get_plan_node(orderByOp->planNodeId())); @@ -196,34 +134,6 @@ bool CompileState::compile() { replace_op.push_back(std::make_unique(id, ctx, plan_node)); replace_op.back()->initialize(); // To-velox (optional) - } else if (auto hashAggOp = dynamic_cast(oper)) { - auto plan_node = std::dynamic_pointer_cast( - get_plan_node(hashAggOp->planNodeId())); - VELOX_CHECK(plan_node != nullptr); - replace_op.push_back( - std::make_unique(id, ctx, plan_node)); - replace_op.back()->initialize(); - } else if (is_filter_project_supported(oper)) { - auto filterProjectOp = dynamic_cast(oper); - auto info = filterProjectOp->exprsAndProjection(); - auto& id_projections = filterProjectOp->identityProjections(); - auto plan_node = std::dynamic_pointer_cast( - get_plan_node(filterProjectOp->planNodeId())); - // If filter doesn't exist then project should definitely exist so this - // should never hit - VELOX_CHECK(plan_node != nullptr); - replace_op.push_back(std::make_unique( - id, ctx, info, id_projections, nullptr, plan_node)); - replace_op.back()->initialize(); - } else if ( - auto localPartitionOp = dynamic_cast(oper)) { - auto plan_node = - std::dynamic_pointer_cast( - get_plan_node(localPartitionOp->planNodeId())); - VELOX_CHECK(plan_node != nullptr); - replace_op.push_back( - std::make_unique(id, ctx, plan_node)); - replace_op.back()->initialize(); } if (next_operator_is_not_gpu and produces_gpu_output(oper)) { @@ -234,11 +144,10 @@ bool CompileState::compile() { } if (not replace_op.empty()) { - operatorsOffset += - replace_op.size() - 1 + keep_operator; // Check this "- 1" + operatorsOffset += replace_op.size() - 1; [[maybe_unused]] auto replaced = driverFactory_.replaceOperators( driver_, - replacingOperatorIndex + keep_operator, + replacingOperatorIndex, replacingOperatorIndex + 1, std::move(replace_op)); replacements_made = true; From e8461321f2c24c00cdb41d66b628970e09328fb1 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Tue, 11 Mar 2025 19:00:59 +0000 Subject: [PATCH 03/37] Make it compile --- velox/experimental/cudf/CMakeLists.txt | 1 - velox/experimental/cudf/exec/ToCudf.cpp | 10 ------- velox/experimental/cudf/tests/Main.cpp | 29 +++++++++++++++++++++ velox/experimental/cudf/vector/CudfVector.h | 4 --- 4 files changed, 29 insertions(+), 15 deletions(-) create mode 100644 velox/experimental/cudf/tests/Main.cpp diff --git a/velox/experimental/cudf/CMakeLists.txt b/velox/experimental/cudf/CMakeLists.txt index 96fcdb0d557..e2be268915c 100644 --- a/velox/experimental/cudf/CMakeLists.txt +++ b/velox/experimental/cudf/CMakeLists.txt @@ -13,7 +13,6 @@ # limitations under the License. add_subdirectory(exec) -add_subdirectory(connectors) add_subdirectory(vector) if(VELOX_BUILD_TESTING) diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp index cfd8f691cae..c9dd04c9443 100644 --- a/velox/experimental/cudf/exec/ToCudf.cpp +++ b/velox/experimental/cudf/exec/ToCudf.cpp @@ -23,12 +23,7 @@ #include "velox/exec/Operator.h" #include "velox/exec/OrderBy.h" #include "velox/experimental/cudf/exec/CudfConversion.h" -#include "velox/experimental/cudf/exec/CudfFilterProject.h" -#include "velox/experimental/cudf/exec/CudfHashAggregation.h" -#include "velox/experimental/cudf/exec/CudfHashJoin.h" -#include "velox/experimental/cudf/exec/CudfLocalPartition.h" #include "velox/experimental/cudf/exec/CudfOrderBy.h" -#include "velox/experimental/cudf/exec/ExpressionEvaluator.h" #include "velox/experimental/cudf/exec/Utilities.h" #include @@ -232,11 +227,6 @@ void registerCudf() { CUDF_FUNC_RANGE(); cudaFree(0); // to init context. - if (cudfDebugEnabled()) { - std::cout << "Registering CudfHashJoinBridgeTranslator" << std::endl; - } - exec::Operator::registerOperator( - std::make_unique()); if (cudfDebugEnabled()) { std::cout << "Registering cudfDriverAdapter" << std::endl; } diff --git a/velox/experimental/cudf/tests/Main.cpp b/velox/experimental/cudf/tests/Main.cpp new file mode 100644 index 00000000000..164b6422fe8 --- /dev/null +++ b/velox/experimental/cudf/tests/Main.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "velox/common/process/ThreadDebugInfo.h" + +#include +#include +#include + +// This main is needed for some tests on linux. +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + // Signal handler required for ThreadDebugInfoTest + facebook::velox::process::addDefaultFatalSignalHandler(); + folly::Init init(&argc, &argv, false); + return RUN_ALL_TESTS(); +} diff --git a/velox/experimental/cudf/vector/CudfVector.h b/velox/experimental/cudf/vector/CudfVector.h index 8ac20f4a3e1..db1590b3c08 100644 --- a/velox/experimental/cudf/vector/CudfVector.h +++ b/velox/experimental/cudf/vector/CudfVector.h @@ -59,10 +59,6 @@ class CudfVector : public RowVector { return std::move(table_); } - cudf::table_view getTableView() const { - return table_->view(); - } - private: std::unique_ptr table_; rmm::cuda_stream_view stream_; From 5ee96b3ffd5044c2eec519e57e36abeb4078b4aa Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Tue, 11 Mar 2025 19:43:13 +0000 Subject: [PATCH 04/37] remove unused interop code --- .../cudf/exec/VeloxCudfInterop.cpp | 324 ------------------ .../experimental/cudf/exec/VeloxCudfInterop.h | 23 +- 2 files changed, 2 insertions(+), 345 deletions(-) diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp index 3fd2b046787..bc906ef1b4a 100644 --- a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp +++ b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp @@ -53,330 +53,6 @@ namespace facebook::velox::cudf_velox { -namespace { - -template -constexpr decltype(auto) -vector_encoding_dispatcher(VectorPtr vec, Functor f, Ts&&... args) { - using facebook::velox::VectorEncoding::Simple; - switch (vec->encoding()) { - case Simple::FLAT: - return f(vec->as>(), std::forward(args)...); - case Simple::DICTIONARY: - return f(vec->as>(), std::forward(args)...); - default: { - if (cudfDebugEnabled()) { - std::cout << "Unsupported Velox encoding: " << vec->encoding() - << std::endl; - } - CUDF_FAIL("Unsupported Velox encoding"); - } - } -} - -// TODO: dispatch other duration/timestamp types! -template -using cudf_storage_type_t = std::conditional_t< - std::is_same_v, - cudf::timestamp_D::rep, - cudf::device_storage_type_t>; - -} // namespace - -cudf::type_id velox_to_cudf_type_id(const TypePtr& type) { - if (cudfDebugEnabled()) { - std::cout << "Converting Velox type " << type->toString() << " to cudf" - << std::endl; - } - switch (type->kind()) { - case TypeKind::BOOLEAN: - return cudf::type_id::BOOL8; - case TypeKind::TINYINT: - return cudf::type_id::INT8; - case TypeKind::SMALLINT: - return cudf::type_id::INT16; - case TypeKind::INTEGER: - // TODO: handle interval types (durations?) - // if (type->isIntervalYearMonth()) { - // return cudf::type_id::...; - // } - if (type->isDate()) { - return cudf::type_id::TIMESTAMP_DAYS; - } - return cudf::type_id::INT32; - case TypeKind::BIGINT: - return cudf::type_id::INT64; - case TypeKind::REAL: - return cudf::type_id::FLOAT32; - case TypeKind::DOUBLE: - return cudf::type_id::FLOAT64; - case TypeKind::VARCHAR: - return cudf::type_id::STRING; - case TypeKind::VARBINARY: - return cudf::type_id::STRING; - case TypeKind::TIMESTAMP: - return cudf::type_id::TIMESTAMP_NANOSECONDS; - // case TypeKind::HUGEINT: return cudf::type_id::DURATION_DAYS; - // TODO: DATE was converted to a logical type: - // https://github.com/facebookincubator/velox/commit/e480f5c03a6c47897ef4488bd56918a89719f908 - // case TypeKind::DATE: return cudf::type_id::DURATION_DAYS; - // case TypeKind::INTERVAL_DAY_TIME: return cudf::type_id::EMPTY; - // TODO: Decimals are now logical types: - // https://github.com/facebookincubator/velox/commit/73d2f935b55f084d30557c7be94b9768efb8e56f - // case TypeKind::SHORT_DECIMAL: return cudf::type_id::DECIMAL64; - // case TypeKind::LONG_DECIMAL: return cudf::type_id::DECIMAL128; - // case TypeKind::ARRAY: return cudf::type_id::EMPTY; - // case TypeKind::MAP: return cudf::type_id::EMPTY; - case TypeKind::ROW: - return cudf::type_id::STRUCT; - // case TypeKind::UNKNOWN: return cudf::type_id::EMPTY; - // case TypeKind::FUNCTION: return cudf::type_id::EMPTY; - // case TypeKind::OPAQUE: return cudf::type_id::EMPTY; - // case TypeKind::INVALID: return cudf::type_id::EMPTY; - default: - CUDF_FAIL("Unsupported Velox type"); - return cudf::type_id::EMPTY; - } -} - -TypePtr cudf_type_id_to_velox_type(cudf::type_id type_id) { - switch (type_id) { - case cudf::type_id::BOOL8: - return BOOLEAN(); - case cudf::type_id::INT8: - return TINYINT(); - case cudf::type_id::INT16: - return SMALLINT(); - case cudf::type_id::INT32: - return INTEGER(); - case cudf::type_id::INT64: - return BIGINT(); - case cudf::type_id::FLOAT32: - return REAL(); - case cudf::type_id::FLOAT64: - return DOUBLE(); - case cudf::type_id::STRING: - return VARCHAR(); - case cudf::type_id::TIMESTAMP_DAYS: - return DATE(); - case cudf::type_id::TIMESTAMP_NANOSECONDS: - return TIMESTAMP(); - // TODO: DATE is now a logical type - // case cudf::type_id::DURATION_DAYS: return ???; - // case cudf::type_id::EMPTY: return TypeKind::INTERVAL_DAY_TIME; - // TODO: DECIMAL is now a logical type - // case cudf::type_id::DECIMAL64: return TypeKind::SHORT_DECIMAL; - // case cudf::type_id::DECIMAL128: return TypeKind::LONG_DECIMAL; - // case cudf::type_id::EMPTY: return TypeKind::ARRAY; - // case cudf::type_id::EMPTY: return TypeKind::MAP; - // case cudf::type_id::STRUCT: - // // TODO: Need parametric type support? - // return ROW(); - // case cudf::type_id::EMPTY: return TypeKind::OPAQUE; - // case cudf::type_id::EMPTY: return TypeKind::UNKNOWN; - default: - return UNKNOWN(); - } -} - -// Convert a Velox vector to a CUDF column -struct copy_to_device { - rmm::cuda_stream_view stream; - - // Fixed width types - template < - typename T, - std::enable_if_t()>* = nullptr> - std::unique_ptr operator()(VectorPtr const& h_vec) const { - VELOX_CHECK_NOT_NULL(h_vec); - using velox_T = cudf_storage_type_t; - if (cudfDebugEnabled()) { - std::cout << "Converting fixed width column" << std::endl; - std::cout << "Encoding: " << h_vec->encoding() << std::endl; - std::cout << "Type: " << h_vec->type()->toString() << std::endl; - std::cout << "velox_T: " << typeid(velox_T{}).name() << std::endl; - } - auto velox_data = h_vec->as>(); - VELOX_CHECK_NOT_NULL(velox_data); - auto velox_data_ptr = velox_data->rawValues(); - cudf::host_span velox_host_span( - velox_data_ptr, int{h_vec->size()}); - auto d_v = cudf::detail::make_device_uvector_sync( - velox_host_span, stream, rmm::mr::get_current_device_resource()); - return std::make_unique( - std::move(d_v), rmm::device_buffer{}, 0); - } - - // Strings - template < - typename T, - std::enable_if_t>* = nullptr> - std::unique_ptr operator()(VectorPtr const& h_vec) const { - if (cudfDebugEnabled()) { - std::cout << "Converting string column" << std::endl; - } - - auto const num_rows = h_vec->size(); - auto h_offsets = std::vector(num_rows + 1); - h_offsets[0] = 0; - auto make_offsets = [&](auto const& vec) { - VELOX_CHECK_NOT_NULL(vec); - if (cudfDebugEnabled()) { - std::cout << "Starting offset calculation" << std::endl; - } - for (auto i = 0; i < num_rows; i++) { - h_offsets[i + 1] = h_offsets[i] + vec->valueAt(i).size(); - } - }; - vector_encoding_dispatcher(h_vec, make_offsets); - - auto d_offsets = cudf::detail::make_device_uvector_sync( - h_offsets, stream, rmm::mr::get_current_device_resource()); - - auto chars_size = h_offsets[num_rows]; - auto h_chars = std::vector(chars_size); - - auto make_chars = [&](auto vec) { - VELOX_CHECK_NOT_NULL(vec); - for (auto i = 0; i < num_rows; i++) { - auto const string_view = vec->valueAt(i); - auto const size = string_view.size(); - auto const offset = h_offsets[i]; - std::copy( - string_view.data(), - string_view.data() + size, - h_chars.begin() + offset); - } - }; - vector_encoding_dispatcher(h_vec, make_chars); - - auto d_chars = cudf::detail::make_device_uvector_sync( - h_chars, stream, rmm::mr::get_current_device_resource()); - - return cudf::make_strings_column( - num_rows, - std::make_unique( - std::move(d_offsets), rmm::device_buffer{}, 0), - d_chars.release(), - 0, - rmm::device_buffer{}); - } - - template < - typename T, - typename... Args, - std::enable_if_t< - not(cudf::is_rep_layout_compatible() or - std::is_same_v)>* = nullptr> - std::unique_ptr operator()(VectorPtr const& h_vec) const { - if (cudfDebugEnabled()) { - std::string error_message = "Unsupported type for to_cudf conversion: "; - error_message += h_vec->type()->toString(); - std::cout << error_message << std::endl; - } - CUDF_FAIL("Unsupported type for to_cudf conversion"); - } -}; - -// Row vector to table -// Vector to column -// template -std::unique_ptr to_cudf_table(const RowVectorPtr& leftBatch) { - VELOX_NVTX_FUNC_RANGE(); - // cudf type dispatcher to copy data from velox vector to cudf column - using cudf_col_ptr = std::unique_ptr; - std::vector cudf_columns; - auto copier = copy_to_device{cudf::get_default_stream()}; - for (auto const& h_vec : leftBatch->children()) { - auto cudf_kind = cudf::data_type{velox_to_cudf_type_id(h_vec->type())}; - auto cudf_column = cudf::type_dispatcher(cudf_kind, copier, h_vec); - cudf_columns.push_back(std::move(cudf_column)); - } - return std::make_unique(std::move(cudf_columns)); -} - -// Convert a CUDF column to a Velox vector -struct copy_to_host { - rmm::cuda_stream_view stream; - memory::MemoryPool* pool_; - - template - static constexpr bool is_supported() { - // return cudf::is_rep_layout_compatible(); - return cudf::is_numeric() and not std::is_same::value; - } - - // Fixed width types - template ()>* = nullptr> - VectorPtr operator()(TypePtr velox_type, cudf::column_view const& col) const { - auto velox_buffer = AlignedBuffer::allocate(col.size(), pool_); - auto velox_col = std::make_shared>( - pool_, - velox_type, - nullptr, - col.size(), - velox_buffer, - std::vector{}); - auto velox_data_ptr = velox_col->mutableRawValues(); - CUDF_CUDA_TRY(cudaMemcpyAsync( - velox_data_ptr, - col.data(), - col.size() * sizeof(T), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - return velox_col; - } - - template < - typename T, - typename... Args, - std::enable_if_t()>* = nullptr> - VectorPtr operator()(Args... args) const { - CUDF_FAIL("Unsupported type for to_velox conversion"); - } -}; - -VectorPtr to_velox_column( - const cudf::column_view& col, - memory::MemoryPool* pool) { - VELOX_NVTX_PRETTY_FUNC_RANGE(); - auto velox_type = cudf_type_id_to_velox_type(col.type().id()); - if (cudfDebugEnabled()) { - std::cout << "Converting to_velox_column: " << velox_type->toString() - << std::endl; - } - // cudf type dispatcher to copy data from cudf column to velox vector - auto copier = copy_to_host{cudf::get_default_stream(), pool}; - return cudf::type_dispatcher(col.type(), copier, velox_type, col); -} - -RowVectorPtr to_velox_column( - const cudf::table_view& table, - memory::MemoryPool* pool, - std::string name_prefix) { - VELOX_NVTX_PRETTY_FUNC_RANGE(); - std::vector children; - std::vector childNames; - std::vector> childTypes; - children.reserve(table.num_columns()); - childNames.reserve(table.num_columns()); - for (auto& col : table) { - children.push_back(to_velox_column(col, pool)); - childNames.push_back(name_prefix + std::to_string(childNames.size())); - } - - childTypes.reserve(children.size()); - for (const auto& child : children) { - childTypes.push_back(child->type()); - } - auto rowType = ROW(std::move(childNames), std::move(childTypes)); - const size_t vectorSize = children.empty() ? 0 : children.front()->size(); - - return std::make_shared( - pool, rowType, BufferPtr(nullptr), vectorSize, children); -} - namespace with_arrow { std::unique_ptr to_cudf_table( diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.h b/velox/experimental/cudf/exec/VeloxCudfInterop.h index 4bd88c1a125..fbb4eda355e 100644 --- a/velox/experimental/cudf/exec/VeloxCudfInterop.h +++ b/velox/experimental/cudf/exec/VeloxCudfInterop.h @@ -24,25 +24,7 @@ #include #include -namespace facebook::velox::cudf_velox { - -cudf::type_id velox_to_cudf_type_id(const TypePtr& type); -TypePtr cudf_type_id_to_velox_type(cudf::type_id type_id); - -[[deprecated( - "Use with_arrow::to_cudf_table instead")]] std::unique_ptr -to_cudf_table(const facebook::velox::RowVectorPtr& leftBatch); -facebook::velox::VectorPtr to_velox_column( - const cudf::column_view& col, - facebook::velox::memory::MemoryPool* pool); -[[deprecated( - "Use with_arrow::to_velox_column instead")]] facebook::velox::RowVectorPtr -to_velox_column( - const cudf::table_view& table, - facebook::velox::memory::MemoryPool* pool, - std::string name_prefix = "c"); - -namespace with_arrow { +namespace facebook::velox::cudf_velox::with_arrow { std::unique_ptr to_cudf_table( const facebook::velox::RowVectorPtr& veloxTable, facebook::velox::memory::MemoryPool* pool, @@ -59,6 +41,5 @@ facebook::velox::RowVectorPtr to_velox_column( facebook::velox::memory::MemoryPool* pool, const std::vector& columnNames, rmm::cuda_stream_view stream); -} // namespace with_arrow -} // namespace facebook::velox::cudf_velox +} // namespace facebook::velox::cudf_velox::with_arrow From 46f0a94d600a7e5fef96106c963ba6a503b4d9aa Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Wed, 12 Mar 2025 17:07:07 +0000 Subject: [PATCH 05/37] Pin rmm and kvikio --- CMake/resolve_dependency_modules/cudf.cmake | 29 +++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/CMake/resolve_dependency_modules/cudf.cmake b/CMake/resolve_dependency_modules/cudf.cmake index a6abdb61254..0c57dbdcf01 100644 --- a/CMake/resolve_dependency_modules/cudf.cmake +++ b/CMake/resolve_dependency_modules/cudf.cmake @@ -14,6 +14,22 @@ include_guard(GLOBAL) +set(VELOX_rmm_VERSION 25.04) +set(VELOX_rmm_BUILD_SHA256_CHECKSUM + 294905094213a2d1fd8e024500359ff871bc52f913a3fbaca3514727c49f62de) +set(VELOX_rmm_SOURCE_URL + "https://github.com/rapidsai/rmm/archive/d8b7dacdeda302d2e37313c02d14ef5e1d1e98ea.tar.gz" +) +velox_resolve_dependency_url(rmm) + +set(VELOX_kvikio_VERSION 25.04) +set(VELOX_kvikio_BUILD_SHA256_CHECKSUM + 4a0b15295d0a397433930bf9a309e4ad2361b25dc7a7b3e6a35d0c9419d0cb62) +set(VELOX_kvikio_SOURCE_URL + "https://github.com/rapidsai/kvikio/archive/5c710f37236bda76e447e929e17b1efbc6c632c3.tar.gz" +) +velox_resolve_dependency_url(kvikio) + set(VELOX_cudf_VERSION 25.04) set(VELOX_cudf_BUILD_SHA256_CHECKSUM e5a1900dfaf23dab2c5808afa17a2d04fa867d2892ecec1cb37908f3b73715c2) @@ -36,6 +52,19 @@ string( APPEND CMAKE_CXX_FLAGS " -Wno-non-virtual-dtor -Wno-missing-field-initializers -Wno-deprecated-copy") +FetchContent_Declare( + rmm + URL ${VELOX_rmm_SOURCE_URL} + URL_HASH ${VELOX_rmm_BUILD_SHA256_CHECKSUM} + UPDATE_DISCONNECTED 1) + +FetchContent_Declare( + kvikio + URL ${VELOX_kvikio_SOURCE_URL} + URL_HASH ${VELOX_kvikio_BUILD_SHA256_CHECKSUM} + SOURCE_SUBDIR cpp + UPDATE_DISCONNECTED 1) + FetchContent_Declare( cudf URL ${VELOX_cudf_SOURCE_URL} From 47913ae957634e7a767c7ace145a91957b8261f4 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Wed, 12 Mar 2025 18:31:58 +0000 Subject: [PATCH 06/37] Remove manually adding fmt --- CMakeLists.txt | 5 ----- 1 file changed, 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 173a40fe1d3..80c13aa430a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -378,11 +378,6 @@ endif() message("FINAL CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}") -if(NOT TARGET fmt::fmt) - velox_set_source(fmt) - velox_resolve_dependency(fmt 9.0.0) -endif() - if(VELOX_ENABLE_GPU) enable_language(CUDA) # Determine CUDA_ARCHITECTURES automatically. From 5effbf2099f62ac127ee6a9e4afe5aac766ecc54 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Wed, 12 Mar 2025 18:32:52 +0000 Subject: [PATCH 07/37] Remove debug prints --- .../experimental/cudf/exec/CudfConversion.cpp | 13 ----- velox/experimental/cudf/exec/CudfOrderBy.cpp | 9 ---- velox/experimental/cudf/exec/ToCudf.cpp | 53 ------------------- velox/experimental/cudf/exec/Utilities.cpp | 5 -- velox/experimental/cudf/exec/Utilities.h | 6 --- 5 files changed, 86 deletions(-) diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp index 68b3ac283de..e9d6b161812 100644 --- a/velox/experimental/cudf/exec/CudfConversion.cpp +++ b/velox/experimental/cudf/exec/CudfConversion.cpp @@ -121,13 +121,6 @@ RowVectorPtr CudfFromVelox::getOutput() { VELOX_CHECK_NOT_NULL(tbl); - if (cudfDebugEnabled()) { - std::cout << "CudfFromVelox table number of columns: " << tbl->num_columns() - << std::endl; - std::cout << "CudfFromVelox table number of rows: " << tbl->num_rows() - << std::endl; - } - // Return a CudfVector that owns the cudf table auto const size = tbl->num_rows(); return std::make_shared( @@ -173,12 +166,6 @@ RowVectorPtr CudfToVelox::getOutput() { inputs_.pop_front(); VELOX_CHECK_NOT_NULL(tbl); - if (cudfDebugEnabled()) { - std::cout << "CudfToVelox table number of columns: " << tbl->num_columns() - << std::endl; - std::cout << "CudfToVelox table number of rows: " << tbl->num_rows() - << std::endl; - } if (tbl->num_rows() == 0) { return nullptr; } diff --git a/velox/experimental/cudf/exec/CudfOrderBy.cpp b/velox/experimental/cudf/exec/CudfOrderBy.cpp index ac32cc8e23f..ad897d3b68f 100644 --- a/velox/experimental/cudf/exec/CudfOrderBy.cpp +++ b/velox/experimental/cudf/exec/CudfOrderBy.cpp @@ -61,9 +61,6 @@ CudfOrderBy::CudfOrderBy( ? cudf::null_order::BEFORE : cudf::null_order::AFTER); } - if (cudfDebugEnabled()) { - std::cout << "Number of Sort keys: " << sort_keys_.size() << std::endl; - } } void CudfOrderBy::addInput(RowVectorPtr input) { @@ -94,12 +91,6 @@ void CudfOrderBy::noMoreInput() { inputs_.clear(); VELOX_CHECK_NOT_NULL(tbl); - if (cudfDebugEnabled()) { - std::cout << "Sort input table number of columns: " << tbl->num_columns() - << std::endl; - std::cout << "Sort input table number of rows: " << tbl->num_rows() - << std::endl; - } auto keys = tbl->view().select(sort_keys_); auto values = tbl->view(); diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp index c9dd04c9443..2741ee246aa 100644 --- a/velox/experimental/cudf/exec/ToCudf.cpp +++ b/velox/experimental/cudf/exec/ToCudf.cpp @@ -42,25 +42,9 @@ bool is_any_of(const Base* p) { static bool _cudfIsRegistered = false; bool CompileState::compile() { - if (cudfDebugEnabled()) { - std::cout << "Calling cudfDriverAdapter" << std::endl; - } - auto operators = driver_.operators(); auto& nodes = planNodes_; - if (cudfDebugEnabled()) { - std::cout << "Number of operators: " << operators.size() << std::endl; - for (auto& op : operators) { - std::cout << " Operator: ID " << op->operatorId() << ": " - << op->toString() << std::endl; - } - std::cout << "Number of plan nodes: " << nodes.size() << std::endl; - for (auto& node : nodes) { - std::cout << " Plan node: ID " << node->id() << ": " << node->toString(); - } - } - // Make sure operator states are initialized. We will need to inspect some of // them during the transformation. driver_.initializeOperators(); @@ -149,14 +133,6 @@ bool CompileState::compile() { } } - if (cudfDebugEnabled()) { - operators = driver_.operators(); - std::cout << "Number of new operators: " << operators.size() << std::endl; - for (auto& op : operators) { - std::cout << " Operator: ID " << op->operatorId() << ": " - << op->toString() << std::endl; - } - } return replacements_made; } @@ -167,30 +143,14 @@ struct cudfDriverAdapter { cudfDriverAdapter(std::shared_ptr mr) : mr_(mr) { - if (cudfDebugEnabled()) { - std::cout << "cudfDriverAdapter constructor" << std::endl; - } planNodes_ = std::make_shared>>(); } - ~cudfDriverAdapter() { - if (cudfDebugEnabled()) { - std::cout << "cudfDriverAdapter destructor" << std::endl; - printf( - "cached planNodes_ %p, %ld\n", - planNodes_.get(), - planNodes_.use_count()); - } - } - // Call operator needed by DriverAdapter bool operator()(const exec::DriverFactory& factory, exec::Driver& driver) { auto state = CompileState(factory, driver, *planNodes_); // Stored planNodes_ from inspect. - if (cudfDebugEnabled()) { - printf("driver.planNodes_=%p\n", planNodes_.get()); - } auto res = state.compile(); return res; } @@ -209,9 +169,6 @@ struct cudfDriverAdapter { // signature: std::function inspect; // call: adapter.inspect(planFragment); planNodes_->clear(); - if (cudfDebugEnabled()) { - std::cout << "Inspecting PlanFragment" << std::endl; - } if (planNodes_) { storePlanNodes(planFragment.planNode); } @@ -227,15 +184,8 @@ void registerCudf() { CUDF_FUNC_RANGE(); cudaFree(0); // to init context. - if (cudfDebugEnabled()) { - std::cout << "Registering cudfDriverAdapter" << std::endl; - } - const char* env_cudf_mr = std::getenv("VELOX_CUDF_MEMORY_RESOURCE"); auto mr_mode = env_cudf_mr != nullptr ? env_cudf_mr : "async"; - if (cudfDebugEnabled()) { - std::cout << "Setting cuDF memory resource to " << mr_mode << std::endl; - } auto mr = cudf_velox::create_memory_resource(mr_mode); cudf::set_current_device_resource(mr.get()); cudfDriverAdapter cda{mr}; @@ -245,9 +195,6 @@ void registerCudf() { } void unregisterCudf() { - if (cudfDebugEnabled()) { - std::cout << "Unregistering cudfDriverAdapter" << std::endl; - } exec::DriverFactory::adapters.clear(); _cudfIsRegistered = false; } diff --git a/velox/experimental/cudf/exec/Utilities.cpp b/velox/experimental/cudf/exec/Utilities.cpp index f12b08a341c..3cd3d4d5b22 100644 --- a/velox/experimental/cudf/exec/Utilities.cpp +++ b/velox/experimental/cudf/exec/Utilities.cpp @@ -90,11 +90,6 @@ cudf::detail::cuda_stream_pool& cudfGlobalStreamPool() { return cudf::detail::global_cuda_stream_pool(); }; -bool cudfDebugEnabled() { - const char* env_cudf_debug = std::getenv("VELOX_CUDF_DEBUG"); - return env_cudf_debug != nullptr && std::stoi(env_cudf_debug); -} - std::unique_ptr concatenateTables( std::vector> tables, rmm::cuda_stream_view stream) { diff --git a/velox/experimental/cudf/exec/Utilities.h b/velox/experimental/cudf/exec/Utilities.h index 87a496a8cff..73a88490d0f 100644 --- a/velox/experimental/cudf/exec/Utilities.h +++ b/velox/experimental/cudf/exec/Utilities.h @@ -38,12 +38,6 @@ create_memory_resource(std::string_view mode); */ [[nodiscard]] cudf::detail::cuda_stream_pool& cudfGlobalStreamPool(); -/** - * @brief Returns true if the VELOX_CUDF_DEBUG environment variable is set to a - * nonzero value. - */ -bool cudfDebugEnabled(); - // Concatenate a vector of cuDF tables into a single table std::unique_ptr concatenateTables( std::vector> tables, From 0e43fc172dbc2b49f9812e47189e401555ce6681 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Wed, 12 Mar 2025 18:45:13 +0000 Subject: [PATCH 08/37] Re-enable some warnings --- CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 80c13aa430a..d7966304c1b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -369,8 +369,6 @@ if(ENABLE_ALL_WARNINGS) -Wno-unused-parameter \ -Wno-sign-compare \ -Wno-ignored-qualifiers \ - -Wno-missing-field-initializers \ - -Wno-deprecated-copy \ ${KNOWN_COMPILER_SPECIFIC_WARNINGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra ${KNOWN_WARNINGS}") From 9dc09e1f117597f37f43ef112ced05f2d44d85e6 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Thu, 13 Mar 2025 07:35:59 +0000 Subject: [PATCH 09/37] update cmake on centos --- scripts/setup-centos9.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/setup-centos9.sh b/scripts/setup-centos9.sh index 2ce9f869317..6c72f96c9ec 100755 --- a/scripts/setup-centos9.sh +++ b/scripts/setup-centos9.sh @@ -68,7 +68,7 @@ function install_build_prerequisites { dnf_install ninja-build cmake ccache gcc-toolset-12 git wget which dnf_install autoconf automake python3-devel pip libtool - pip install cmake==3.28.3 + pip install cmake==3.30.4 if [[ ${USE_CLANG} != "false" ]]; then install_clang15 From e0ccfbdc6bb63534c0610e234c82eb7b20d0071b Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Thu, 13 Mar 2025 07:38:09 +0000 Subject: [PATCH 10/37] Add our team to codeowners --- .github/CODEOWNERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index c24be384421..649e3694e27 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -32,6 +32,9 @@ scripts/ @assignUser @majetideepak # Breeze velox/experimental/breeze @dreveman +# cuDF +velox/experimental/cudf @bdice @karthikeyann @devavret + # Parquet velox/dwio/parquet/ @majetideepak From d00db4f4aacb5da3cecd5dea4143952e89a88564 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Thu, 13 Mar 2025 07:46:33 +0000 Subject: [PATCH 11/37] Check off some todos --- velox/experimental/cudf/exec/CudfConversion.cpp | 4 ++-- velox/experimental/cudf/exec/CudfOrderBy.cpp | 3 --- velox/experimental/cudf/exec/CudfOrderBy.h | 1 - 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp index e9d6b161812..9b4f02ad62e 100644 --- a/velox/experimental/cudf/exec/CudfConversion.cpp +++ b/velox/experimental/cudf/exec/CudfConversion.cpp @@ -130,6 +130,7 @@ RowVectorPtr CudfFromVelox::getOutput() { void CudfFromVelox::close() { cudf::get_default_stream().synchronize(); exec::Operator::close(); + inputs_.clear(); } CudfToVelox::CudfToVelox( @@ -179,8 +180,7 @@ RowVectorPtr CudfToVelox::getOutput() { void CudfToVelox::close() { exec::Operator::close(); - // TODO: Release stored inputs if needed - // TODO: Release cudf memory resources + inputs_.clear(); } } // namespace facebook::velox::cudf_velox diff --git a/velox/experimental/cudf/exec/CudfOrderBy.cpp b/velox/experimental/cudf/exec/CudfOrderBy.cpp index ad897d3b68f..3667a8b8000 100644 --- a/velox/experimental/cudf/exec/CudfOrderBy.cpp +++ b/velox/experimental/cudf/exec/CudfOrderBy.cpp @@ -41,7 +41,6 @@ CudfOrderBy::CudfOrderBy( "CudfOrderBy"), NvtxHelper(nvtx3::rgb{64, 224, 208}, operatorId), // Turquoise orderByNode_(orderByNode) { - maxOutputRows_ = outputBatchRows(std::nullopt); sort_keys_.reserve(orderByNode->sortingKeys().size()); column_order_.reserve(orderByNode->sortingKeys().size()); null_order_.reserve(orderByNode->sortingKeys().size()); @@ -74,8 +73,6 @@ void CudfOrderBy::addInput(RowVectorPtr input) { void CudfOrderBy::noMoreInput() { exec::Operator::noMoreInput(); - // TODO: Get total row count, batch output - // maxOutputRows_ = outputBatchRows(total_row_count); VELOX_NVTX_OPERATOR_FUNC_RANGE(); diff --git a/velox/experimental/cudf/exec/CudfOrderBy.h b/velox/experimental/cudf/exec/CudfOrderBy.h index 28c89cec8e4..1583494e2f3 100644 --- a/velox/experimental/cudf/exec/CudfOrderBy.h +++ b/velox/experimental/cudf/exec/CudfOrderBy.h @@ -63,7 +63,6 @@ class CudfOrderBy : public exec::Operator, public NvtxHelper { std::vector column_order_; std::vector null_order_; bool finished_{false}; - uint32_t maxOutputRows_; }; } // namespace facebook::velox::cudf_velox From 5accb509db8d08c2be56979615bdac39a10928c2 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Thu, 13 Mar 2025 09:12:35 +0000 Subject: [PATCH 12/37] remove commented tests --- velox/experimental/cudf/tests/OrderByTest.cpp | 96 ------------------- 1 file changed, 96 deletions(-) diff --git a/velox/experimental/cudf/tests/OrderByTest.cpp b/velox/experimental/cudf/tests/OrderByTest.cpp index ff47198a3a8..86ad291a3ff 100644 --- a/velox/experimental/cudf/tests/OrderByTest.cpp +++ b/velox/experimental/cudf/tests/OrderByTest.cpp @@ -31,7 +31,6 @@ using namespace facebook::velox::exec; using namespace facebook::velox::exec::test; using namespace facebook::velox::common::testutil; -using facebook::velox::test::BatchMaker; namespace { class OrderByTest : public OperatorTestBase { @@ -312,8 +311,6 @@ TEST_F(OrderByTest, varfields) { return StringView::makeInline(std::to_string(row)); }, nullEvery(17)); - // TODO: Add support for array/map in createDuckDbTable and verify - // that we can sort by array/map as well. vectors.push_back(makeRowVector({c0, c1, c2})); } createDuckDbTable(vectors); @@ -321,97 +318,4 @@ TEST_F(OrderByTest, varfields) { testSingleKey(vectors, "c2"); } -#if 0 -// flattening for scalar types unsupported in arrow! -TEST_F(OrderByTest, unknown) { - vector_size_t size = 1'000; - auto vector = makeRowVector({ - makeFlatVector(size, [](auto row) { return row % 7; }), - BaseVector::createNullConstant(UNKNOWN(), size, pool()), - }); - - // Exclude "UNKNOWN" column as DuckDB doesn't understand UNKNOWN type - createDuckDbTable( - {makeRowVector({vector->childAt(0)}), - makeRowVector({vector->childAt(0)})}); - - core::PlanNodeId orderById; - auto plan = PlanBuilder() - .values({vector, vector}) - .orderBy({"c0 DESC NULLS LAST"}, false) - .capturePlanNodeId(orderById) - .planNode(); - runTest( - plan, - orderById, - "SELECT *, null FROM tmp ORDER BY c0 DESC NULLS LAST", - {0}); -} - -/// Verifies output batch rows of OrderBy -TEST_F(OrderByTest, outputBatchRows) { - struct { - int numRowsPerBatch; - int preferredOutBatchBytes; - int maxOutBatchRows; - int expectedOutputVectors; - - // TODO: add output size check with spilling enabled - std::string debugString() const { - return fmt::format( - "numRowsPerBatch:{}, preferredOutBatchBytes:{}, maxOutBatchRows:{}, expectedOutputVectors:{}", - numRowsPerBatch, - preferredOutBatchBytes, - maxOutBatchRows, - expectedOutputVectors); - } - } testSettings[] = { - {1024, 1, 100, 1024}, - // estimated size per row is ~2092, set preferredOutBatchBytes to 20920, - // so each batch has 10 rows, so it would return 100 batches - {1000, 20920, 100, 100}, - // same as above, but maxOutBatchRows is 1, so it would return 1000 - // batches - {1000, 20920, 1, 1000}}; - - for (const auto& testData : testSettings) { - SCOPED_TRACE(testData.debugString()); - const vector_size_t batchSize = testData.numRowsPerBatch; - std::vector rowVectors; - auto c0 = makeFlatVector( - batchSize, [&](vector_size_t row) { return row; }, nullEvery(5)); - auto c1 = makeFlatVector( - batchSize, [&](vector_size_t row) { return row; }, nullEvery(11)); - std::vector vectors; - vectors.push_back(c0); - for (int i = 0; i < 256; ++i) { - vectors.push_back(c1); - } - rowVectors.push_back(makeRowVector(vectors)); - createDuckDbTable(rowVectors); - - core::PlanNodeId orderById; - auto plan = PlanBuilder() - .values(rowVectors) - .orderBy({fmt::format("{} ASC NULLS LAST", "c0")}, false) - .capturePlanNodeId(orderById) - .planNode(); - auto queryCtx = core::QueryCtx::create(executor_.get()); - queryCtx->testingOverrideConfigUnsafe( - {{core::QueryConfig::kPreferredOutputBatchBytes, - std::to_string(testData.preferredOutBatchBytes)}, - {core::QueryConfig::kMaxOutputBatchRows, - std::to_string(testData.maxOutBatchRows)}}); - CursorParameters params; - params.planNode = plan; - params.queryCtx = queryCtx; - auto task = assertQueryOrdered( - params, "SELECT * FROM tmp ORDER BY c0 ASC NULLS LAST", {0}); - EXPECT_EQ( - testData.expectedOutputVectors, - toPlanStats(task->taskStats()).at(orderById).outputVectors); - } -} -#endif - } // namespace From 5fe25754128ae8c324f1a87a13ea50111a7c09fc Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Mon, 17 Mar 2025 09:48:48 +0000 Subject: [PATCH 13/37] Ignore known warnings just for cudf_exec target --- velox/experimental/cudf/exec/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/velox/experimental/cudf/exec/CMakeLists.txt b/velox/experimental/cudf/exec/CMakeLists.txt index c47d3a5065a..fc6a5f4cf69 100644 --- a/velox/experimental/cudf/exec/CMakeLists.txt +++ b/velox/experimental/cudf/exec/CMakeLists.txt @@ -32,3 +32,8 @@ target_link_libraries( velox_exception velox_common_base velox_exec) + +target_compile_options( + velox_cudf_exec + PRIVATE + -Wno-missing-field-initializers) From c9b2c1aeeedb0662a8bc99bd11eb78eeaff38ba4 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Mon, 17 Mar 2025 09:51:12 +0000 Subject: [PATCH 14/37] Misc review fixes: - West const - header cleanup - nodiscard --- .../experimental/cudf/exec/CudfConversion.cpp | 6 ++--- velox/experimental/cudf/exec/CudfOrderBy.cpp | 3 --- velox/experimental/cudf/exec/CudfOrderBy.h | 3 --- velox/experimental/cudf/exec/Utilities.cpp | 24 +++++++++---------- velox/experimental/cudf/exec/Utilities.h | 4 ++-- 5 files changed, 17 insertions(+), 23 deletions(-) diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp index 9b4f02ad62e..bed0e7e0e9e 100644 --- a/velox/experimental/cudf/exec/CudfConversion.cpp +++ b/velox/experimental/cudf/exec/CudfConversion.cpp @@ -92,8 +92,8 @@ void CudfFromVelox::addInput(RowVectorPtr input) { RowVectorPtr CudfFromVelox::getOutput() { VELOX_NVTX_OPERATOR_FUNC_RANGE(); - auto const target_output_size = preferred_gpu_batch_size_rows(); - auto const exit_early = finished_ or + const auto target_output_size = preferred_gpu_batch_size_rows(); + const auto exit_early = finished_ or (current_output_size_ < target_output_size and not noMoreInput_) or inputs_.empty(); finished_ = noMoreInput_; @@ -122,7 +122,7 @@ RowVectorPtr CudfFromVelox::getOutput() { VELOX_CHECK_NOT_NULL(tbl); // Return a CudfVector that owns the cudf table - auto const size = tbl->num_rows(); + const auto size = tbl->num_rows(); return std::make_shared( input->pool(), outputType_, size, std::move(tbl), stream); } diff --git a/velox/experimental/cudf/exec/CudfOrderBy.cpp b/velox/experimental/cudf/exec/CudfOrderBy.cpp index 3667a8b8000..796143ca26c 100644 --- a/velox/experimental/cudf/exec/CudfOrderBy.cpp +++ b/velox/experimental/cudf/exec/CudfOrderBy.cpp @@ -13,9 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "velox/exec/Driver.h" -#include "velox/exec/Operator.h" -#include "velox/vector/ComplexVector.h" #include #include diff --git a/velox/experimental/cudf/exec/CudfOrderBy.h b/velox/experimental/cudf/exec/CudfOrderBy.h index 1583494e2f3..481be54d16e 100644 --- a/velox/experimental/cudf/exec/CudfOrderBy.h +++ b/velox/experimental/cudf/exec/CudfOrderBy.h @@ -16,9 +16,6 @@ #pragma once -#include "velox/core/Expressions.h" -#include "velox/core/PlanNode.h" -#include "velox/exec/Driver.h" #include "velox/exec/Operator.h" #include "velox/experimental/cudf/exec/NvtxHelper.h" #include "velox/experimental/cudf/vector/CudfVector.h" diff --git a/velox/experimental/cudf/exec/Utilities.cpp b/velox/experimental/cudf/exec/Utilities.cpp index 3cd3d4d5b22..19c2e2b9646 100644 --- a/velox/experimental/cudf/exec/Utilities.cpp +++ b/velox/experimental/cudf/exec/Utilities.cpp @@ -14,10 +14,6 @@ * limitations under the License. */ -#include -#include -#include - #include "velox/experimental/cudf/exec/Utilities.h" #include @@ -36,32 +32,36 @@ #include #include +#include +#include +#include + namespace facebook::velox::cudf_velox { namespace { -auto make_cuda_mr() { +[[nodiscard]] auto make_cuda_mr() { return std::make_shared(); } -auto make_pool_mr() { +[[nodiscard]] auto make_pool_mr() { return rmm::mr::make_owning_wrapper( make_cuda_mr(), rmm::percent_of_free_device_memory(50)); } -auto make_async_mr() { +[[nodiscard]] auto make_async_mr() { return std::make_shared(); } -auto make_managed_mr() { +[[nodiscard]] auto make_managed_mr() { return std::make_shared(); } -auto make_arena_mr() { +[[nodiscard]] auto make_arena_mr() { return rmm::mr::make_owning_wrapper( make_cuda_mr()); } -auto make_managed_pool_mr() { +[[nodiscard]] auto make_managed_pool_mr() { return rmm::mr::make_owning_wrapper( make_managed_mr(), rmm::percent_of_free_device_memory(50)); } @@ -105,7 +105,7 @@ std::unique_ptr concatenateTables( tables.begin(), tables.end(), std::back_inserter(tableViews), - [&](auto const& tbl) { return tbl->view(); }); + [&](const auto& tbl) { return tbl->view(); }); return cudf::concatenate( tableViews, stream, cudf::get_current_device_resource_ref()); } @@ -122,7 +122,7 @@ std::unique_ptr getConcatenatedTable( inputStreams.reserve(tables.size()); tableViews.reserve(tables.size()); - for (auto const& table : tables) { + for (const auto& table : tables) { VELOX_CHECK_NOT_NULL(table); tableViews.push_back(table->getTableView()); inputStreams.push_back(table->stream()); diff --git a/velox/experimental/cudf/exec/Utilities.h b/velox/experimental/cudf/exec/Utilities.h index 73a88490d0f..e3480c7d41c 100644 --- a/velox/experimental/cudf/exec/Utilities.h +++ b/velox/experimental/cudf/exec/Utilities.h @@ -39,14 +39,14 @@ create_memory_resource(std::string_view mode); [[nodiscard]] cudf::detail::cuda_stream_pool& cudfGlobalStreamPool(); // Concatenate a vector of cuDF tables into a single table -std::unique_ptr concatenateTables( +[[nodiscard]] std::unique_ptr concatenateTables( std::vector> tables, rmm::cuda_stream_view stream); // Concatenate a vector of cuDF tables into a single table. // This function joins the streams owned by individual tables on the passed // stream. Inputs are not safe to use after calling this function. -std::unique_ptr getConcatenatedTable( +[[nodiscard]] std::unique_ptr getConcatenatedTable( std::vector& tables, rmm::cuda_stream_view stream); From 5a75b9ec5180f56e7f7c417b9c2892d169f4c3b9 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Wed, 19 Mar 2025 13:09:26 +0000 Subject: [PATCH 15/37] Misc review changes requested by @bdice --- CMake/resolve_dependency_modules/cudf.cmake | 14 ++++++++++ CMakeLists.txt | 6 ++++- velox/experimental/cudf/CMakeLists.txt | 1 - velox/experimental/cudf/exec/CMakeLists.txt | 4 --- velox/experimental/cudf/exec/ToCudf.cpp | 12 ++++++--- .../cudf/exec/VeloxCudfInterop.cpp | 2 +- velox/experimental/cudf/vector/CMakeLists.txt | 26 ------------------- velox/experimental/cudf/vector/CudfVector.cpp | 21 --------------- 8 files changed, 29 insertions(+), 57 deletions(-) delete mode 100644 velox/experimental/cudf/vector/CMakeLists.txt delete mode 100644 velox/experimental/cudf/vector/CudfVector.cpp diff --git a/CMake/resolve_dependency_modules/cudf.cmake b/CMake/resolve_dependency_modules/cudf.cmake index 0c57dbdcf01..4c9d015dbf1 100644 --- a/CMake/resolve_dependency_modules/cudf.cmake +++ b/CMake/resolve_dependency_modules/cudf.cmake @@ -14,6 +14,14 @@ include_guard(GLOBAL) +set(VELOX_rapids_cmake_VERSION 25.04) +set(VELOX_rapids_cmake_BUILD_SHA256_CHECKSUM + 458c14eaff9000067b32d65c8c914f4521090ede7690e16eb57035ce731386db) +set(VELOX_rapids_cmake_SOURCE_URL + "https://github.com/rapidsai/rapids-cmake/archive/7828fc8ff2e9f4fa86099f3c844505c2f47ac672.tar.gz" +) +velox_resolve_dependency_url(rapids_cmake) + set(VELOX_rmm_VERSION 25.04) set(VELOX_rmm_BUILD_SHA256_CHECKSUM 294905094213a2d1fd8e024500359ff871bc52f913a3fbaca3514727c49f62de) @@ -52,6 +60,12 @@ string( APPEND CMAKE_CXX_FLAGS " -Wno-non-virtual-dtor -Wno-missing-field-initializers -Wno-deprecated-copy") +FetchContent_Declare( + rapids-cmake + URL ${VELOX_rapids_cmake_SOURCE_URL} + URL_HASH ${VELOX_rapids_cmake_BUILD_SHA256_CHECKSUM} + UPDATE_DISCONNECTED 1) + FetchContent_Declare( rmm URL ${VELOX_rmm_SOURCE_URL} diff --git a/CMakeLists.txt b/CMakeLists.txt index d7966304c1b..b93de09121f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -227,7 +227,8 @@ find_package(OpenSSL REQUIRED) if(VELOX_ENABLE_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER - AND NOT CMAKE_CXX_COMPILER_LAUNCHER) + AND NOT CMAKE_CXX_COMPILER_LAUNCHER + AND NOT CMAKE_CUDA_COMPILER_LAUNCHER) find_program(CCACHE_FOUND ccache) @@ -464,6 +465,9 @@ else() endif() velox_resolve_dependency(glog) +velox_set_source(fmt) +velox_resolve_dependency(fmt 9.0.0) + if(${VELOX_BUILD_MINIMAL_WITH_DWIO} OR ${VELOX_ENABLE_HIVE_CONNECTOR}) # DWIO needs all sorts of stream compression libraries. # diff --git a/velox/experimental/cudf/CMakeLists.txt b/velox/experimental/cudf/CMakeLists.txt index e2be268915c..6d400056c35 100644 --- a/velox/experimental/cudf/CMakeLists.txt +++ b/velox/experimental/cudf/CMakeLists.txt @@ -13,7 +13,6 @@ # limitations under the License. add_subdirectory(exec) -add_subdirectory(vector) if(VELOX_BUILD_TESTING) add_subdirectory(tests) diff --git a/velox/experimental/cudf/exec/CMakeLists.txt b/velox/experimental/cudf/exec/CMakeLists.txt index fc6a5f4cf69..51cd5bde74c 100644 --- a/velox/experimental/cudf/exec/CMakeLists.txt +++ b/velox/experimental/cudf/exec/CMakeLists.txt @@ -20,10 +20,6 @@ add_library( Utilities.cpp VeloxCudfInterop.cpp) -set_target_properties( - velox_cudf_exec - PROPERTIES CUDA_ARCHITECTURES native) - target_link_libraries( velox_cudf_exec cudf::cudf diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp index 2741ee246aa..957be223efe 100644 --- a/velox/experimental/cudf/exec/ToCudf.cpp +++ b/velox/experimental/cudf/exec/ToCudf.cpp @@ -34,11 +34,15 @@ namespace facebook::velox::cudf_velox { +namespace { + template bool is_any_of(const Base* p) { return ((dynamic_cast(p) != nullptr) || ...); } +} // namespace + static bool _cudfIsRegistered = false; bool CompileState::compile() { @@ -109,10 +113,8 @@ bool CompileState::compile() { auto plan_node = std::dynamic_pointer_cast( get_plan_node(orderByOp->planNodeId())); VELOX_CHECK(plan_node != nullptr); - // From-velox (optional) replace_op.push_back(std::make_unique(id, ctx, plan_node)); replace_op.back()->initialize(); - // To-velox (optional) } if (next_operator_is_not_gpu and produces_gpu_output(oper)) { @@ -176,13 +178,17 @@ struct cudfDriverAdapter { }; void registerCudf() { + if (cudfIsRegistered()) { + return; + } + const char* env_cudf_disabled = std::getenv("VELOX_CUDF_DISABLED"); if (env_cudf_disabled != nullptr && std::stoi(env_cudf_disabled)) { return; } CUDF_FUNC_RANGE(); - cudaFree(0); // to init context. + cudaFree(0); // Initialize CUDA context at startup const char* env_cudf_mr = std::getenv("VELOX_CUDF_MEMORY_RESOURCE"); auto mr_mode = env_cudf_mr != nullptr ? env_cudf_mr : "async"; diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp index bc906ef1b4a..cf0f202da6c 100644 --- a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp +++ b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp @@ -56,7 +56,7 @@ namespace facebook::velox::cudf_velox { namespace with_arrow { std::unique_ptr to_cudf_table( - const facebook::velox::RowVectorPtr& veloxTable, // BaseVector or RowVector? + const facebook::velox::RowVectorPtr& veloxTable, facebook::velox::memory::MemoryPool* pool, rmm::cuda_stream_view stream) { // Need to flattenDictionary and flattenConstant, otherwise we observe issues diff --git a/velox/experimental/cudf/vector/CMakeLists.txt b/velox/experimental/cudf/vector/CMakeLists.txt deleted file mode 100644 index d26f0b4c7dc..00000000000 --- a/velox/experimental/cudf/vector/CMakeLists.txt +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -add_library(velox_cudf_vector CudfVector.cpp) - -set_target_properties( - velox_cudf_vector - PROPERTIES CUDA_ARCHITECTURES native) - -target_link_libraries( - velox_cudf_vector - cudf::cudf - velox_exception - velox_common_base - velox_vector) diff --git a/velox/experimental/cudf/vector/CudfVector.cpp b/velox/experimental/cudf/vector/CudfVector.cpp deleted file mode 100644 index e4a9e845232..00000000000 --- a/velox/experimental/cudf/vector/CudfVector.cpp +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "velox/experimental/cudf/vector/CudfVector.h" - -namespace facebook::velox::cudf_velox { - -} // namespace facebook::velox::cudf_velox From 3f4ca09f07a5414ccc0d6ddf8809ef37963dbb1e Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Wed, 19 Mar 2025 13:43:21 +0000 Subject: [PATCH 16/37] Remove only cudf adapter --- velox/experimental/cudf/exec/ToCudf.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp index 957be223efe..90cb5988fef 100644 --- a/velox/experimental/cudf/exec/ToCudf.cpp +++ b/velox/experimental/cudf/exec/ToCudf.cpp @@ -201,7 +201,15 @@ void registerCudf() { } void unregisterCudf() { - exec::DriverFactory::adapters.clear(); + exec::DriverFactory::adapters.erase( + std::remove_if( + exec::DriverFactory::adapters.begin(), + exec::DriverFactory::adapters.end(), + [](const exec::DriverAdapter& adapter) { + return adapter.label == "cuDF"; + }), + exec::DriverFactory::adapters.end()); + _cudfIsRegistered = false; } From 6f7d72e38c37ac23a906b4938fa86fc6dc8878e9 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Wed, 19 Mar 2025 14:14:12 +0000 Subject: [PATCH 17/37] Add clang format to our subdir --- velox/experimental/cudf/.clang-format | 27 +++++++++++++++++++ .../experimental/cudf/exec/CudfConversion.cpp | 12 ++++----- velox/experimental/cudf/exec/CudfConversion.h | 6 ++--- velox/experimental/cudf/exec/CudfOrderBy.cpp | 10 +++---- velox/experimental/cudf/exec/CudfOrderBy.h | 3 ++- velox/experimental/cudf/exec/ToCudf.cpp | 7 ++--- velox/experimental/cudf/exec/Utilities.cpp | 12 ++++----- velox/experimental/cudf/exec/Utilities.h | 7 ++--- .../cudf/exec/VeloxCudfInterop.cpp | 14 +++++----- velox/experimental/cudf/tests/OrderByTest.cpp | 9 ++++--- 10 files changed, 69 insertions(+), 38 deletions(-) create mode 100644 velox/experimental/cudf/.clang-format diff --git a/velox/experimental/cudf/.clang-format b/velox/experimental/cudf/.clang-format new file mode 100644 index 00000000000..7b028e6ff68 --- /dev/null +++ b/velox/experimental/cudf/.clang-format @@ -0,0 +1,27 @@ +BasedOnStyle: InheritParentConfig +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^"velox/experimental/' # velox/experimental includes + Priority: 0 + - Regex: '^"' # quoted includes + Priority: 1 + - Regex: '^<(benchmarks|tests)/' # benchmark includes + Priority: 2 + - Regex: '^ #include -#include "velox/experimental/cudf/exec/CudfConversion.h" -#include "velox/experimental/cudf/exec/NvtxHelper.h" -#include "velox/experimental/cudf/exec/Utilities.h" -#include "velox/experimental/cudf/exec/VeloxCudfInterop.h" -#include "velox/experimental/cudf/vector/CudfVector.h" - namespace facebook::velox::cudf_velox { namespace { diff --git a/velox/experimental/cudf/exec/CudfConversion.h b/velox/experimental/cudf/exec/CudfConversion.h index c75f8464b36..480912256f9 100644 --- a/velox/experimental/cudf/exec/CudfConversion.h +++ b/velox/experimental/cudf/exec/CudfConversion.h @@ -16,15 +16,15 @@ #pragma once +#include "velox/experimental/cudf/exec/NvtxHelper.h" +#include "velox/experimental/cudf/vector/CudfVector.h" + #include "velox/exec/Driver.h" #include "velox/exec/Operator.h" #include "velox/vector/ComplexVector.h" #include -#include "velox/experimental/cudf/exec/NvtxHelper.h" -#include "velox/experimental/cudf/vector/CudfVector.h" - #include #include #include diff --git a/velox/experimental/cudf/exec/CudfOrderBy.cpp b/velox/experimental/cudf/exec/CudfOrderBy.cpp index 796143ca26c..cb6f34f7dda 100644 --- a/velox/experimental/cudf/exec/CudfOrderBy.cpp +++ b/velox/experimental/cudf/exec/CudfOrderBy.cpp @@ -14,16 +14,16 @@ * limitations under the License. */ -#include -#include -#include -#include - #include "velox/experimental/cudf/exec/CudfOrderBy.h" #include "velox/experimental/cudf/exec/NvtxHelper.h" #include "velox/experimental/cudf/exec/Utilities.h" #include "velox/experimental/cudf/exec/VeloxCudfInterop.h" +#include +#include +#include +#include + namespace facebook::velox::cudf_velox { CudfOrderBy::CudfOrderBy( diff --git a/velox/experimental/cudf/exec/CudfOrderBy.h b/velox/experimental/cudf/exec/CudfOrderBy.h index 481be54d16e..29b225e1e2f 100644 --- a/velox/experimental/cudf/exec/CudfOrderBy.h +++ b/velox/experimental/cudf/exec/CudfOrderBy.h @@ -16,9 +16,10 @@ #pragma once -#include "velox/exec/Operator.h" #include "velox/experimental/cudf/exec/NvtxHelper.h" #include "velox/experimental/cudf/vector/CudfVector.h" + +#include "velox/exec/Operator.h" #include "velox/vector/ComplexVector.h" #include diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp index 90cb5988fef..3be5cfe97fd 100644 --- a/velox/experimental/cudf/exec/ToCudf.cpp +++ b/velox/experimental/cudf/exec/ToCudf.cpp @@ -14,7 +14,11 @@ * limitations under the License. */ +#include "velox/experimental/cudf/exec/CudfConversion.h" +#include "velox/experimental/cudf/exec/CudfOrderBy.h" #include "velox/experimental/cudf/exec/ToCudf.h" +#include "velox/experimental/cudf/exec/Utilities.h" + #include "velox/exec/Driver.h" #include "velox/exec/FilterProject.h" #include "velox/exec/HashAggregation.h" @@ -22,9 +26,6 @@ #include "velox/exec/HashProbe.h" #include "velox/exec/Operator.h" #include "velox/exec/OrderBy.h" -#include "velox/experimental/cudf/exec/CudfConversion.h" -#include "velox/experimental/cudf/exec/CudfOrderBy.h" -#include "velox/experimental/cudf/exec/Utilities.h" #include diff --git a/velox/experimental/cudf/exec/Utilities.cpp b/velox/experimental/cudf/exec/Utilities.cpp index 19c2e2b9646..dbf49c65b89 100644 --- a/velox/experimental/cudf/exec/Utilities.cpp +++ b/velox/experimental/cudf/exec/Utilities.cpp @@ -16,7 +16,11 @@ #include "velox/experimental/cudf/exec/Utilities.h" -#include +#include +#include +#include +#include +#include #include #include @@ -26,11 +30,7 @@ #include #include -#include -#include -#include -#include -#include +#include #include #include diff --git a/velox/experimental/cudf/exec/Utilities.h b/velox/experimental/cudf/exec/Utilities.h index e3480c7d41c..1e5912359b5 100644 --- a/velox/experimental/cudf/exec/Utilities.h +++ b/velox/experimental/cudf/exec/Utilities.h @@ -16,15 +16,16 @@ #pragma once -#include -#include - #include "velox/experimental/cudf/vector/CudfVector.h" #include #include + #include +#include +#include + namespace facebook::velox::cudf_velox { /** diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp index cf0f202da6c..bc86939e996 100644 --- a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp +++ b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp @@ -14,6 +14,10 @@ * limitations under the License. */ +#include "velox/experimental/cudf/exec/NvtxHelper.h" +#include "velox/experimental/cudf/exec/Utilities.h" +#include "velox/experimental/cudf/exec/VeloxCudfInterop.h" + #include "velox/common/memory/Memory.h" #include "velox/type/Type.h" #include "velox/vector/BaseVector.h" @@ -40,17 +44,13 @@ #include #include -#include "velox/experimental/cudf/exec/NvtxHelper.h" -#include "velox/experimental/cudf/exec/Utilities.h" -#include "velox/experimental/cudf/exec/VeloxCudfInterop.h" - -#include -#include - #include #include #include +#include +#include + namespace facebook::velox::cudf_velox { namespace with_arrow { diff --git a/velox/experimental/cudf/tests/OrderByTest.cpp b/velox/experimental/cudf/tests/OrderByTest.cpp index 86ad291a3ff..d3eb5a75867 100644 --- a/velox/experimental/cudf/tests/OrderByTest.cpp +++ b/velox/experimental/cudf/tests/OrderByTest.cpp @@ -13,9 +13,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include +#include "velox/experimental/cudf/exec/ToCudf.h" +#include "velox/experimental/cudf/exec/Utilities.h" -#include #include "velox/common/base/tests/GTestUtils.h" #include "velox/core/QueryConfig.h" #include "velox/dwio/common/tests/utils/BatchMaker.h" @@ -23,8 +23,9 @@ #include "velox/exec/tests/utils/AssertQueryBuilder.h" #include "velox/exec/tests/utils/OperatorTestBase.h" #include "velox/exec/tests/utils/PlanBuilder.h" -#include "velox/experimental/cudf/exec/ToCudf.h" -#include "velox/experimental/cudf/exec/Utilities.h" + +#include +#include using namespace facebook::velox; using namespace facebook::velox::exec; From 2d679a4e8dac26601b4d72a1e962e26a8070168b Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Thu, 20 Mar 2025 17:58:53 +0000 Subject: [PATCH 18/37] Fix style --- velox/experimental/cudf/exec/CMakeLists.txt | 5 +---- velox/experimental/cudf/exec/NvtxHelper.h | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/velox/experimental/cudf/exec/CMakeLists.txt b/velox/experimental/cudf/exec/CMakeLists.txt index 51cd5bde74c..430ce71bca6 100644 --- a/velox/experimental/cudf/exec/CMakeLists.txt +++ b/velox/experimental/cudf/exec/CMakeLists.txt @@ -29,7 +29,4 @@ target_link_libraries( velox_common_base velox_exec) -target_compile_options( - velox_cudf_exec - PRIVATE - -Wno-missing-field-initializers) +target_compile_options(velox_cudf_exec PRIVATE -Wno-missing-field-initializers) diff --git a/velox/experimental/cudf/exec/NvtxHelper.h b/velox/experimental/cudf/exec/NvtxHelper.h index 4e4efca7a08..892b1976365 100644 --- a/velox/experimental/cudf/exec/NvtxHelper.h +++ b/velox/experimental/cudf/exec/NvtxHelper.h @@ -17,6 +17,7 @@ #pragma once #include + #include namespace facebook::velox::cudf_velox { From 757bce934a3f6586f25bfe32eda218c05d0f6a7c Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Fri, 21 Mar 2025 08:56:57 +0000 Subject: [PATCH 19/37] Fix naming --- .../experimental/cudf/exec/CudfConversion.cpp | 37 ++++--- velox/experimental/cudf/exec/CudfConversion.h | 2 +- velox/experimental/cudf/exec/CudfOrderBy.cpp | 38 +++---- velox/experimental/cudf/exec/CudfOrderBy.h | 6 +- velox/experimental/cudf/exec/NvtxHelper.h | 14 +-- velox/experimental/cudf/exec/ToCudf.cpp | 102 +++++++++--------- velox/experimental/cudf/exec/Utilities.cpp | 32 +++--- velox/experimental/cudf/exec/Utilities.h | 2 +- .../cudf/exec/VeloxCudfInterop.cpp | 42 ++++---- .../experimental/cudf/exec/VeloxCudfInterop.h | 8 +- 10 files changed, 141 insertions(+), 142 deletions(-) diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp index f47f0c239eb..bfe8ff01b26 100644 --- a/velox/experimental/cudf/exec/CudfConversion.cpp +++ b/velox/experimental/cudf/exec/CudfConversion.cpp @@ -50,13 +50,12 @@ RowVectorPtr mergeRowVectors( return copy; } -cudf::size_type preferred_gpu_batch_size_rows() { - constexpr cudf::size_type default_gpu_batch_size_rows = 100000; - const char* env_cudf_gpu_batch_size_rows = +cudf::size_type preferredGpuBatchSizeRows() { + constexpr cudf::size_type kDefaultGpuBatchSizeRows = 100000; + const char* envCudfGpuBatchSizeRows = std::getenv("VELOX_CUDF_GPU_BATCH_SIZE_ROWS"); - return env_cudf_gpu_batch_size_rows != nullptr - ? std::stoi(env_cudf_gpu_batch_size_rows) - : default_gpu_batch_size_rows; + return envCudfGpuBatchSizeRows != nullptr ? std::stoi(envCudfGpuBatchSizeRows) + : kDefaultGpuBatchSizeRows; } } // namespace @@ -85,26 +84,26 @@ void CudfFromVelox::addInput(RowVectorPtr input) { // Accumulate inputs inputs_.push_back(input); - current_output_size_ += input->size(); + currentOutputSize_ += input->size(); } } } RowVectorPtr CudfFromVelox::getOutput() { VELOX_NVTX_OPERATOR_FUNC_RANGE(); - const auto target_output_size = preferred_gpu_batch_size_rows(); - const auto exit_early = finished_ or - (current_output_size_ < target_output_size and not noMoreInput_) or + const auto kTargetOutputSize = preferredGpuBatchSizeRows(); + const auto kExitEarly = finished_ or + (currentOutputSize_ < kTargetOutputSize and not noMoreInput_) or inputs_.empty(); finished_ = noMoreInput_; - if (exit_early) { + if (kExitEarly) { return nullptr; } // Combine all input RowVectors into a single RowVector and clear inputs auto input = mergeRowVectors(inputs_, inputs_[0]->pool()); inputs_.clear(); - current_output_size_ = 0; + currentOutputSize_ = 0; // Early return if no input if (input->size() == 0) { @@ -115,16 +114,16 @@ RowVectorPtr CudfFromVelox::getOutput() { auto stream = cudfGlobalStreamPool().get_stream(); // Convert RowVector to cudf table - auto tbl = with_arrow::to_cudf_table(input, input->pool(), stream); + auto tbl = with_arrow::toCudfTable(input, input->pool(), stream); stream.synchronize(); VELOX_CHECK_NOT_NULL(tbl); // Return a CudfVector that owns the cudf table - const auto size = tbl->num_rows(); + const auto kSize = tbl->num_rows(); return std::make_shared( - input->pool(), outputType_, size, std::move(tbl), stream); + input->pool(), outputType_, kSize, std::move(tbl), stream); } void CudfFromVelox::close() { @@ -149,9 +148,9 @@ CudfToVelox::CudfToVelox( void CudfToVelox::addInput(RowVectorPtr input) { // Accumulate inputs if (input->size() > 0) { - auto cudf_input = std::dynamic_pointer_cast(input); - VELOX_CHECK_NOT_NULL(cudf_input); - inputs_.push_back(std::move(cudf_input)); + auto cudfInput = std::dynamic_pointer_cast(input); + VELOX_CHECK_NOT_NULL(cudfInput); + inputs_.push_back(std::move(cudfInput)); } } @@ -171,7 +170,7 @@ RowVectorPtr CudfToVelox::getOutput() { return nullptr; } RowVectorPtr output = - with_arrow::to_velox_column(tbl->view(), pool(), "", stream); + with_arrow::toVeloxColumn(tbl->view(), pool(), "", stream); stream.synchronize(); finished_ = noMoreInput_ && inputs_.empty(); output->setType(outputType_); diff --git a/velox/experimental/cudf/exec/CudfConversion.h b/velox/experimental/cudf/exec/CudfConversion.h index 480912256f9..16259aa2f8f 100644 --- a/velox/experimental/cudf/exec/CudfConversion.h +++ b/velox/experimental/cudf/exec/CudfConversion.h @@ -59,7 +59,7 @@ class CudfFromVelox : public exec::Operator, public NvtxHelper { private: std::vector inputs_; - std::size_t current_output_size_ = 0; + std::size_t currentOutputSize_ = 0; bool finished_ = false; }; diff --git a/velox/experimental/cudf/exec/CudfOrderBy.cpp b/velox/experimental/cudf/exec/CudfOrderBy.cpp index cb6f34f7dda..c66b535f82a 100644 --- a/velox/experimental/cudf/exec/CudfOrderBy.cpp +++ b/velox/experimental/cudf/exec/CudfOrderBy.cpp @@ -38,22 +38,22 @@ CudfOrderBy::CudfOrderBy( "CudfOrderBy"), NvtxHelper(nvtx3::rgb{64, 224, 208}, operatorId), // Turquoise orderByNode_(orderByNode) { - sort_keys_.reserve(orderByNode->sortingKeys().size()); - column_order_.reserve(orderByNode->sortingKeys().size()); - null_order_.reserve(orderByNode->sortingKeys().size()); + sortKeys_.reserve(orderByNode->sortingKeys().size()); + columnOrder_.reserve(orderByNode->sortingKeys().size()); + nullOrder_.reserve(orderByNode->sortingKeys().size()); for (int i = 0; i < orderByNode->sortingKeys().size(); ++i) { - const auto channel = + const auto kChannel = exec::exprToChannel(orderByNode->sortingKeys()[i].get(), outputType_); VELOX_CHECK( - channel != kConstantChannel, + kChannel != kConstantChannel, "OrderBy doesn't allow constant sorting keys"); - sort_keys_.push_back(channel); - auto const& sorting_order = orderByNode->sortingOrders()[i]; - column_order_.push_back( - sorting_order.isAscending() ? cudf::order::ASCENDING - : cudf::order::DESCENDING); - null_order_.push_back( - (sorting_order.isNullsFirst() ^ !sorting_order.isAscending()) + sortKeys_.push_back(kChannel); + auto const& sortingOrder = orderByNode->sortingOrders()[i]; + columnOrder_.push_back( + sortingOrder.isAscending() ? cudf::order::ASCENDING + : cudf::order::DESCENDING); + nullOrder_.push_back( + (sortingOrder.isNullsFirst() ^ !sortingOrder.isAscending()) ? cudf::null_order::BEFORE : cudf::null_order::AFTER); } @@ -62,9 +62,9 @@ CudfOrderBy::CudfOrderBy( void CudfOrderBy::addInput(RowVectorPtr input) { // Accumulate inputs if (input->size() > 0) { - auto cudf_input = std::dynamic_pointer_cast(input); - VELOX_CHECK_NOT_NULL(cudf_input); - inputs_.push_back(std::move(cudf_input)); + auto cudfInput = std::dynamic_pointer_cast(input); + VELOX_CHECK_NOT_NULL(cudfInput); + inputs_.push_back(std::move(cudfInput)); } } @@ -86,13 +86,13 @@ void CudfOrderBy::noMoreInput() { VELOX_CHECK_NOT_NULL(tbl); - auto keys = tbl->view().select(sort_keys_); + auto keys = tbl->view().select(sortKeys_); auto values = tbl->view(); auto result = - cudf::sort_by_key(values, keys, column_order_, null_order_, stream); - auto const size = result->num_rows(); + cudf::sort_by_key(values, keys, columnOrder_, nullOrder_, stream); + auto const kSize = result->num_rows(); outputTable_ = std::make_shared( - pool(), outputType_, size, std::move(result), stream); + pool(), outputType_, kSize, std::move(result), stream); } RowVectorPtr CudfOrderBy::getOutput() { diff --git a/velox/experimental/cudf/exec/CudfOrderBy.h b/velox/experimental/cudf/exec/CudfOrderBy.h index 29b225e1e2f..75e56315746 100644 --- a/velox/experimental/cudf/exec/CudfOrderBy.h +++ b/velox/experimental/cudf/exec/CudfOrderBy.h @@ -57,9 +57,9 @@ class CudfOrderBy : public exec::Operator, public NvtxHelper { CudfVectorPtr outputTable_; std::shared_ptr orderByNode_; std::vector inputs_; - std::vector sort_keys_; - std::vector column_order_; - std::vector null_order_; + std::vector sortKeys_; + std::vector columnOrder_; + std::vector nullOrder_; bool finished_{false}; }; diff --git a/velox/experimental/cudf/exec/NvtxHelper.h b/velox/experimental/cudf/exec/NvtxHelper.h index 892b1976365..dde348e4744 100644 --- a/velox/experimental/cudf/exec/NvtxHelper.h +++ b/velox/experimental/cudf/exec/NvtxHelper.h @@ -35,11 +35,11 @@ class NvtxHelper { /** * @brief Tag type for Velox's NVTX domain. */ -struct velox_domain { +struct VeloxDomain { static constexpr char const* name{"velox"}; }; -using nvtx_registered_string_t = nvtx3::registered_string_in; +using NvtxRegisteredStringT = nvtx3::registered_string_in; #define VELOX_NVTX_OPERATOR_FUNC_RANGE() \ static_assert( \ @@ -47,21 +47,21 @@ using nvtx_registered_string_t = nvtx3::registered_string_in; value, \ "VELOX_NVTX_OPERATOR_FUNC_RANGE can only be used" \ " in Operators derived from NvtxHelper"); \ - static nvtx_registered_string_t const nvtx3_func_name__{ \ + static NvtxRegisteredStringT const nvtx3_func_name__{ \ std::string(__func__) + " " + std::string(__PRETTY_FUNCTION__)}; \ static ::nvtx3::event_attributes const nvtx3_func_attr__{ \ this->payload_.has_value() ? \ ::nvtx3::event_attributes{nvtx3_func_name__, this->color_, \ nvtx3::payload{this->payload_.value()}} : \ ::nvtx3::event_attributes{nvtx3_func_name__, this->color_}}; \ - ::nvtx3::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; + ::nvtx3::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; #define VELOX_NVTX_PRETTY_FUNC_RANGE() \ - static nvtx_registered_string_t const nvtx3_func_name__{ \ + static NvtxRegisteredStringT const nvtx3_func_name__{ \ std::string(__func__) + " " + std::string(__PRETTY_FUNCTION__)}; \ static ::nvtx3::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ - ::nvtx3::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; + ::nvtx3::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; -#define VELOX_NVTX_FUNC_RANGE() NVTX3_FUNC_RANGE_IN(velox_domain) +#define VELOX_NVTX_FUNC_RANGE() NVTX3_FUNC_RANGE_IN(VeloxDomain) } // namespace facebook::velox::cudf_velox diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp index 3be5cfe97fd..c64886d76db 100644 --- a/velox/experimental/cudf/exec/ToCudf.cpp +++ b/velox/experimental/cudf/exec/ToCudf.cpp @@ -38,14 +38,12 @@ namespace facebook::velox::cudf_velox { namespace { template -bool is_any_of(const Base* p) { +bool isAnyOf(const Base* p) { return ((dynamic_cast(p) != nullptr) || ...); } } // namespace -static bool _cudfIsRegistered = false; - bool CompileState::compile() { auto operators = driver_.operators(); auto& nodes = planNodes_; @@ -54,11 +52,11 @@ bool CompileState::compile() { // them during the transformation. driver_.initializeOperators(); - bool replacements_made = false; + bool replacementsMade = false; auto ctx = driver_.driverCtx(); // Get plan node by id lookup. - auto get_plan_node = [&](const core::PlanNodeId& id) { + auto getPlanNode = [&](const core::PlanNodeId& id) { auto it = std::find_if(nodes.cbegin(), nodes.cend(), [&id](const auto& node) { return node->id() == id; @@ -67,84 +65,84 @@ bool CompileState::compile() { return *it; }; - auto is_supported_gpu_operator = [](const exec::Operator* op) { - return is_any_of(op); + auto isSupportedGpuOperator = [](const exec::Operator* op) { + return isAnyOf(op); }; - std::vector is_supported_gpu_operators(operators.size()); + std::vector isSupportedGpuOperators(operators.size()); std::transform( operators.begin(), operators.end(), - is_supported_gpu_operators.begin(), - is_supported_gpu_operator); + isSupportedGpuOperators.begin(), + isSupportedGpuOperator); - auto accepts_gpu_input = [](const exec::Operator* op) { - return is_any_of(op); + auto acceptsGpuInput = [](const exec::Operator* op) { + return isAnyOf(op); }; - auto produces_gpu_output = [](const exec::Operator* op) { - return is_any_of(op); + auto producesGpuOutput = [](const exec::Operator* op) { + return isAnyOf(op); }; int32_t operatorsOffset = 0; for (int32_t operatorIndex = 0; operatorIndex < operators.size(); ++operatorIndex) { - std::vector> replace_op; + std::vector> replaceOp; exec::Operator* oper = operators[operatorIndex]; auto replacingOperatorIndex = operatorIndex + operatorsOffset; VELOX_CHECK(oper); - bool const previous_operator_is_not_gpu = - (operatorIndex > 0 and !is_supported_gpu_operators[operatorIndex - 1]); - bool const next_operator_is_not_gpu = + bool const kPreviousOperatorIsNotGpu = + (operatorIndex > 0 and !isSupportedGpuOperators[operatorIndex - 1]); + bool const kNextOperatorIsNotGpu = (operatorIndex < operators.size() - 1 and - !is_supported_gpu_operators[operatorIndex + 1]); + !isSupportedGpuOperators[operatorIndex + 1]); auto id = oper->operatorId(); - if (previous_operator_is_not_gpu and accepts_gpu_input(oper)) { - auto plan_node = get_plan_node(oper->planNodeId()); - replace_op.push_back(std::make_unique( - id, plan_node->outputType(), ctx, plan_node->id() + "-from-velox")); - replace_op.back()->initialize(); + if (kPreviousOperatorIsNotGpu and acceptsGpuInput(oper)) { + auto planNode = getPlanNode(oper->planNodeId()); + replaceOp.push_back(std::make_unique( + id, planNode->outputType(), ctx, planNode->id() + "-from-velox")); + replaceOp.back()->initialize(); } if (auto orderByOp = dynamic_cast(oper)) { auto id = orderByOp->operatorId(); - auto plan_node = std::dynamic_pointer_cast( - get_plan_node(orderByOp->planNodeId())); - VELOX_CHECK(plan_node != nullptr); - replace_op.push_back(std::make_unique(id, ctx, plan_node)); - replace_op.back()->initialize(); + auto planNode = std::dynamic_pointer_cast( + getPlanNode(orderByOp->planNodeId())); + VELOX_CHECK(planNode != nullptr); + replaceOp.push_back(std::make_unique(id, ctx, planNode)); + replaceOp.back()->initialize(); } - if (next_operator_is_not_gpu and produces_gpu_output(oper)) { - auto plan_node = get_plan_node(oper->planNodeId()); - replace_op.push_back(std::make_unique( - id, plan_node->outputType(), ctx, plan_node->id() + "-to-velox")); - replace_op.back()->initialize(); + if (kNextOperatorIsNotGpu and producesGpuOutput(oper)) { + auto planNode = getPlanNode(oper->planNodeId()); + replaceOp.push_back(std::make_unique( + id, planNode->outputType(), ctx, planNode->id() + "-to-velox")); + replaceOp.back()->initialize(); } - if (not replace_op.empty()) { - operatorsOffset += replace_op.size() - 1; + if (not replaceOp.empty()) { + operatorsOffset += replaceOp.size() - 1; [[maybe_unused]] auto replaced = driverFactory_.replaceOperators( driver_, replacingOperatorIndex, replacingOperatorIndex + 1, - std::move(replace_op)); - replacements_made = true; + std::move(replaceOp)); + replacementsMade = true; } } - return replacements_made; + return replacementsMade; } -struct cudfDriverAdapter { +struct CudfDriverAdapter { std::shared_ptr mr_; std::shared_ptr>> planNodes_; - cudfDriverAdapter(std::shared_ptr mr) + CudfDriverAdapter(std::shared_ptr mr) : mr_(mr) { planNodes_ = std::make_shared>>(); @@ -178,27 +176,29 @@ struct cudfDriverAdapter { } }; +static bool isCudfRegistered = false; + void registerCudf() { if (cudfIsRegistered()) { return; } - const char* env_cudf_disabled = std::getenv("VELOX_CUDF_DISABLED"); - if (env_cudf_disabled != nullptr && std::stoi(env_cudf_disabled)) { + const char* envCudfDisabled = std::getenv("VELOX_CUDF_DISABLED"); + if (envCudfDisabled != nullptr && std::stoi(envCudfDisabled)) { return; } CUDF_FUNC_RANGE(); - cudaFree(0); // Initialize CUDA context at startup + cudaFree(nullptr); // Initialize CUDA context at startup - const char* env_cudf_mr = std::getenv("VELOX_CUDF_MEMORY_RESOURCE"); - auto mr_mode = env_cudf_mr != nullptr ? env_cudf_mr : "async"; - auto mr = cudf_velox::create_memory_resource(mr_mode); + const char* envCudfMr = std::getenv("VELOX_CUDF_MEMORY_RESOURCE"); + auto mrMode = envCudfMr != nullptr ? envCudfMr : "async"; + auto mr = cudf_velox::createMemoryResource(mrMode); cudf::set_current_device_resource(mr.get()); - cudfDriverAdapter cda{mr}; + CudfDriverAdapter cda{mr}; exec::DriverAdapter cudfAdapter{"cuDF", cda, cda}; exec::DriverFactory::registerAdapter(cudfAdapter); - _cudfIsRegistered = true; + isCudfRegistered = true; } void unregisterCudf() { @@ -211,11 +211,11 @@ void unregisterCudf() { }), exec::DriverFactory::adapters.end()); - _cudfIsRegistered = false; + isCudfRegistered = false; } bool cudfIsRegistered() { - return _cudfIsRegistered; + return isCudfRegistered; } } // namespace facebook::velox::cudf_velox diff --git a/velox/experimental/cudf/exec/Utilities.cpp b/velox/experimental/cudf/exec/Utilities.cpp index dbf49c65b89..99a597f2e3f 100644 --- a/velox/experimental/cudf/exec/Utilities.cpp +++ b/velox/experimental/cudf/exec/Utilities.cpp @@ -39,48 +39,48 @@ namespace facebook::velox::cudf_velox { namespace { -[[nodiscard]] auto make_cuda_mr() { +[[nodiscard]] auto makeCudaMr() { return std::make_shared(); } -[[nodiscard]] auto make_pool_mr() { +[[nodiscard]] auto makePoolMr() { return rmm::mr::make_owning_wrapper( - make_cuda_mr(), rmm::percent_of_free_device_memory(50)); + makeCudaMr(), rmm::percent_of_free_device_memory(50)); } -[[nodiscard]] auto make_async_mr() { +[[nodiscard]] auto makeAsyncMr() { return std::make_shared(); } -[[nodiscard]] auto make_managed_mr() { +[[nodiscard]] auto makeManagedMr() { return std::make_shared(); } -[[nodiscard]] auto make_arena_mr() { +[[nodiscard]] auto makeArenaMr() { return rmm::mr::make_owning_wrapper( - make_cuda_mr()); + makeCudaMr()); } -[[nodiscard]] auto make_managed_pool_mr() { +[[nodiscard]] auto makeManagedPoolMr() { return rmm::mr::make_owning_wrapper( - make_managed_mr(), rmm::percent_of_free_device_memory(50)); + makeManagedMr(), rmm::percent_of_free_device_memory(50)); } } // namespace -std::shared_ptr create_memory_resource( +std::shared_ptr createMemoryResource( std::string_view mode) { if (mode == "cuda") - return make_cuda_mr(); + return makeCudaMr(); if (mode == "pool") - return make_pool_mr(); + return makePoolMr(); if (mode == "async") - return make_async_mr(); + return makeAsyncMr(); if (mode == "arena") - return make_arena_mr(); + return makeArenaMr(); if (mode == "managed") - return make_managed_mr(); + return makeManagedMr(); if (mode == "managed_pool") - return make_managed_pool_mr(); + return makeManagedPoolMr(); throw cudf::logic_error( "Unknown memory resource mode: " + std::string(mode) + "\nExpecting: cuda, pool, async, arena, managed, or managed_pool"); diff --git a/velox/experimental/cudf/exec/Utilities.h b/velox/experimental/cudf/exec/Utilities.h index 1e5912359b5..31f529bb34a 100644 --- a/velox/experimental/cudf/exec/Utilities.h +++ b/velox/experimental/cudf/exec/Utilities.h @@ -32,7 +32,7 @@ namespace facebook::velox::cudf_velox { * @brief Creates a memory resource based on the given mode. */ [[nodiscard]] std::shared_ptr -create_memory_resource(std::string_view mode); +createMemoryResource(std::string_view mode); /** * @brief Returns the global CUDA stream pool used by cudf. diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp index bc86939e996..8a2d6a3f46c 100644 --- a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp +++ b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp @@ -55,7 +55,7 @@ namespace facebook::velox::cudf_velox { namespace with_arrow { -std::unique_ptr to_cudf_table( +std::unique_ptr toCudfTable( const facebook::velox::RowVectorPtr& veloxTable, facebook::velox::memory::MemoryPool* pool, rmm::cuda_stream_view stream) { @@ -87,7 +87,7 @@ std::unique_ptr to_cudf_table( namespace { -void to_signed_int_format(char* format) { +void toSignedIntFormat(char* format) { VELOX_CHECK_NOT_NULL(format); switch (format[0]) { case 'C': @@ -112,18 +112,18 @@ void to_signed_int_format(char* format) { // Changes all unsigned indices to signed indices for dictionary columns from // cudf which uses unsigned indices, but velox uses signed indices. -void fix_dictionary_indices(ArrowSchema& arrowSchema) { +void fixDictionaryIndices(ArrowSchema& arrowSchema) { if (arrowSchema.dictionary != nullptr) { - to_signed_int_format(const_cast(arrowSchema.format)); - fix_dictionary_indices(*arrowSchema.dictionary); + toSignedIntFormat(const_cast(arrowSchema.format)); + fixDictionaryIndices(*arrowSchema.dictionary); } for (size_t i = 0; i < arrowSchema.n_children; ++i) { VELOX_CHECK_NOT_NULL(arrowSchema.children[i]); - fix_dictionary_indices(*arrowSchema.children[i]); + fixDictionaryIndices(*arrowSchema.children[i]); } } -RowVectorPtr to_velox_column( +RowVectorPtr toVeloxColumn( const cudf::table_view& table, memory::MemoryPool* pool, const std::vector& metadata, @@ -133,25 +133,25 @@ RowVectorPtr to_velox_column( auto arrowSchema = cudf::to_arrow_schema(table, metadata); // Hack to convert unsigned indices to signed indices for dictionary columns - fix_dictionary_indices(*arrowSchema); + fixDictionaryIndices(*arrowSchema); auto veloxTable = importFromArrowAsOwner(*arrowSchema, arrowArray, pool); // BaseVector to RowVector - auto casted_ptr = + auto castedPtr = std::dynamic_pointer_cast(veloxTable); - VELOX_CHECK_NOT_NULL(casted_ptr); - return casted_ptr; + VELOX_CHECK_NOT_NULL(castedPtr); + return castedPtr; } template std::vector -get_metadata(Iterator begin, Iterator end, const std::string& name_prefix) { +getMetadata(Iterator begin, Iterator end, const std::string& namePrefix) { std::vector metadata; int i = 0; for (auto c = begin; c < end; c++) { - metadata.push_back(cudf::column_metadata(name_prefix + std::to_string(i))); - metadata.back().children_meta = get_metadata( - c->child_begin(), c->child_end(), name_prefix + std::to_string(i)); + metadata.push_back(cudf::column_metadata(namePrefix + std::to_string(i))); + metadata.back().children_meta = getMetadata( + c->child_begin(), c->child_end(), namePrefix + std::to_string(i)); i++; } return metadata; @@ -159,16 +159,16 @@ get_metadata(Iterator begin, Iterator end, const std::string& name_prefix) { } // namespace -facebook::velox::RowVectorPtr to_velox_column( +facebook::velox::RowVectorPtr toVeloxColumn( const cudf::table_view& table, facebook::velox::memory::MemoryPool* pool, - std::string name_prefix, + std::string namePrefix, rmm::cuda_stream_view stream) { - auto metadata = get_metadata(table.begin(), table.end(), name_prefix); - return to_velox_column(table, pool, metadata, stream); + auto metadata = getMetadata(table.begin(), table.end(), namePrefix); + return toVeloxColumn(table, pool, metadata, stream); } -RowVectorPtr to_velox_column( +RowVectorPtr toVeloxColumn( const cudf::table_view& table, memory::MemoryPool* pool, const std::vector& columnNames, @@ -177,7 +177,7 @@ RowVectorPtr to_velox_column( for (auto name : columnNames) { metadata.emplace_back(cudf::column_metadata(name)); } - return to_velox_column(table, pool, metadata, stream); + return toVeloxColumn(table, pool, metadata, stream); } } // namespace with_arrow diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.h b/velox/experimental/cudf/exec/VeloxCudfInterop.h index fbb4eda355e..529045245f7 100644 --- a/velox/experimental/cudf/exec/VeloxCudfInterop.h +++ b/velox/experimental/cudf/exec/VeloxCudfInterop.h @@ -25,18 +25,18 @@ #include namespace facebook::velox::cudf_velox::with_arrow { -std::unique_ptr to_cudf_table( +std::unique_ptr toCudfTable( const facebook::velox::RowVectorPtr& veloxTable, facebook::velox::memory::MemoryPool* pool, rmm::cuda_stream_view stream); -facebook::velox::RowVectorPtr to_velox_column( +facebook::velox::RowVectorPtr toVeloxColumn( const cudf::table_view& table, facebook::velox::memory::MemoryPool* pool, - std::string name_prefix, + std::string namePrefix, rmm::cuda_stream_view stream); -facebook::velox::RowVectorPtr to_velox_column( +facebook::velox::RowVectorPtr toVeloxColumn( const cudf::table_view& table, facebook::velox::memory::MemoryPool* pool, const std::vector& columnNames, From 436d72138e22d7c6c90b418d8e707c552844ca0a Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Fri, 21 Mar 2025 15:47:26 +0000 Subject: [PATCH 20/37] Fix more style --- velox/experimental/cudf/exec/CudfConversion.cpp | 12 ++++++------ velox/experimental/cudf/exec/CudfOrderBy.cpp | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp index bfe8ff01b26..e0b557d4c2f 100644 --- a/velox/experimental/cudf/exec/CudfConversion.cpp +++ b/velox/experimental/cudf/exec/CudfConversion.cpp @@ -91,12 +91,12 @@ void CudfFromVelox::addInput(RowVectorPtr input) { RowVectorPtr CudfFromVelox::getOutput() { VELOX_NVTX_OPERATOR_FUNC_RANGE(); - const auto kTargetOutputSize = preferredGpuBatchSizeRows(); - const auto kExitEarly = finished_ or - (currentOutputSize_ < kTargetOutputSize and not noMoreInput_) or + const auto targetOutputSize = preferredGpuBatchSizeRows(); + const auto exitEarly = finished_ or + (currentOutputSize_ < targetOutputSize and not noMoreInput_) or inputs_.empty(); finished_ = noMoreInput_; - if (kExitEarly) { + if (exitEarly) { return nullptr; } @@ -121,9 +121,9 @@ RowVectorPtr CudfFromVelox::getOutput() { VELOX_CHECK_NOT_NULL(tbl); // Return a CudfVector that owns the cudf table - const auto kSize = tbl->num_rows(); + const auto size = tbl->num_rows(); return std::make_shared( - input->pool(), outputType_, kSize, std::move(tbl), stream); + input->pool(), outputType_, size, std::move(tbl), stream); } void CudfFromVelox::close() { diff --git a/velox/experimental/cudf/exec/CudfOrderBy.cpp b/velox/experimental/cudf/exec/CudfOrderBy.cpp index c66b535f82a..061c2037737 100644 --- a/velox/experimental/cudf/exec/CudfOrderBy.cpp +++ b/velox/experimental/cudf/exec/CudfOrderBy.cpp @@ -42,12 +42,12 @@ CudfOrderBy::CudfOrderBy( columnOrder_.reserve(orderByNode->sortingKeys().size()); nullOrder_.reserve(orderByNode->sortingKeys().size()); for (int i = 0; i < orderByNode->sortingKeys().size(); ++i) { - const auto kChannel = + const auto channel = exec::exprToChannel(orderByNode->sortingKeys()[i].get(), outputType_); VELOX_CHECK( - kChannel != kConstantChannel, + channel != kConstantChannel, "OrderBy doesn't allow constant sorting keys"); - sortKeys_.push_back(kChannel); + sortKeys_.push_back(channel); auto const& sortingOrder = orderByNode->sortingOrders()[i]; columnOrder_.push_back( sortingOrder.isAscending() ? cudf::order::ASCENDING From 5538888497d94d68acfecd73cae54c81a88a5a76 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Fri, 21 Mar 2025 16:27:35 +0000 Subject: [PATCH 21/37] Error out when cuda architecture is less than 70 --- CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index b93de09121f..b7edb668b3e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -390,6 +390,14 @@ if(VELOX_ENABLE_GPU) endif() find_package(CUDAToolkit REQUIRED) if(VELOX_ENABLE_CUDF) + foreach(arch ${CMAKE_CUDA_ARCHITECTURES}) + if(arch LESS 70) + message( + FATAL_ERROR + "CUDA architecture ${arch} is below 70. CUDF requires Volta (SM 70) or newer GPUs." + ) + endif() + endforeach() set(VELOX_ENABLE_ARROW ON) velox_set_source(cudf) velox_resolve_dependency(cudf) From 71624fae3741cb490f3f0ef6151cf3092829ce56 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Tue, 25 Mar 2025 08:27:44 +0000 Subject: [PATCH 22/37] Misc. review changes - rename std::shared_ptr to PlanNodePtr - rename kSize -> size - formalize kCudfAdapterName --- velox/experimental/cudf/exec/CudfOrderBy.cpp | 4 ++-- velox/experimental/cudf/exec/ToCudf.cpp | 10 ++++------ velox/experimental/cudf/exec/ToCudf.h | 6 ++++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/velox/experimental/cudf/exec/CudfOrderBy.cpp b/velox/experimental/cudf/exec/CudfOrderBy.cpp index 061c2037737..e1f2a011147 100644 --- a/velox/experimental/cudf/exec/CudfOrderBy.cpp +++ b/velox/experimental/cudf/exec/CudfOrderBy.cpp @@ -90,9 +90,9 @@ void CudfOrderBy::noMoreInput() { auto values = tbl->view(); auto result = cudf::sort_by_key(values, keys, columnOrder_, nullOrder_, stream); - auto const kSize = result->num_rows(); + auto const size = result->num_rows(); outputTable_ = std::make_shared( - pool(), outputType_, kSize, std::move(result), stream); + pool(), outputType_, size, std::move(result), stream); } RowVectorPtr CudfOrderBy::getOutput() { diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp index c64886d76db..bbf652b6876 100644 --- a/velox/experimental/cudf/exec/ToCudf.cpp +++ b/velox/experimental/cudf/exec/ToCudf.cpp @@ -139,13 +139,11 @@ bool CompileState::compile() { struct CudfDriverAdapter { std::shared_ptr mr_; - std::shared_ptr>> - planNodes_; + std::shared_ptr> planNodes_; CudfDriverAdapter(std::shared_ptr mr) : mr_(mr) { - planNodes_ = - std::make_shared>>(); + planNodes_ = std::make_shared>(); } // Call operator needed by DriverAdapter @@ -196,7 +194,7 @@ void registerCudf() { auto mr = cudf_velox::createMemoryResource(mrMode); cudf::set_current_device_resource(mr.get()); CudfDriverAdapter cda{mr}; - exec::DriverAdapter cudfAdapter{"cuDF", cda, cda}; + exec::DriverAdapter cudfAdapter{kCudfAdapterName, cda, cda}; exec::DriverFactory::registerAdapter(cudfAdapter); isCudfRegistered = true; } @@ -207,7 +205,7 @@ void unregisterCudf() { exec::DriverFactory::adapters.begin(), exec::DriverFactory::adapters.end(), [](const exec::DriverAdapter& adapter) { - return adapter.label == "cuDF"; + return adapter.label == kCudfAdapterName; }), exec::DriverFactory::adapters.end()); diff --git a/velox/experimental/cudf/exec/ToCudf.h b/velox/experimental/cudf/exec/ToCudf.h index 8da0eba26ae..f19dd0d107f 100644 --- a/velox/experimental/cudf/exec/ToCudf.h +++ b/velox/experimental/cudf/exec/ToCudf.h @@ -21,12 +21,14 @@ namespace facebook::velox::cudf_velox { +static const std::string kCudfAdapterName = "cuDF"; + class CompileState { public: CompileState( const exec::DriverFactory& driverFactory, exec::Driver& driver, - std::vector>& planNodes) + std::vector& planNodes) : driverFactory_(driverFactory), driver_(driver), planNodes_(planNodes) {} exec::Driver& driver() { @@ -39,7 +41,7 @@ class CompileState { const exec::DriverFactory& driverFactory_; exec::Driver& driver_; - const std::vector>& planNodes_; + const std::vector& planNodes_; }; /// Registers adapter to add cuDF operators to Drivers. From 6ccdb579b57298aaa5de1b093ff63fa51e70639b Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Tue, 25 Mar 2025 08:38:44 +0000 Subject: [PATCH 23/37] Misc review changes --- .../experimental/cudf/exec/CudfConversion.cpp | 20 +++++++++---------- velox/experimental/cudf/exec/ToCudf.cpp | 2 +- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp index e0b557d4c2f..3de24db7896 100644 --- a/velox/experimental/cudf/exec/CudfConversion.cpp +++ b/velox/experimental/cudf/exec/CudfConversion.cpp @@ -74,18 +74,16 @@ CudfFromVelox::CudfFromVelox( void CudfFromVelox::addInput(RowVectorPtr input) { VELOX_NVTX_OPERATOR_FUNC_RANGE(); - if (input != nullptr) { - if (input->size() > 0) { - // Materialize lazy vectors - for (auto& child : input->children()) { - child->loadedVector(); - } - input->loadedVector(); - - // Accumulate inputs - inputs_.push_back(input); - currentOutputSize_ += input->size(); + if (input->size() > 0) { + // Materialize lazy vectors + for (auto& child : input->children()) { + child->loadedVector(); } + input->loadedVector(); + + // Accumulate inputs + inputs_.push_back(input); + currentOutputSize_ += input->size(); } } diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp index bbf652b6876..1ed61296639 100644 --- a/velox/experimental/cudf/exec/ToCudf.cpp +++ b/velox/experimental/cudf/exec/ToCudf.cpp @@ -155,7 +155,7 @@ struct CudfDriverAdapter { } // Iterate recursively and store them in the planNodes_. - void storePlanNodes(const std::shared_ptr& planNode) { + void storePlanNodes(const core::PlanNodePtr& planNode) { const auto& sources = planNode->sources(); for (int32_t i = 0; i < sources.size(); ++i) { storePlanNodes(sources[i]); From ab399f22c8393e3e9a99a60c46fd94e8f2edbb88 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Tue, 25 Mar 2025 12:03:08 +0000 Subject: [PATCH 24/37] Prevent merging vectors whose total size exceeds vector_size_t max --- .../experimental/cudf/exec/CudfConversion.cpp | 46 ++++++++++++++----- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp index 3de24db7896..f86589611cb 100644 --- a/velox/experimental/cudf/exec/CudfConversion.cpp +++ b/velox/experimental/cudf/exec/CudfConversion.cpp @@ -36,7 +36,7 @@ RowVectorPtr mergeRowVectors( const std::vector& results, velox::memory::MemoryPool* pool) { VELOX_NVTX_FUNC_RANGE(); - auto totalCount = 0; + vector_size_t totalCount = 0; for (const auto& result : results) { totalCount += result->size(); } @@ -54,8 +54,15 @@ cudf::size_type preferredGpuBatchSizeRows() { constexpr cudf::size_type kDefaultGpuBatchSizeRows = 100000; const char* envCudfGpuBatchSizeRows = std::getenv("VELOX_CUDF_GPU_BATCH_SIZE_ROWS"); - return envCudfGpuBatchSizeRows != nullptr ? std::stoi(envCudfGpuBatchSizeRows) - : kDefaultGpuBatchSizeRows; + const auto batchSize = envCudfGpuBatchSizeRows != nullptr + ? std::stoll(envCudfGpuBatchSizeRows) + : kDefaultGpuBatchSizeRows; + VELOX_CHECK_GT(batchSize, 0, "VELOX_CUDF_GPU_BATCH_SIZE_ROWS must be > 0"); + VELOX_CHECK_LE( + batchSize, + std::numeric_limits::max(), + "VELOX_CUDF_GPU_BATCH_SIZE_ROWS must be <= max(vector_size_t)"); + return batchSize; } } // namespace @@ -90,18 +97,35 @@ void CudfFromVelox::addInput(RowVectorPtr input) { RowVectorPtr CudfFromVelox::getOutput() { VELOX_NVTX_OPERATOR_FUNC_RANGE(); const auto targetOutputSize = preferredGpuBatchSizeRows(); - const auto exitEarly = finished_ or + + finished_ = noMoreInput_ && inputs_.empty(); + + if (finished_ or (currentOutputSize_ < targetOutputSize and not noMoreInput_) or - inputs_.empty(); - finished_ = noMoreInput_; - if (exitEarly) { + inputs_.empty()) { return nullptr; } - // Combine all input RowVectors into a single RowVector and clear inputs - auto input = mergeRowVectors(inputs_, inputs_[0]->pool()); - inputs_.clear(); - currentOutputSize_ = 0; + // Select inputs that don't exceed the max vector size limit + std::vector selectedInputs; + vector_size_t totalSize = 0; + auto const maxVectorSize = std::numeric_limits::max(); + + for (const auto& input : inputs_) { + if (totalSize + input->size() <= maxVectorSize) { + selectedInputs.push_back(input); + totalSize += input->size(); + } else { + break; + } + } + + // Combine selected RowVectors into a single RowVector + auto input = mergeRowVectors(selectedInputs, inputs_[0]->pool()); + + // Remove processed inputs + inputs_.erase(inputs_.begin(), inputs_.begin() + selectedInputs.size()); + currentOutputSize_ -= totalSize; // Early return if no input if (input->size() == 0) { From 327f7180239487e719b43a8dd2e07129abcae3c1 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Tue, 25 Mar 2025 13:08:26 +0000 Subject: [PATCH 25/37] Misc review changes - Removing missed kConstant changes when not applicable - auto* instead of auto --- velox/experimental/cudf/exec/ToCudf.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp index 1ed61296639..60224a71750 100644 --- a/velox/experimental/cudf/exec/ToCudf.cpp +++ b/velox/experimental/cudf/exec/ToCudf.cpp @@ -93,21 +93,21 @@ bool CompileState::compile() { auto replacingOperatorIndex = operatorIndex + operatorsOffset; VELOX_CHECK(oper); - bool const kPreviousOperatorIsNotGpu = + const bool previousOperatorIsNotGpu = (operatorIndex > 0 and !isSupportedGpuOperators[operatorIndex - 1]); - bool const kNextOperatorIsNotGpu = + const bool nextOperatorIsNotGpu = (operatorIndex < operators.size() - 1 and !isSupportedGpuOperators[operatorIndex + 1]); auto id = oper->operatorId(); - if (kPreviousOperatorIsNotGpu and acceptsGpuInput(oper)) { + if (previousOperatorIsNotGpu and acceptsGpuInput(oper)) { auto planNode = getPlanNode(oper->planNodeId()); replaceOp.push_back(std::make_unique( id, planNode->outputType(), ctx, planNode->id() + "-from-velox")); replaceOp.back()->initialize(); } - if (auto orderByOp = dynamic_cast(oper)) { + if (auto* orderByOp = dynamic_cast(oper)) { auto id = orderByOp->operatorId(); auto planNode = std::dynamic_pointer_cast( getPlanNode(orderByOp->planNodeId())); @@ -116,7 +116,7 @@ bool CompileState::compile() { replaceOp.back()->initialize(); } - if (kNextOperatorIsNotGpu and producesGpuOutput(oper)) { + if (nextOperatorIsNotGpu and producesGpuOutput(oper)) { auto planNode = getPlanNode(oper->planNodeId()); replaceOp.push_back(std::make_unique( id, planNode->outputType(), ctx, planNode->id() + "-to-velox")); From 5754b3c6db5372553330476e629d07abd7e04a71 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 26 Mar 2025 01:43:44 -0500 Subject: [PATCH 26/37] replace env variable with gflags and CudfOptions --- velox/experimental/cudf/exec/ToCudf.cpp | 12 ++++++------ velox/experimental/cudf/exec/ToCudf.h | 15 ++++++++++++++- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp index 60224a71750..85cefa4afaf 100644 --- a/velox/experimental/cudf/exec/ToCudf.cpp +++ b/velox/experimental/cudf/exec/ToCudf.cpp @@ -33,6 +33,9 @@ #include +DEFINE_bool(velox_cudf_enabled, true, "Enable cuDF-Velox acceleration"); +DEFINE_string(velox_cudf_memory_resource, "async", "Memory resource for cuDF"); + namespace facebook::velox::cudf_velox { namespace { @@ -176,21 +179,18 @@ struct CudfDriverAdapter { static bool isCudfRegistered = false; -void registerCudf() { +void registerCudf(const CudfOptions& options) { if (cudfIsRegistered()) { return; } - - const char* envCudfDisabled = std::getenv("VELOX_CUDF_DISABLED"); - if (envCudfDisabled != nullptr && std::stoi(envCudfDisabled)) { + if (!options.cudfEnabled) { return; } CUDF_FUNC_RANGE(); cudaFree(nullptr); // Initialize CUDA context at startup - const char* envCudfMr = std::getenv("VELOX_CUDF_MEMORY_RESOURCE"); - auto mrMode = envCudfMr != nullptr ? envCudfMr : "async"; + const std::string mrMode = options.cudfMemoryResource; auto mr = cudf_velox::createMemoryResource(mrMode); cudf::set_current_device_resource(mr.get()); CudfDriverAdapter cda{mr}; diff --git a/velox/experimental/cudf/exec/ToCudf.h b/velox/experimental/cudf/exec/ToCudf.h index f19dd0d107f..0f45354d8fa 100644 --- a/velox/experimental/cudf/exec/ToCudf.h +++ b/velox/experimental/cudf/exec/ToCudf.h @@ -19,6 +19,11 @@ #include "velox/exec/Driver.h" #include "velox/exec/Operator.h" +#include + +DECLARE_bool(velox_cudf_enabled); +DECLARE_string(velox_cudf_memory_resource); + namespace facebook::velox::cudf_velox { static const std::string kCudfAdapterName = "cuDF"; @@ -44,8 +49,16 @@ class CompileState { const std::vector& planNodes_; }; +struct CudfOptions { + bool cudfEnabled = FLAGS_velox_cudf_enabled; + std::string cudfMemoryResource = FLAGS_velox_cudf_memory_resource; + static CudfOptions defaultOptions() { + return CudfOptions(); + } +}; + /// Registers adapter to add cuDF operators to Drivers. -void registerCudf(); +void registerCudf(const CudfOptions& options = CudfOptions::defaultOptions()); void unregisterCudf(); /// Returns true if cuDF is registered. From 538398e6f522a3cee0384347fe309737ca16a491 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 26 Mar 2025 01:59:45 -0500 Subject: [PATCH 27/37] replace gpu batch size env variable with a QueryConfig entry --- velox/experimental/cudf/exec/CudfConversion.cpp | 13 ++++++------- velox/experimental/cudf/exec/CudfConversion.h | 2 ++ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp index f86589611cb..b84dc98f183 100644 --- a/velox/experimental/cudf/exec/CudfConversion.cpp +++ b/velox/experimental/cudf/exec/CudfConversion.cpp @@ -50,13 +50,11 @@ RowVectorPtr mergeRowVectors( return copy; } -cudf::size_type preferredGpuBatchSizeRows() { +cudf::size_type preferredGpuBatchSizeRows( + const facebook::velox::core::QueryConfig& queryConfig) { constexpr cudf::size_type kDefaultGpuBatchSizeRows = 100000; - const char* envCudfGpuBatchSizeRows = - std::getenv("VELOX_CUDF_GPU_BATCH_SIZE_ROWS"); - const auto batchSize = envCudfGpuBatchSizeRows != nullptr - ? std::stoll(envCudfGpuBatchSizeRows) - : kDefaultGpuBatchSizeRows; + const auto batchSize = queryConfig.get( + CudfFromVelox::kGpuBatchSizeRows, kDefaultGpuBatchSizeRows); VELOX_CHECK_GT(batchSize, 0, "VELOX_CUDF_GPU_BATCH_SIZE_ROWS must be > 0"); VELOX_CHECK_LE( batchSize, @@ -96,7 +94,8 @@ void CudfFromVelox::addInput(RowVectorPtr input) { RowVectorPtr CudfFromVelox::getOutput() { VELOX_NVTX_OPERATOR_FUNC_RANGE(); - const auto targetOutputSize = preferredGpuBatchSizeRows(); + const auto targetOutputSize = + preferredGpuBatchSizeRows(operatorCtx_->driverCtx()->queryConfig()); finished_ = noMoreInput_ && inputs_.empty(); diff --git a/velox/experimental/cudf/exec/CudfConversion.h b/velox/experimental/cudf/exec/CudfConversion.h index 16259aa2f8f..8d649d77328 100644 --- a/velox/experimental/cudf/exec/CudfConversion.h +++ b/velox/experimental/cudf/exec/CudfConversion.h @@ -33,6 +33,8 @@ namespace facebook::velox::cudf_velox { class CudfFromVelox : public exec::Operator, public NvtxHelper { public: + static constexpr const char* kGpuBatchSizeRows = "velox.cudf.gpu_batch_size_rows"; + CudfFromVelox( int32_t operatorId, RowTypePtr outputType, From 7570d56d06d70f7211de382161b7ae6d7f19be14 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Wed, 26 Mar 2025 08:05:35 +0000 Subject: [PATCH 28/37] Add back optional debug printing of plans --- velox/experimental/cudf/exec/ToCudf.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp index 85cefa4afaf..8edc9dd6301 100644 --- a/velox/experimental/cudf/exec/ToCudf.cpp +++ b/velox/experimental/cudf/exec/ToCudf.cpp @@ -35,6 +35,7 @@ DEFINE_bool(velox_cudf_enabled, true, "Enable cuDF-Velox acceleration"); DEFINE_string(velox_cudf_memory_resource, "async", "Memory resource for cuDF"); +DEFINE_bool(velox_cudf_debug, false, "Enable debug printing"); namespace facebook::velox::cudf_velox { @@ -51,6 +52,15 @@ bool CompileState::compile() { auto operators = driver_.operators(); auto& nodes = planNodes_; + if (FLAGS_velox_cudf_debug) { + std::cout << "Operators before adapting for cuDF:" << std::endl; + std::cout << "Number of operators: " << operators.size() << std::endl; + for (auto& op : operators) { + std::cout << " Operator: ID " << op->operatorId() << ": " + << op->toString() << std::endl; + } + } + // Make sure operator states are initialized. We will need to inspect some of // them during the transformation. driver_.initializeOperators(); @@ -137,6 +147,16 @@ bool CompileState::compile() { } } + if (FLAGS_velox_cudf_debug) { + std::cout << "Operators after adapting for cuDF:" << std::endl; + operators = driver_.operators(); + std::cout << "Number of new operators: " << operators.size() << std::endl; + for (auto& op : operators) { + std::cout << " Operator: ID " << op->operatorId() << ": " + << op->toString() << std::endl; + } + } + return replacementsMade; } From e6333715e8a61ca31749715a91ac3288096823d0 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Wed, 26 Mar 2025 19:20:32 +0000 Subject: [PATCH 29/37] Misc review changes --- velox/experimental/cudf/exec/CudfConversion.h | 3 ++- velox/experimental/cudf/exec/ToCudf.cpp | 8 ++++---- velox/experimental/cudf/exec/Utilities.cpp | 2 +- velox/experimental/cudf/exec/VeloxCudfInterop.cpp | 5 ++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/velox/experimental/cudf/exec/CudfConversion.h b/velox/experimental/cudf/exec/CudfConversion.h index 8d649d77328..16ca33d786c 100644 --- a/velox/experimental/cudf/exec/CudfConversion.h +++ b/velox/experimental/cudf/exec/CudfConversion.h @@ -33,7 +33,8 @@ namespace facebook::velox::cudf_velox { class CudfFromVelox : public exec::Operator, public NvtxHelper { public: - static constexpr const char* kGpuBatchSizeRows = "velox.cudf.gpu_batch_size_rows"; + static constexpr const char* kGpuBatchSizeRows = + "velox.cudf.gpu_batch_size_rows"; CudfFromVelox( int32_t operatorId, diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp index 8edc9dd6301..3768ae53be7 100644 --- a/velox/experimental/cudf/exec/ToCudf.cpp +++ b/velox/experimental/cudf/exec/ToCudf.cpp @@ -53,8 +53,8 @@ bool CompileState::compile() { auto& nodes = planNodes_; if (FLAGS_velox_cudf_debug) { - std::cout << "Operators before adapting for cuDF:" << std::endl; - std::cout << "Number of operators: " << operators.size() << std::endl; + std::cout << "Operators before adapting for cuDF: count [" + << operators.size() << "]" << std::endl; for (auto& op : operators) { std::cout << " Operator: ID " << op->operatorId() << ": " << op->toString() << std::endl; @@ -148,9 +148,9 @@ bool CompileState::compile() { } if (FLAGS_velox_cudf_debug) { - std::cout << "Operators after adapting for cuDF:" << std::endl; operators = driver_.operators(); - std::cout << "Number of new operators: " << operators.size() << std::endl; + std::cout << "Operators after adapting for cuDF: count [" + << operators.size() << "]" << std::endl; for (auto& op : operators) { std::cout << " Operator: ID " << op->operatorId() << ": " << op->toString() << std::endl; diff --git a/velox/experimental/cudf/exec/Utilities.cpp b/velox/experimental/cudf/exec/Utilities.cpp index 99a597f2e3f..f861724ee63 100644 --- a/velox/experimental/cudf/exec/Utilities.cpp +++ b/velox/experimental/cudf/exec/Utilities.cpp @@ -81,7 +81,7 @@ std::shared_ptr createMemoryResource( return makeManagedMr(); if (mode == "managed_pool") return makeManagedPoolMr(); - throw cudf::logic_error( + VELOX_FAIL( "Unknown memory resource mode: " + std::string(mode) + "\nExpecting: cuda, pool, async, arena, managed, or managed_pool"); } diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp index 8a2d6a3f46c..fe1dd8b25c1 100644 --- a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp +++ b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp @@ -105,9 +105,8 @@ void toSignedIntFormat(char* format) { default: return; } - printf( - "Warning: arrowSchema.format: %s, unsigned is treated as signed indices\n", - format); + LOG(WARNING) << "arrowSchema.format: " << format + << ", unsigned is treated as signed indices"; } // Changes all unsigned indices to signed indices for dictionary columns from From 86738ed23d0b6a3f64ace685e49d9c7358fedb32 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Mon, 31 Mar 2025 18:24:52 +0000 Subject: [PATCH 30/37] Add clang tidy --- velox/experimental/cudf/.clang-tidy | 54 +++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 velox/experimental/cudf/.clang-tidy diff --git a/velox/experimental/cudf/.clang-tidy b/velox/experimental/cudf/.clang-tidy new file mode 100644 index 00000000000..ec5b4e40784 --- /dev/null +++ b/velox/experimental/cudf/.clang-tidy @@ -0,0 +1,54 @@ +--- +Checks: > + readability-identifier-naming, + modernize-use-nullptr, + modernize-use-using + +HeaderFilterRegex: '.*' + +WarningsAsErrors: '' + +CheckOptions: + # Naming conventions as explicitly stated in CODING_STYLE.md + - key: readability-identifier-naming.ClassCase + value: CamelCase + - key: readability-identifier-naming.StructCase + value: CamelCase + - key: readability-identifier-naming.EnumCase + value: CamelCase + - key: readability-identifier-naming.TypeAliasCase + value: CamelCase + - key: readability-identifier-naming.TypeTemplateParameterCase + value: CamelCase + - key: readability-identifier-naming.FunctionCase + value: camelBack + - key: readability-identifier-naming.VariableCase + value: camelBack + - key: readability-identifier-naming.ParameterCase + value: camelBack + - key: readability-identifier-naming.PrivateMemberCase + value: camelBack + - key: readability-identifier-naming.PrivateMemberSuffix + value: _ + - key: readability-identifier-naming.ProtectedMemberCase + value: camelBack + - key: readability-identifier-naming.ProtectedMemberSuffix + value: _ + - key: readability-identifier-naming.MacroDefinitionCase + value: UPPER_CASE + - key: readability-identifier-naming.NamespaceCase + value: lower_case + - key: readability-identifier-naming.StaticConstantPrefix + value: k + - key: readability-identifier-naming.EnumConstantCase + value: CamelCase + - key: readability-identifier-naming.EnumConstantPrefix + value: k + + # Use nullptr instead of NULL or 0 + - key: modernize-use-nullptr.NullMacros + value: 'NULL' + + # Prefer enum class over enum + - key: modernize-use-using.IgnoreUsingStdAllocator + value: 1 \ No newline at end of file From 7b18528e7eb71b9df087d23a7e125524d916ce5f Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Mon, 31 Mar 2025 18:46:47 +0000 Subject: [PATCH 31/37] remove aacidental flags added to all of velox --- CMake/resolve_dependency_modules/cudf.cmake | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/CMake/resolve_dependency_modules/cudf.cmake b/CMake/resolve_dependency_modules/cudf.cmake index 4c9d015dbf1..e300a87adf3 100644 --- a/CMake/resolve_dependency_modules/cudf.cmake +++ b/CMake/resolve_dependency_modules/cudf.cmake @@ -53,13 +53,6 @@ set(BUILD_TESTS OFF) set(CUDF_BUILD_TESTUTIL OFF) set(BUILD_SHARED_LIBS ON) -# cudf sets all warnings as errors, and therefore fails to compile with velox -# expanded set of warnings. We selectively disable problematic warnings just for -# cudf -string( - APPEND CMAKE_CXX_FLAGS - " -Wno-non-virtual-dtor -Wno-missing-field-initializers -Wno-deprecated-copy") - FetchContent_Declare( rapids-cmake URL ${VELOX_rapids_cmake_SOURCE_URL} @@ -87,5 +80,13 @@ FetchContent_Declare( UPDATE_DISCONNECTED 1) FetchContent_MakeAvailable(cudf) + +# cudf sets all warnings as errors, and therefore fails to compile with velox +# expanded set of warnings. We selectively disable problematic warnings just for +# cudf +target_compile_options( + cudf PRIVATE -Wno-non-virtual-dtor -Wno-missing-field-initializers + -Wno-deprecated-copy) + unset(BUILD_SHARED_LIBS) endblock() From 7f723dbedf4016dd53d033b9b264e68a4eb19a64 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Mon, 7 Apr 2025 11:21:22 +0000 Subject: [PATCH 32/37] cmake min required --- CMake/resolve_dependency_modules/cudf.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMake/resolve_dependency_modules/cudf.cmake b/CMake/resolve_dependency_modules/cudf.cmake index e300a87adf3..56f7b9bf1d1 100644 --- a/CMake/resolve_dependency_modules/cudf.cmake +++ b/CMake/resolve_dependency_modules/cudf.cmake @@ -14,6 +14,9 @@ include_guard(GLOBAL) +# 3.30.4 is the minimum version required by cudf +cmake_minimum_required(VERSION 3.30.4) + set(VELOX_rapids_cmake_VERSION 25.04) set(VELOX_rapids_cmake_BUILD_SHA256_CHECKSUM 458c14eaff9000067b32d65c8c914f4521090ede7690e16eb57035ce731386db) From bee2494bd8bbf4819f8282d0cf0888aec7d0a4dd Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Mon, 7 Apr 2025 14:45:06 +0000 Subject: [PATCH 33/37] Make sure last operator from task produces velox RowVector --- velox/experimental/cudf/exec/ToCudf.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp index 3768ae53be7..98f91ab7cd3 100644 --- a/velox/experimental/cudf/exec/ToCudf.cpp +++ b/velox/experimental/cudf/exec/ToCudf.cpp @@ -111,6 +111,7 @@ bool CompileState::compile() { const bool nextOperatorIsNotGpu = (operatorIndex < operators.size() - 1 and !isSupportedGpuOperators[operatorIndex + 1]); + const bool isLastOperatorOfTask = oper->planNodeId() == nodes.back()->id(); auto id = oper->operatorId(); if (previousOperatorIsNotGpu and acceptsGpuInput(oper)) { @@ -129,7 +130,8 @@ bool CompileState::compile() { replaceOp.back()->initialize(); } - if (nextOperatorIsNotGpu and producesGpuOutput(oper)) { + if (producesGpuOutput(oper) and + (isLastOperatorOfTask or nextOperatorIsNotGpu)) { auto planNode = getPlanNode(oper->planNodeId()); replaceOp.push_back(std::make_unique( id, planNode->outputType(), ctx, planNode->id() + "-to-velox")); From 55752056c9b9f1313c9328ef7c134ec2a00382ad Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Tue, 8 Apr 2025 12:56:29 +0000 Subject: [PATCH 34/37] Cudf driver adapter without storing plan nodes --- velox/experimental/cudf/exec/ToCudf.cpp | 42 ++++++------------------- velox/experimental/cudf/exec/ToCudf.h | 9 ++---- 2 files changed, 13 insertions(+), 38 deletions(-) diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp index 98f91ab7cd3..46219012eb1 100644 --- a/velox/experimental/cudf/exec/ToCudf.cpp +++ b/velox/experimental/cudf/exec/ToCudf.cpp @@ -50,7 +50,6 @@ bool isAnyOf(const Base* p) { bool CompileState::compile() { auto operators = driver_.operators(); - auto& nodes = planNodes_; if (FLAGS_velox_cudf_debug) { std::cout << "Operators before adapting for cuDF: count [" @@ -70,12 +69,16 @@ bool CompileState::compile() { // Get plan node by id lookup. auto getPlanNode = [&](const core::PlanNodeId& id) { + auto& nodes = driverFactory_.planNodes; auto it = std::find_if(nodes.cbegin(), nodes.cend(), [&id](const auto& node) { return node->id() == id; }); - VELOX_CHECK(it != nodes.end()); - return *it; + if (it != nodes.end()) { + return *it; + } + VELOX_CHECK(driverFactory_.consumerNode->id() == id); + return driverFactory_.consumerNode; }; auto isSupportedGpuOperator = [](const exec::Operator* op) { @@ -111,7 +114,6 @@ bool CompileState::compile() { const bool nextOperatorIsNotGpu = (operatorIndex < operators.size() - 1 and !isSupportedGpuOperators[operatorIndex + 1]); - const bool isLastOperatorOfTask = oper->planNodeId() == nodes.back()->id(); auto id = oper->operatorId(); if (previousOperatorIsNotGpu and acceptsGpuInput(oper)) { @@ -130,8 +132,7 @@ bool CompileState::compile() { replaceOp.back()->initialize(); } - if (producesGpuOutput(oper) and - (isLastOperatorOfTask or nextOperatorIsNotGpu)) { + if (producesGpuOutput(oper) and nextOperatorIsNotGpu) { auto planNode = getPlanNode(oper->planNodeId()); replaceOp.push_back(std::make_unique( id, planNode->outputType(), ctx, planNode->id() + "-to-velox")); @@ -164,39 +165,16 @@ bool CompileState::compile() { struct CudfDriverAdapter { std::shared_ptr mr_; - std::shared_ptr> planNodes_; CudfDriverAdapter(std::shared_ptr mr) - : mr_(mr) { - planNodes_ = std::make_shared>(); - } + : mr_(mr) {} // Call operator needed by DriverAdapter bool operator()(const exec::DriverFactory& factory, exec::Driver& driver) { - auto state = CompileState(factory, driver, *planNodes_); - // Stored planNodes_ from inspect. + auto state = CompileState(factory, driver); auto res = state.compile(); return res; } - - // Iterate recursively and store them in the planNodes_. - void storePlanNodes(const core::PlanNodePtr& planNode) { - const auto& sources = planNode->sources(); - for (int32_t i = 0; i < sources.size(); ++i) { - storePlanNodes(sources[i]); - } - planNodes_->push_back(planNode); - } - - // Call operator needed by plan inspection - void operator()(const core::PlanFragment& planFragment) { - // signature: std::function inspect; - // call: adapter.inspect(planFragment); - planNodes_->clear(); - if (planNodes_) { - storePlanNodes(planFragment.planNode); - } - } }; static bool isCudfRegistered = false; @@ -216,7 +194,7 @@ void registerCudf(const CudfOptions& options) { auto mr = cudf_velox::createMemoryResource(mrMode); cudf::set_current_device_resource(mr.get()); CudfDriverAdapter cda{mr}; - exec::DriverAdapter cudfAdapter{kCudfAdapterName, cda, cda}; + exec::DriverAdapter cudfAdapter{kCudfAdapterName, {}, cda}; exec::DriverFactory::registerAdapter(cudfAdapter); isCudfRegistered = true; } diff --git a/velox/experimental/cudf/exec/ToCudf.h b/velox/experimental/cudf/exec/ToCudf.h index 0f45354d8fa..63fcf0d5dd7 100644 --- a/velox/experimental/cudf/exec/ToCudf.h +++ b/velox/experimental/cudf/exec/ToCudf.h @@ -23,6 +23,7 @@ DECLARE_bool(velox_cudf_enabled); DECLARE_string(velox_cudf_memory_resource); +DECLARE_bool(velox_cudf_debug); namespace facebook::velox::cudf_velox { @@ -30,11 +31,8 @@ static const std::string kCudfAdapterName = "cuDF"; class CompileState { public: - CompileState( - const exec::DriverFactory& driverFactory, - exec::Driver& driver, - std::vector& planNodes) - : driverFactory_(driverFactory), driver_(driver), planNodes_(planNodes) {} + CompileState(const exec::DriverFactory& driverFactory, exec::Driver& driver) + : driverFactory_(driverFactory), driver_(driver) {} exec::Driver& driver() { return driver_; @@ -46,7 +44,6 @@ class CompileState { const exec::DriverFactory& driverFactory_; exec::Driver& driver_; - const std::vector& planNodes_; }; struct CudfOptions { From 01f1aaf76df6d384bc025d13fd7095b342843f5a Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Tue, 8 Apr 2025 20:01:48 +0000 Subject: [PATCH 35/37] re-fix conversion to RowVector in sink --- velox/experimental/cudf/exec/ToCudf.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/velox/experimental/cudf/exec/ToCudf.cpp b/velox/experimental/cudf/exec/ToCudf.cpp index 46219012eb1..55fb8a27849 100644 --- a/velox/experimental/cudf/exec/ToCudf.cpp +++ b/velox/experimental/cudf/exec/ToCudf.cpp @@ -114,6 +114,8 @@ bool CompileState::compile() { const bool nextOperatorIsNotGpu = (operatorIndex < operators.size() - 1 and !isSupportedGpuOperators[operatorIndex + 1]); + const bool isLastOperatorOfTask = + driverFactory_.outputDriver and operatorIndex == operators.size() - 1; auto id = oper->operatorId(); if (previousOperatorIsNotGpu and acceptsGpuInput(oper)) { @@ -132,7 +134,8 @@ bool CompileState::compile() { replaceOp.back()->initialize(); } - if (producesGpuOutput(oper) and nextOperatorIsNotGpu) { + if (producesGpuOutput(oper) and + (nextOperatorIsNotGpu or isLastOperatorOfTask)) { auto planNode = getPlanNode(oper->planNodeId()); replaceOp.push_back(std::make_unique( id, planNode->outputType(), ctx, planNode->id() + "-to-velox")); From 652825fdc557367bbffb737ee4b4b68c899ab368 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 9 Apr 2025 18:00:21 -0500 Subject: [PATCH 36/37] remove fixDictionaryIndices removing fixDictionaryIndices after cudf changed dictionary indices to signed --- .../cudf/exec/VeloxCudfInterop.cpp | 38 ------------------- 1 file changed, 38 deletions(-) diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp index fe1dd8b25c1..7a2ad969397 100644 --- a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp +++ b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp @@ -87,41 +87,6 @@ std::unique_ptr toCudfTable( namespace { -void toSignedIntFormat(char* format) { - VELOX_CHECK_NOT_NULL(format); - switch (format[0]) { - case 'C': - format[0] = 'c'; - break; - case 'S': - format[0] = 's'; - break; - case 'I': - format[0] = 'i'; - break; - case 'L': - format[0] = 'l'; - break; - default: - return; - } - LOG(WARNING) << "arrowSchema.format: " << format - << ", unsigned is treated as signed indices"; -} - -// Changes all unsigned indices to signed indices for dictionary columns from -// cudf which uses unsigned indices, but velox uses signed indices. -void fixDictionaryIndices(ArrowSchema& arrowSchema) { - if (arrowSchema.dictionary != nullptr) { - toSignedIntFormat(const_cast(arrowSchema.format)); - fixDictionaryIndices(*arrowSchema.dictionary); - } - for (size_t i = 0; i < arrowSchema.n_children; ++i) { - VELOX_CHECK_NOT_NULL(arrowSchema.children[i]); - fixDictionaryIndices(*arrowSchema.children[i]); - } -} - RowVectorPtr toVeloxColumn( const cudf::table_view& table, memory::MemoryPool* pool, @@ -131,9 +96,6 @@ RowVectorPtr toVeloxColumn( auto& arrowArray = arrowDeviceArray->array; auto arrowSchema = cudf::to_arrow_schema(table, metadata); - // Hack to convert unsigned indices to signed indices for dictionary columns - fixDictionaryIndices(*arrowSchema); - auto veloxTable = importFromArrowAsOwner(*arrowSchema, arrowArray, pool); // BaseVector to RowVector auto castedPtr = From 13a4a8769ca25ca8d148be606770fbd05ac5ac46 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Fri, 11 Apr 2025 07:50:26 +0000 Subject: [PATCH 37/37] Remove codeowners for now --- .github/CODEOWNERS | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 649e3694e27..c24be384421 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -32,9 +32,6 @@ scripts/ @assignUser @majetideepak # Breeze velox/experimental/breeze @dreveman -# cuDF -velox/experimental/cudf @bdice @karthikeyann @devavret - # Parquet velox/dwio/parquet/ @majetideepak