diff --git a/.asf.yaml b/.asf.yaml index ba325c2abf2..f3a8ed9fee9 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -23,7 +23,6 @@ github: - benibus - jbonofre - js8544 - - laurentgo - vibhatha - ZhangHuiGui diff --git a/.github/workflows/pr_review_trigger.yml b/.github/workflows/pr_review_trigger.yml index 2c840e95c8d..a6dd5f12753 100644 --- a/.github/workflows/pr_review_trigger.yml +++ b/.github/workflows/pr_review_trigger.yml @@ -29,7 +29,7 @@ jobs: runs-on: ubuntu-latest steps: - name: "Upload PR review Payload" - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: path: "${{ github.event_path }}" name: "pr_review_payload" diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index bc7db519b64..cb000f8b95c 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -177,7 +177,7 @@ jobs: if: always() - name: Save the test output if: always() - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: test-output-${{ matrix.ubuntu }}-${{ matrix.r }} path: r/check/arrow.Rcheck/tests/testthat.Rout* @@ -237,7 +237,7 @@ jobs: if: always() - name: Save the test output if: always() - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: test-output-bundled path: r/check/arrow.Rcheck/tests/testthat.Rout* @@ -299,7 +299,7 @@ jobs: # So that they're unique when multiple are downloaded in the next step shell: bash run: mv libarrow.zip libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip - - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: libarrow-rtools${{ 
matrix.config.rtools }}-${{ matrix.config.arch }}.zip path: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip diff --git a/c_glib/parquet-glib/arrow-file-writer.cpp b/c_glib/parquet-glib/arrow-file-writer.cpp index 2b8e2bdeac0..738fb4fd824 100644 --- a/c_glib/parquet-glib/arrow-file-writer.cpp +++ b/c_glib/parquet-glib/arrow-file-writer.cpp @@ -574,7 +574,6 @@ gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, /** * gparquet_arrow_file_writer_new_row_group: * @writer: A #GParquetArrowFileWriter. - * @chunk_size: The max number of rows in a row group. * @error: (nullable): Return location for a #GError or %NULL. * * Start a new row group. @@ -584,13 +583,11 @@ gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, * Since: 18.0.0 */ gboolean -gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, - gsize chunk_size, - GError **error) +gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, GError **error) { auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); return garrow::check(error, - parquet_arrow_file_writer->NewRowGroup(chunk_size), + parquet_arrow_file_writer->NewRowGroup(), "[parquet][arrow][file-writer][new-row-group]"); } diff --git a/c_glib/parquet-glib/arrow-file-writer.h b/c_glib/parquet-glib/arrow-file-writer.h index 2c82f7c1f87..4986430c951 100644 --- a/c_glib/parquet-glib/arrow-file-writer.h +++ b/c_glib/parquet-glib/arrow-file-writer.h @@ -135,9 +135,7 @@ gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, GPARQUET_AVAILABLE_IN_18_0 gboolean -gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, - gsize chunk_size, - GError **error); +gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, GError **error); GPARQUET_AVAILABLE_IN_18_0 gboolean diff --git a/c_glib/test/parquet/test-arrow-file-writer.rb b/c_glib/test/parquet/test-arrow-file-writer.rb index 
d8344bf1c50..418de4782d0 100644 --- a/c_glib/test/parquet/test-arrow-file-writer.rb +++ b/c_glib/test/parquet/test-arrow-file-writer.rb @@ -89,10 +89,10 @@ def test_write_table def test_write_chunked_array schema = build_schema("enabled" => :boolean) writer = Parquet::ArrowFileWriter.new(schema, @file.path) - writer.new_row_group(2) + writer.new_row_group chunked_array = Arrow::ChunkedArray.new([build_boolean_array([true, nil])]) writer.write_chunked_array(chunked_array) - writer.new_row_group(1) + writer.new_row_group chunked_array = Arrow::ChunkedArray.new([build_boolean_array([false])]) writer.write_chunked_array(chunked_array) writer.close diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index 4665a32e24b..751df9b2f3c 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -30,9 +30,5 @@ sphinx-lint sphinxcontrib-jquery sphinxcontrib-mermaid sphinx==6.2 -# Requirement for doctest-cython -# Needs upper pin of 0.3.0, see: -# https://github.com/lgpage/pytest-cython/issues/67 -# With 0.3.* bug fix release, the pin can be removed -pytest-cython==0.2.2 +pytest-cython pandas diff --git a/ci/docker/debian-12-cpp.dockerfile b/ci/docker/debian-12-cpp.dockerfile index f486d07ff88..fe3976248cc 100644 --- a/ci/docker/debian-12-cpp.dockerfile +++ b/ci/docker/debian-12-cpp.dockerfile @@ -84,6 +84,7 @@ RUN apt-get update -y -q && \ ninja-build \ nlohmann-json3-dev \ npm \ + patch \ pkg-config \ protobuf-compiler-grpc \ python3-dev \ diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index 8dc778d544a..259c5fb77fa 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -106,6 +106,7 @@ RUN apt-get update -y -q && \ ninja-build \ nlohmann-json3-dev \ npm \ + patch \ pkg-config \ protobuf-compiler \ python3-dev \ diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index 28cef294638..721b37dcae8 100644 --- 
a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -111,6 +111,7 @@ RUN apt-get update -y -q && \ ninja-build \ nlohmann-json3-dev \ npm \ + patch \ pkg-config \ protobuf-compiler \ protobuf-compiler-grpc \ diff --git a/ci/docker/ubuntu-24.04-cpp.dockerfile b/ci/docker/ubuntu-24.04-cpp.dockerfile index 3f486b09f95..592a9a6a232 100644 --- a/ci/docker/ubuntu-24.04-cpp.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp.dockerfile @@ -111,6 +111,7 @@ RUN apt-get update -y -q && \ ninja-build \ nlohmann-json3-dev \ npm \ + patch \ pkg-config \ protobuf-compiler \ protobuf-compiler-grpc \ diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index abfe6d274f7..f9459f4175c 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4573,11 +4573,16 @@ target_include_directories(arrow::hadoop INTERFACE "${HADOOP_HOME}/include") function(build_orc) message(STATUS "Building Apache ORC from source") + # Remove this and "patch" in "ci/docker/{debian,ubuntu}-*.dockerfile" once we have a patch for ORC 2.1.1 + find_program(PATCH patch REQUIRED) + set(ORC_PATCH_COMMAND ${PATCH} -p1 -i ${CMAKE_CURRENT_LIST_DIR}/orc.diff) + if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.29) fetchcontent_declare(orc ${FC_DECLARE_COMMON_OPTIONS} URL ${ORC_SOURCE_URL} - URL_HASH "SHA256=${ARROW_ORC_BUILD_SHA256_CHECKSUM}") + URL_HASH "SHA256=${ARROW_ORC_BUILD_SHA256_CHECKSUM}" + PATCH_COMMAND ${ORC_PATCH_COMMAND}) prepare_fetchcontent() set(CMAKE_UNITY_BUILD FALSE) @@ -4667,16 +4672,10 @@ function(build_orc) OFF CACHE BOOL "" FORCE) - # We can remove this with ORC 2.0.2 or later. 
- list(PREPEND CMAKE_MODULE_PATH - ${CMAKE_CURRENT_BINARY_DIR}/_deps/orc-src/cmake_modules) - fetchcontent_makeavailable(orc) add_library(orc::orc INTERFACE IMPORTED) target_link_libraries(orc::orc INTERFACE orc) - target_include_directories(orc::orc INTERFACE "${orc_BINARY_DIR}/c++/include" - "${orc_SOURCE_DIR}/c++/include") list(APPEND ARROW_BUNDLED_STATIC_LIBS orc) else() @@ -4701,6 +4700,9 @@ function(build_orc) get_target_property(ORC_ZSTD_ROOT ${ARROW_ZSTD_LIBZSTD} INTERFACE_INCLUDE_DIRECTORIES) get_filename_component(ORC_ZSTD_ROOT "${ORC_ZSTD_ROOT}" DIRECTORY) + get_target_property(ORC_ZLIB_ROOT ZLIB::ZLIB INTERFACE_INCLUDE_DIRECTORIES) + get_filename_component(ORC_ZLIB_ROOT "${ORC_ZLIB_ROOT}" DIRECTORY) + set(ORC_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${ORC_PREFIX}" @@ -4710,7 +4712,6 @@ function(build_orc) -DBUILD_TOOLS=OFF -DBUILD_CPP_TESTS=OFF -DINSTALL_VENDORED_LIBS=OFF - "-DLZ4_HOME=${ORC_LZ4_ROOT}" "-DPROTOBUF_EXECUTABLE=$" "-DPROTOBUF_HOME=${ORC_PROTOBUF_ROOT}" "-DPROTOBUF_INCLUDE_DIR=$" @@ -4718,16 +4719,17 @@ function(build_orc) "-DPROTOC_LIBRARY=$" "-DSNAPPY_HOME=${ORC_SNAPPY_ROOT}" "-DSNAPPY_LIBRARY=$" + "-DLZ4_HOME=${ORC_LZ4_ROOT}" "-DLZ4_LIBRARY=$" "-DLZ4_STATIC_LIB=$" "-DLZ4_INCLUDE_DIR=${ORC_LZ4_ROOT}/include" "-DSNAPPY_INCLUDE_DIR=${ORC_SNAPPY_INCLUDE_DIR}" "-DZSTD_HOME=${ORC_ZSTD_ROOT}" "-DZSTD_INCLUDE_DIR=$" - "-DZSTD_LIBRARY=$") - if(ZLIB_ROOT) - set(ORC_CMAKE_ARGS ${ORC_CMAKE_ARGS} "-DZLIB_HOME=${ZLIB_ROOT}") - endif() + "-DZSTD_LIBRARY=$" + "-DZLIB_HOME=${ORC_ZLIB_ROOT}" + "-DZLIB_INCLUDE_DIR=$" + "-DZLIB_LIBRARY=$") # Work around CMake bug file(MAKE_DIRECTORY ${ORC_INCLUDE_DIR}) @@ -4743,7 +4745,8 @@ function(build_orc) ${ARROW_ZSTD_LIBZSTD} ${Snappy_TARGET} LZ4::lz4 - ZLIB::ZLIB) + ZLIB::ZLIB + PATCH_COMMAND ${ORC_PATCH_COMMAND}) add_library(orc::orc STATIC IMPORTED) set_target_properties(orc::orc PROPERTIES IMPORTED_LOCATION "${ORC_STATIC_LIB}") target_include_directories(orc::orc BEFORE INTERFACE 
"${ORC_INCLUDE_DIR}") diff --git a/cpp/cmake_modules/orc.diff b/cpp/cmake_modules/orc.diff new file mode 100644 index 00000000000..7bdbfa1cf5d --- /dev/null +++ b/cpp/cmake_modules/orc.diff @@ -0,0 +1,289 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 1f8931508..f8e57bf5f 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -30,8 +30,8 @@ SET(CPACK_PACKAGE_VERSION_MAJOR "2") + SET(CPACK_PACKAGE_VERSION_MINOR "1") + SET(CPACK_PACKAGE_VERSION_PATCH "0") + SET(ORC_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") +-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules") + set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # For clang-tidy. 
++list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules") + + option (BUILD_JAVA + "Include ORC Java library in the build process" +@@ -225,5 +225,3 @@ if (BUILD_CPP_TESTS) + ) + endif () + endif () +- +-INCLUDE(CheckFormat) +diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt +index 694667c06..af13a94aa 100644 +--- a/c++/src/CMakeLists.txt ++++ b/c++/src/CMakeLists.txt +@@ -218,8 +218,8 @@ target_include_directories (orc + INTERFACE + $ + PUBLIC +- $ +- $ ++ $ ++ $ + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_BINARY_DIR} +diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake +index 017e6c5b8..fe376ed16 100644 +--- a/cmake_modules/ThirdpartyToolchain.cmake ++++ b/cmake_modules/ThirdpartyToolchain.cmake +@@ -103,13 +103,13 @@ endif () + + # ---------------------------------------------------------------------- + # Macros for adding third-party libraries +-macro (add_resolved_library target_name link_lib include_dir) +- add_library (${target_name} INTERFACE IMPORTED) ++macro (orc_add_resolved_library target_name link_lib include_dir) ++ add_library (${target_name} INTERFACE IMPORTED GLOBAL) + target_link_libraries (${target_name} INTERFACE ${link_lib}) + target_include_directories (${target_name} SYSTEM INTERFACE ${include_dir}) + endmacro () + +-macro (add_built_library external_project_name target_name link_lib include_dir) ++macro (orc_add_built_library external_project_name target_name link_lib include_dir) + file (MAKE_DIRECTORY "${include_dir}") + + add_library (${target_name} STATIC IMPORTED) +@@ -122,7 +122,7 @@ macro (add_built_library external_project_name target_name link_lib include_dir) + endif () + endmacro () + +-function(provide_cmake_module MODULE_NAME) ++function(orc_provide_cmake_module MODULE_NAME) + set(module "${CMAKE_SOURCE_DIR}/cmake_modules/${MODULE_NAME}.cmake") + if(EXISTS "${module}") + message(STATUS "Providing CMake module for ${MODULE_NAME} as part of 
CMake package") +@@ -130,8 +130,8 @@ function(provide_cmake_module MODULE_NAME) + endif() + endfunction() + +-function(provide_find_module PACKAGE_NAME) +- provide_cmake_module("Find${PACKAGE_NAME}") ++function(orc_provide_find_module PACKAGE_NAME) ++ orc_provide_cmake_module("Find${PACKAGE_NAME}") + endfunction() + + # ---------------------------------------------------------------------- +@@ -156,7 +156,7 @@ ExternalProject_Add (orc-format_ep + # Snappy + if (ORC_PACKAGE_KIND STREQUAL "conan") + find_package (Snappy REQUIRED CONFIG) +- add_resolved_library (orc_snappy ${Snappy_LIBRARIES} ${Snappy_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_snappy ${Snappy_LIBRARIES} ${Snappy_INCLUDE_DIR}) + list (APPEND ORC_SYSTEM_DEPENDENCIES Snappy) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") +@@ -168,13 +168,13 @@ elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") + elseif (NOT "${SNAPPY_HOME}" STREQUAL "") + find_package (Snappy REQUIRED) + if (ORC_PREFER_STATIC_SNAPPY AND SNAPPY_STATIC_LIB) +- add_resolved_library (orc_snappy ${SNAPPY_STATIC_LIB} ${SNAPPY_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_snappy ${SNAPPY_STATIC_LIB} ${SNAPPY_INCLUDE_DIR}) + else () +- add_resolved_library (orc_snappy ${SNAPPY_LIBRARY} ${SNAPPY_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_snappy ${SNAPPY_LIBRARY} ${SNAPPY_INCLUDE_DIR}) + endif () + list (APPEND ORC_SYSTEM_DEPENDENCIES Snappy) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +- provide_find_module (Snappy) ++ orc_provide_find_module (Snappy) + else () + set(SNAPPY_HOME "${THIRDPARTY_DIR}/snappy_ep-install") + set(SNAPPY_INCLUDE_DIR "${SNAPPY_HOME}/include") +@@ -194,7 +194,7 @@ else () + ${THIRDPARTY_LOG_OPTIONS} + BUILD_BYPRODUCTS "${SNAPPY_STATIC_LIB}") + +- add_built_library (snappy_ep orc_snappy ${SNAPPY_STATIC_LIB} ${SNAPPY_INCLUDE_DIR}) ++ orc_add_built_library (snappy_ep orc_snappy ${SNAPPY_STATIC_LIB} ${SNAPPY_INCLUDE_DIR}) + + list (APPEND ORC_VENDOR_DEPENDENCIES 
"orc::vendored_snappy|${SNAPPY_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +@@ -207,7 +207,7 @@ add_library (orc::snappy ALIAS orc_snappy) + + if (ORC_PACKAGE_KIND STREQUAL "conan") + find_package (ZLIB REQUIRED CONFIG) +- add_resolved_library (orc_zlib ${ZLIB_LIBRARIES} ${ZLIB_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_zlib ${ZLIB_LIBRARIES} ${ZLIB_INCLUDE_DIR}) + list (APPEND ORC_SYSTEM_DEPENDENCIES ZLIB) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") +@@ -219,13 +219,13 @@ elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") + elseif (NOT "${ZLIB_HOME}" STREQUAL "") + find_package (ZLIB REQUIRED) + if (ORC_PREFER_STATIC_ZLIB AND ZLIB_STATIC_LIB) +- add_resolved_library (orc_zlib ${ZLIB_STATIC_LIB} ${ZLIB_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_zlib ${ZLIB_STATIC_LIB} ${ZLIB_INCLUDE_DIR}) + else () +- add_resolved_library (orc_zlib ${ZLIB_LIBRARY} ${ZLIB_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_zlib ${ZLIB_LIBRARY} ${ZLIB_INCLUDE_DIR}) + endif () + list (APPEND ORC_SYSTEM_DEPENDENCIES ZLIB) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +- provide_find_module (ZLIB) ++ orc_provide_find_module (ZLIB) + else () + set(ZLIB_PREFIX "${THIRDPARTY_DIR}/zlib_ep-install") + set(ZLIB_INCLUDE_DIR "${ZLIB_PREFIX}/include") +@@ -252,7 +252,7 @@ else () + ${THIRDPARTY_LOG_OPTIONS} + BUILD_BYPRODUCTS "${ZLIB_STATIC_LIB}") + +- add_built_library (zlib_ep orc_zlib ${ZLIB_STATIC_LIB} ${ZLIB_INCLUDE_DIR}) ++ orc_add_built_library (zlib_ep orc_zlib ${ZLIB_STATIC_LIB} ${ZLIB_INCLUDE_DIR}) + + list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_zlib|${ZLIB_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +@@ -265,7 +265,7 @@ add_library (orc::zlib ALIAS orc_zlib) + + if (ORC_PACKAGE_KIND STREQUAL "conan") + find_package (ZSTD REQUIRED CONFIG) +- add_resolved_library (orc_zstd ${zstd_LIBRARIES} ${zstd_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_zstd ${zstd_LIBRARIES} 
${zstd_INCLUDE_DIR}) + list (APPEND ORC_SYSTEM_DEPENDENCIES ZSTD) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$,zstd::libzstd_shared,zstd::libzstd_static>>") + elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") +@@ -277,14 +277,14 @@ elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") + elseif (NOT "${ZSTD_HOME}" STREQUAL "") + find_package (ZSTD REQUIRED) + if (ORC_PREFER_STATIC_ZSTD AND ZSTD_STATIC_LIB) +- add_resolved_library (orc_zstd ${ZSTD_STATIC_LIB} ${ZSTD_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_zstd ${ZSTD_STATIC_LIB} ${ZSTD_INCLUDE_DIR}) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + else () +- add_resolved_library (orc_zstd ${ZSTD_LIBRARY} ${ZSTD_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_zstd ${ZSTD_LIBRARY} ${ZSTD_INCLUDE_DIR}) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$,zstd::libzstd_shared,zstd::libzstd_static>>") + endif () + list (APPEND ORC_SYSTEM_DEPENDENCIES ZSTD) +- provide_find_module (ZSTD) ++ orc_provide_find_module (ZSTD) + else () + set(ZSTD_HOME "${THIRDPARTY_DIR}/zstd_ep-install") + set(ZSTD_INCLUDE_DIR "${ZSTD_HOME}/include") +@@ -318,7 +318,7 @@ else () + ${THIRDPARTY_LOG_OPTIONS} + BUILD_BYPRODUCTS ${ZSTD_STATIC_LIB}) + +- add_built_library (zstd_ep orc_zstd ${ZSTD_STATIC_LIB} ${ZSTD_INCLUDE_DIR}) ++ orc_add_built_library (zstd_ep orc_zstd ${ZSTD_STATIC_LIB} ${ZSTD_INCLUDE_DIR}) + + list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_zstd|${ZSTD_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +@@ -330,7 +330,7 @@ add_library (orc::zstd ALIAS orc_zstd) + # LZ4 + if (ORC_PACKAGE_KIND STREQUAL "conan") + find_package (LZ4 REQUIRED CONFIG) +- add_resolved_library (orc_lz4 ${lz4_LIBRARIES} ${lz4_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_lz4 ${lz4_LIBRARIES} ${lz4_INCLUDE_DIR}) + list (APPEND ORC_SYSTEM_DEPENDENCIES LZ4) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") +@@ -342,13 +342,13 @@ elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") + elseif (NOT "${LZ4_HOME}" 
STREQUAL "") + find_package (LZ4 REQUIRED) + if (ORC_PREFER_STATIC_LZ4 AND LZ4_STATIC_LIB) +- add_resolved_library (orc_lz4 ${LZ4_STATIC_LIB} ${LZ4_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_lz4 ${LZ4_STATIC_LIB} ${LZ4_INCLUDE_DIR}) + else () +- add_resolved_library (orc_lz4 ${LZ4_LIBRARY} ${LZ4_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_lz4 ${LZ4_LIBRARY} ${LZ4_INCLUDE_DIR}) + endif () + list (APPEND ORC_SYSTEM_DEPENDENCIES LZ4) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +- provide_find_module (LZ4) ++ orc_provide_find_module (LZ4) + else () + set(LZ4_PREFIX "${THIRDPARTY_DIR}/lz4_ep-install") + set(LZ4_INCLUDE_DIR "${LZ4_PREFIX}/include") +@@ -375,7 +375,7 @@ else () + ${THIRDPARTY_LOG_OPTIONS} + BUILD_BYPRODUCTS ${LZ4_STATIC_LIB}) + +- add_built_library (lz4_ep orc_lz4 ${LZ4_STATIC_LIB} ${LZ4_INCLUDE_DIR}) ++ orc_add_built_library (lz4_ep orc_lz4 ${LZ4_STATIC_LIB} ${LZ4_INCLUDE_DIR}) + + list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_lz4|${LZ4_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +@@ -491,7 +491,7 @@ endif () + + if (ORC_PACKAGE_KIND STREQUAL "conan") + find_package (Protobuf REQUIRED CONFIG) +- add_resolved_library (orc_protobuf ${protobuf_LIBRARIES} ${protobuf_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_protobuf ${protobuf_LIBRARIES} ${protobuf_INCLUDE_DIR}) + list (APPEND ORC_SYSTEM_DEPENDENCIES Protobuf) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") + elseif (ORC_PACKAGE_KIND STREQUAL "vcpkg") +@@ -505,20 +505,20 @@ elseif (NOT "${PROTOBUF_HOME}" STREQUAL "") + find_package (Protobuf REQUIRED) + + if (ORC_PREFER_STATIC_PROTOBUF AND PROTOBUF_STATIC_LIB) +- add_resolved_library (orc_protobuf ${PROTOBUF_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_protobuf ${PROTOBUF_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) + else () +- add_resolved_library (orc_protobuf ${PROTOBUF_LIBRARY} ${PROTOBUF_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_protobuf ${PROTOBUF_LIBRARY} 
${PROTOBUF_INCLUDE_DIR}) + endif () + + if (ORC_PREFER_STATIC_PROTOBUF AND PROTOC_STATIC_LIB) +- add_resolved_library (orc_protoc ${PROTOC_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_protoc ${PROTOC_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) + else () +- add_resolved_library (orc_protoc ${PROTOC_LIBRARY} ${PROTOBUF_INCLUDE_DIR}) ++ orc_add_resolved_library (orc_protoc ${PROTOC_LIBRARY} ${PROTOBUF_INCLUDE_DIR}) + endif () + + list (APPEND ORC_SYSTEM_DEPENDENCIES Protobuf) + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +- provide_find_module (Protobuf) ++ orc_provide_find_module (Protobuf) + else () + set(PROTOBUF_PREFIX "${THIRDPARTY_DIR}/protobuf_ep-install") + set(PROTOBUF_INCLUDE_DIR "${PROTOBUF_PREFIX}/include") +@@ -556,8 +556,8 @@ else () + ${THIRDPARTY_LOG_OPTIONS} + BUILD_BYPRODUCTS "${PROTOBUF_STATIC_LIB}" "${PROTOC_STATIC_LIB}") + +- add_built_library (protobuf_ep orc_protobuf ${PROTOBUF_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) +- add_built_library (protobuf_ep orc_protoc ${PROTOC_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) ++ orc_add_built_library (protobuf_ep orc_protobuf ${PROTOBUF_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) ++ orc_add_built_library (protobuf_ep orc_protoc ${PROTOC_STATIC_LIB} ${PROTOBUF_INCLUDE_DIR}) + + list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_protobuf|${PROTOBUF_STATIC_LIB_NAME}") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$") +@@ -610,7 +610,7 @@ if(BUILD_LIBHDFSPP) + BUILD_BYPRODUCTS "${LIBHDFSPP_STATIC_LIB}" + CMAKE_ARGS ${LIBHDFSPP_CMAKE_ARGS}) + +- add_built_library(libhdfspp_ep libhdfspp ${LIBHDFSPP_STATIC_LIB} ${LIBHDFSPP_INCLUDE_DIR}) ++ orc_add_built_library(libhdfspp_ep libhdfspp ${LIBHDFSPP_STATIC_LIB} ${LIBHDFSPP_INCLUDE_DIR}) + + set (LIBHDFSPP_LIBRARIES + libhdfspp diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 6e2294371e7..7a0787a4edc 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -756,6 +756,7 @@ if(ARROW_COMPUTE) 
compute/kernels/aggregate_tdigest.cc compute/kernels/aggregate_var_std.cc compute/kernels/hash_aggregate.cc + compute/kernels/pivot_internal.cc compute/kernels/scalar_arithmetic.cc compute/kernels/scalar_boolean.cc compute/kernels/scalar_compare.cc @@ -771,13 +772,14 @@ if(ARROW_COMPUTE) compute/kernels/scalar_validity.cc compute/kernels/vector_array_sort.cc compute/kernels/vector_cumulative_ops.cc - compute/kernels/vector_pairwise.cc compute/kernels/vector_nested.cc + compute/kernels/vector_pairwise.cc compute/kernels/vector_rank.cc compute/kernels/vector_replace.cc compute/kernels/vector_run_end_encode.cc compute/kernels/vector_select_k.cc compute/kernels/vector_sort.cc + compute/kernels/vector_swizzle.cc compute/key_hash_internal.cc compute/key_map_internal.cc compute/light_array_internal.cc diff --git a/cpp/src/arrow/acero/accumulation_queue.h b/cpp/src/arrow/acero/accumulation_queue.h index a173f984038..92d62d5d99d 100644 --- a/cpp/src/arrow/acero/accumulation_queue.h +++ b/cpp/src/arrow/acero/accumulation_queue.h @@ -22,6 +22,7 @@ #include #include +#include "arrow/acero/visibility.h" #include "arrow/compute/exec.h" #include "arrow/result.h" @@ -70,7 +71,7 @@ class AccumulationQueue { /// For example, in a top-n node, the process callback should determine how many /// rows need to be delivered for the given batch, and then return a task to actually /// deliver those rows. 
-class SequencingQueue { +class ARROW_ACERO_EXPORT SequencingQueue { public: using Task = std::function; @@ -123,7 +124,7 @@ class SequencingQueue { /// /// It can be helpful to think of this as if a dedicated thread is running Process as /// batches arrive -class SerialSequencingQueue { +class ARROW_ACERO_EXPORT SerialSequencingQueue { public: /// Strategy that describes how to handle items class Processor { diff --git a/cpp/src/arrow/acero/groupby_aggregate_node.cc b/cpp/src/arrow/acero/groupby_aggregate_node.cc index 06b034ab2d4..cd98afbeb0a 100644 --- a/cpp/src/arrow/acero/groupby_aggregate_node.cc +++ b/cpp/src/arrow/acero/groupby_aggregate_node.cc @@ -96,9 +96,13 @@ Result> GroupByNode::MakeAggregateNodeArg // Find input field indices for aggregates std::vector> agg_src_fieldsets(aggs.size()); + // ARROW_LOG(INFO) << "input schema: " << input_schema->ToString(); for (size_t i = 0; i < aggs.size(); ++i) { const auto& target_fieldset = aggs[i].target; + // ARROW_LOG(INFO) << "target #" << i << " has " << target_fieldset.size() << " + // targets"; for (const auto& target : target_fieldset) { + // ARROW_LOG(INFO) << " ... " << target.ToString(); ARROW_ASSIGN_OR_RAISE(auto match, target.FindOne(*input_schema)); agg_src_fieldsets[i].push_back(match[0]); } @@ -108,6 +112,8 @@ Result> GroupByNode::MakeAggregateNodeArg std::vector> agg_src_types(aggs.size()); for (size_t i = 0; i < aggs.size(); ++i) { for (const auto& agg_src_field_id : agg_src_fieldsets[i]) { + // ARROW_LOG(INFO) << "target #" << i << " field = " << + // input_schema->field(agg_src_field_id)->ToString(); agg_src_types[i].push_back(input_schema->field(agg_src_field_id)->type().get()); } } @@ -282,6 +288,11 @@ Status GroupByNode::Merge() { DCHECK(state0->agg_states[span_i]); batch_ctx.SetState(state0->agg_states[span_i].get()); + // XXX this resizes each KernelState (state0->agg_states[span_i]) multiple times. + // An alternative would be a two-pass algorithm: + // 1. 
Compute all transpositions (one per local state) and the final number of + // groups. + // 2. Process all agg kernels, resizing each KernelState only once. RETURN_NOT_OK( agg_kernels_[span_i]->resize(&batch_ctx, state0->grouper->num_groups())); RETURN_NOT_OK(agg_kernels_[span_i]->merge( diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc b/cpp/src/arrow/acero/hash_aggregate_test.cc index 1e2975afc91..301368ffabb 100644 --- a/cpp/src/arrow/acero/hash_aggregate_test.cc +++ b/cpp/src/arrow/acero/hash_aggregate_test.cc @@ -33,13 +33,9 @@ #include "arrow/array/concatenate.h" #include "arrow/chunked_array.h" #include "arrow/compute/api_aggregate.h" -#include "arrow/compute/api_scalar.h" -#include "arrow/compute/api_vector.h" #include "arrow/compute/cast.h" #include "arrow/compute/exec.h" #include "arrow/compute/exec_internal.h" -#include "arrow/compute/kernels/aggregate_internal.h" -#include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/registry.h" #include "arrow/compute/row/grouper.h" #include "arrow/compute/row/grouper_internal.h" @@ -51,9 +47,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/async_generator.h" -#include "arrow/util/bitmap_reader.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/int_util_overflow.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/string.h" @@ -65,7 +59,6 @@ using testing::HasSubstr; namespace arrow { -using internal::BitmapReader; using internal::checked_cast; using internal::checked_pointer_cast; using internal::ToChars; @@ -76,6 +69,7 @@ using compute::default_exec_context; using compute::ExecSpan; using compute::FunctionOptions; using compute::Grouper; +using compute::PivotOptions; using compute::RowSegmenter; using compute::ScalarAggregateOptions; using compute::Segment; @@ -1489,6 +1483,7 @@ class GroupBy : public ::testing::TestWithParam { return acero::GroupByTest(GetParam(), arguments, keys, aggregates, 
use_threads); } + // TODO why not rename this to GroupByTest? Result AltGroupBy(const std::vector& arguments, const std::vector& keys, const std::vector& segment_keys, @@ -5269,6 +5264,60 @@ TEST_P(GroupBy, OnlyKeys) { } } +// TODO unused key_names +// TODO unexpected key_names +// TODO duplicate values +// TODO duplicate keys +// TODO nulls in keys +// TODO nulls in values +TEST_P(GroupBy, PivotFloatValues) { + auto value_type = float32(); + for (bool use_threads : {false, true}) { + for (const auto& key_type : BaseBinaryTypes()) { + ARROW_SCOPED_TRACE("key_type = ", *key_type); + ARROW_SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); + + auto table = + TableFromJSON(schema({field("group_key", int64()), field("key", utf8()), + field("value", value_type)}), + {R"([ + [1, "width", 10.5], + [2, "width", 11.5] + ])", + R"([ + [2, "height", 12.5], + [3, "width", 13.5], + [1, "height", 14.5] + ])"}); + + auto options = + std::make_shared(PivotOptions(/*key_names=*/{"height", "width"})); + Aggregate agg{"hash_pivot", options, + /*target=*/std::vector{"agg_0", "agg_1"}, /*name=*/"out"}; + ASSERT_OK_AND_ASSIGN( + Datum aggregated_and_grouped, + AltGroupBy({table->GetColumnByName("key"), table->GetColumnByName("value")}, + {table->GetColumnByName("group_key")}, + /*segment_keys=*/{}, {agg}, use_threads)); + ValidateOutput(aggregated_and_grouped); + + AssertDatumsEqual( + ArrayFromJSON(struct_({ + field("key_0", int64()), + field("out", struct_({field("height", value_type), + field("width", value_type)})), + }), + R"([ + [1, {"height": 14.5, "width": 10.5} ], + [2, {"height": 12.5, "width": 11.5} ], + [3, {"height": null, "width": 13.5} ] + ])"), + aggregated_and_grouped, + /*verbose=*/true); + } + } +} + INSTANTIATE_TEST_SUITE_P(GroupBy, GroupBy, ::testing::Values(RunGroupByImpl)); class SegmentedScalarGroupBy : public GroupBy {}; diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 76ad9c7d650..7dbed7163da 
100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -3370,8 +3370,10 @@ TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBVarLength)) { constexpr int value_no_match_length_min = 128; constexpr int value_no_match_length_max = 129; constexpr int value_match_length = 130; + // The value "DDD..." will be hashed to the partition over 4GB of the hash table. + // Matching at this area gives us more coverage. const auto value_match = - std::make_shared(std::string(value_match_length, 'X')); + std::make_shared(std::string(value_match_length, 'D')); constexpr int16_t num_rows_per_batch_left = 128; constexpr int16_t num_rows_per_batch_right = 4096; const int64_t num_batches_left = 8; diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index c068eeb50ff..fc3be1b462e 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -439,11 +439,11 @@ Status RowArrayMerge::PrepareForMerge(RowArray* target, num_rows = 0; num_bytes = 0; for (size_t i = 0; i < sources.size(); ++i) { - target->rows_.mutable_offsets()[num_rows] = static_cast(num_bytes); + target->rows_.mutable_offsets()[num_rows] = num_bytes; num_rows += sources[i]->rows_.length(); num_bytes += sources[i]->rows_.offsets()[sources[i]->rows_.length()]; } - target->rows_.mutable_offsets()[num_rows] = static_cast(num_bytes); + target->rows_.mutable_offsets()[num_rows] = num_bytes; } return Status::OK(); diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index b9d6c53215b..b3c314fccc0 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -235,7 +235,7 @@ void AssertTableWriteReadEqual(const std::vector>& input_ write_options.compression = Compression::UNCOMPRESSED; #endif write_options.file_version = adapters::orc::FileVersion(0, 11); - write_options.compression_block_size = 32768; + 
write_options.compression_block_size = 64 * 1024; write_options.row_index_stride = 5000; EXPECT_OK_AND_ASSIGN(auto writer, adapters::orc::ORCFileWriter::Open( buffer_output_stream.get(), write_options)); @@ -272,7 +272,7 @@ void AssertBatchWriteReadEqual( write_options.compression = Compression::UNCOMPRESSED; #endif write_options.file_version = adapters::orc::FileVersion(0, 11); - write_options.compression_block_size = 32768; + write_options.compression_block_size = 64 * 1024; write_options.row_index_stride = 5000; EXPECT_OK_AND_ASSIGN(auto writer, adapters::orc::ORCFileWriter::Open( buffer_output_stream.get(), write_options)); @@ -330,7 +330,7 @@ std::unique_ptr CreateWriter(uint64_t stripe_size, liborc::OutputStream* stream) { liborc::WriterOptions options; options.setStripeSize(stripe_size); - options.setCompressionBlockSize(1024); + options.setCompressionBlockSize(64 * 1024); options.setMemoryPool(liborc::getDefaultPool()); options.setRowIndexStride(0); return liborc::createWriter(type, stream, options); @@ -668,7 +668,7 @@ TEST_F(TestORCWriterTrivialNoWrite, noWrite) { write_options.compression = Compression::UNCOMPRESSED; #endif write_options.file_version = adapters::orc::FileVersion(0, 11); - write_options.compression_block_size = 32768; + write_options.compression_block_size = 64 * 1024; write_options.row_index_stride = 5000; EXPECT_OK_AND_ASSIGN(auto writer, adapters::orc::ORCFileWriter::Open( buffer_output_stream.get(), write_options)); diff --git a/cpp/src/arrow/compute/api_aggregate.cc b/cpp/src/arrow/compute/api_aggregate.cc index 49d87096606..d8eeb6366dc 100644 --- a/cpp/src/arrow/compute/api_aggregate.cc +++ b/cpp/src/arrow/compute/api_aggregate.cc @@ -24,8 +24,8 @@ #include "arrow/util/logging.h" namespace arrow { - namespace internal { + template <> struct EnumTraits : BasicEnumTraits return ""; } }; + +template <> +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "PivotOptions::UnexpectedKeyBehavior"; } + static 
std::string value_name(compute::PivotOptions::UnexpectedKeyBehavior value) { + switch (value) { + case compute::PivotOptions::kIgnore: + return "kIgnore"; + case compute::PivotOptions::kRaise: + return "kRaise"; + } + return ""; + } +}; + } // namespace internal namespace compute { @@ -101,6 +118,9 @@ static auto kTDigestOptionsType = GetFunctionOptionsType( DataMember("buffer_size", &TDigestOptions::buffer_size), DataMember("skip_nulls", &TDigestOptions::skip_nulls), DataMember("min_count", &TDigestOptions::min_count)); +static auto kPivotOptionsType = GetFunctionOptionsType( + DataMember("key_names", &PivotOptions::key_names), + DataMember("unexpected_key_behavior", &PivotOptions::unexpected_key_behavior)); static auto kIndexOptionsType = GetFunctionOptionsType(DataMember("value", &IndexOptions::value)); } // namespace @@ -164,6 +184,13 @@ TDigestOptions::TDigestOptions(std::vector q, uint32_t delta, min_count{min_count} {} constexpr char TDigestOptions::kTypeName[]; +PivotOptions::PivotOptions(std::vector key_names, + UnexpectedKeyBehavior unexpected_key_behavior) + : FunctionOptions(internal::kPivotOptionsType), + key_names(std::move(key_names)), + unexpected_key_behavior(unexpected_key_behavior) {} +PivotOptions::PivotOptions() : FunctionOptions(internal::kPivotOptionsType) {} + IndexOptions::IndexOptions(std::shared_ptr value) : FunctionOptions(internal::kIndexOptionsType), value{std::move(value)} {} IndexOptions::IndexOptions() : IndexOptions(std::make_shared()) {} @@ -177,6 +204,7 @@ void RegisterAggregateOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kVarianceOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kQuantileOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kTDigestOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kPivotOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kIndexOptionsType)); } } // namespace internal diff --git a/cpp/src/arrow/compute/api_aggregate.h 
b/cpp/src/arrow/compute/api_aggregate.h index 2e5210b073e..886d992640c 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -175,6 +175,75 @@ class ARROW_EXPORT TDigestOptions : public FunctionOptions { uint32_t min_count; }; +/// \brief Control Pivot kernel behavior +/// +/// These options apply to the "pivot" (TODO) and "hash_pivot" (TODO) functions. +/// +/// Constraints: +/// - The corresponding `Aggregate::target` must have two FieldRef elements; +/// the first one points to the pivot key column, the second points to the +/// pivoted data column. +/// - The pivot key column must be string-like; its values will be matched +/// against `key_names` in order to dispatch the pivoted data into the +/// output. +/// +/// "hash_pivot" example +/// -------------------- +/// +/// Assuming the following input with schema +/// `{"group": int32, "key": utf8, "value": int16}`: +/// ``` +/// group | key | value +/// ----------------------------- +/// 1 | height | 11 +/// 1 | width | 12 +/// 2 | width | 13 +/// 3 | height | 14 +/// 3 | depth | 15 +/// ``` +/// and the following settings: +/// - a hash grouping key "group" +/// - Aggregate( +/// .function = "hash_pivot", +/// .options = PivotOptions(.key_names = {"height", "width"}), +/// .target = {"key", "value"}, +/// .name = {"props"}) +/// +/// then the output will have the schema +/// `{"group": int32, "props": struct{"height": int16, "width": int16}}` +/// and the following value: +/// ``` +/// group | props +/// | height | width +/// ----------------------------- +/// 1 | 11 | 12 +/// 2 | null | 13 +/// 3 | 14 | null +/// ``` +class ARROW_EXPORT PivotOptions : public FunctionOptions { + public: + // Configure the behavior of pivot keys not in `key_names` + enum UnexpectedKeyBehavior { + // Unexpected pivot keys are ignored silently + kIgnore, + // Unexpected pivot keys return a KeyError + kRaise + }; + // TODO should duplicate key behavior be configurable as well? 
+ + explicit PivotOptions(std::vector key_names, + UnexpectedKeyBehavior unexpected_key_behavior = kIgnore); + // Default constructor for serialization + PivotOptions(); + static constexpr char const kTypeName[] = "PivotOptions"; + static PivotOptions Defaults() { return PivotOptions{}; } + + // The values expected in the pivot key column + std::vector key_names; + // The behavior when pivot keys not in `key_names` are encountered + UnexpectedKeyBehavior unexpected_key_behavior = kIgnore; +}; + /// \brief Control Index kernel behavior class ARROW_EXPORT IndexOptions : public FunctionOptions { public: diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index f0d5c0fcc3d..22ecf1cc878 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -155,6 +155,12 @@ static auto kPairwiseOptionsType = GetFunctionOptionsType( DataMember("periods", &PairwiseOptions::periods)); static auto kListFlattenOptionsType = GetFunctionOptionsType( DataMember("recursive", &ListFlattenOptions::recursive)); +static auto kInversePermutationOptionsType = + GetFunctionOptionsType( + DataMember("max_index", &InversePermutationOptions::max_index), + DataMember("output_type", &InversePermutationOptions::output_type)); +static auto kScatterOptionsType = GetFunctionOptionsType( + DataMember("max_index", &ScatterOptions::max_index)); } // namespace } // namespace internal @@ -230,6 +236,17 @@ ListFlattenOptions::ListFlattenOptions(bool recursive) : FunctionOptions(internal::kListFlattenOptionsType), recursive(recursive) {} constexpr char ListFlattenOptions::kTypeName[]; +InversePermutationOptions::InversePermutationOptions( + int64_t max_index, std::shared_ptr output_type) + : FunctionOptions(internal::kInversePermutationOptionsType), + max_index(max_index), + output_type(std::move(output_type)) {} +constexpr char InversePermutationOptions::kTypeName[]; + +ScatterOptions::ScatterOptions(int64_t max_index) + : 
FunctionOptions(internal::kScatterOptionsType), max_index(max_index) {} +constexpr char ScatterOptions::kTypeName[]; + namespace internal { void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kFilterOptionsType)); @@ -244,6 +261,8 @@ void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kRankOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kPairwiseOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kListFlattenOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kInversePermutationOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kScatterOptionsType)); } } // namespace internal @@ -429,5 +448,19 @@ Result CumulativeMean(const Datum& values, const CumulativeOptions& optio return CallFunction("cumulative_mean", {Datum(values)}, &options, ctx); } +// ---------------------------------------------------------------------- +// Swizzle functions + +Result InversePermutation(const Datum& indices, + const InversePermutationOptions& options, + ExecContext* ctx) { + return CallFunction("inverse_permutation", {indices}, &options, ctx); +} + +Result Scatter(const Datum& values, const Datum& indices, + const ScatterOptions& options, ExecContext* ctx) { + return CallFunction("scatter", {values, indices}, &options, ctx); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index e5bcc373296..ada1665b3ec 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -257,6 +257,40 @@ class ARROW_EXPORT ListFlattenOptions : public FunctionOptions { bool recursive = false; }; +/// \brief Options for inverse_permutation function +class ARROW_EXPORT InversePermutationOptions : public FunctionOptions { + public: + explicit InversePermutationOptions(int64_t max_index = -1, + std::shared_ptr output_type = NULLPTR); + static constexpr char const kTypeName[] 
= "InversePermutationOptions"; + static InversePermutationOptions Defaults() { return InversePermutationOptions(); } + + /// \brief The max value in the input indices to allow. The length of the function's + /// output will be this value plus 1. If negative, this value will be set to the length + /// of the input indices minus 1 and the length of the function's output will be the + /// length of the input indices. + int64_t max_index = -1; + /// \brief The type of the output inverse permutation. If null, the output will be of + /// the same type as the input indices, otherwise must be signed integer type. An + /// invalid error will be reported if this type is not able to store the length of the + /// input indices. + std::shared_ptr output_type = NULLPTR; +}; + +/// \brief Options for scatter function +class ARROW_EXPORT ScatterOptions : public FunctionOptions { + public: + explicit ScatterOptions(int64_t max_index = -1); + static constexpr char const kTypeName[] = "ScatterOptions"; + static ScatterOptions Defaults() { return ScatterOptions(); } + + /// \brief The max value in the input indices to allow. The length of the function's + /// output will be this value plus 1. If negative, this value will be set to the length + /// of the input indices minus 1 and the length of the function's output will be the + /// length of the input indices. + int64_t max_index = -1; +}; + /// @} /// \brief Filter with a boolean selection filter @@ -705,5 +739,58 @@ Result> PairwiseDiff(const Array& array, bool check_overflow = false, ExecContext* ctx = NULLPTR); +/// \brief Return the inverse permutation of the given indices. +/// +/// For indices[i] = x, inverse_permutation[x] = i. And inverse_permutation[x] = null if x +/// does not appear in the input indices. Indices must be in the range of [0, max_index], +/// or null, which will be ignored. If multiple indices point to the same value, the last +/// one is used. 
+/// +/// For example, with +/// indices = [null, 0, null, 2, 4, 1, 1] +/// the inverse permutation is +/// [1, 6, 3, null, 4, null, null] +/// if max_index = 6. +/// +/// \param[in] indices array-like indices +/// \param[in] options configures the max index and the output type +/// \param[in] ctx the function execution context, optional +/// \return the resulting inverse permutation +/// +/// \since 20.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result InversePermutation( + const Datum& indices, + const InversePermutationOptions& options = InversePermutationOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Scatter the values into specified positions according to the indices. +/// +/// For indices[i] = x, output[x] = values[i]. And output[x] = null if x does not appear +/// in the input indices. Indices must be in the range of [0, max_index], or null, in +/// which case the corresponding value will be ignored. If multiple indices point to the +/// same value, the last one is used. +/// +/// For example, with +/// values = [a, b, c, d, e, f, g] +/// indices = [null, 0, null, 2, 4, 1, 1] +/// the output is +/// [b, g, d, null, e, null, null] +/// if max_index = 6. 
+/// +/// \param[in] values datum to scatter +/// \param[in] indices array-like indices +/// \param[in] options configures the max index to scatter to +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 20.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Scatter(const Datum& values, const Datum& indices, +               const ScatterOptions& options = ScatterOptions::Defaults(), +               ExecContext* ctx = NULLPTR); + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc index c269de07632..b7d017d4820 100644 --- a/cpp/src/arrow/compute/function_test.cc +++ b/cpp/src/arrow/compute/function_test.cc @@ -136,6 +136,10 @@ TEST(FunctionOptions, Equality) { options.emplace_back(new SelectKOptions(5, {{SortKey("key", SortOrder::Ascending)}})); options.emplace_back(new Utf8NormalizeOptions()); options.emplace_back(new Utf8NormalizeOptions(Utf8NormalizeOptions::NFD)); + options.emplace_back( + new InversePermutationOptions(/*max_index=*/42, /*output_type=*/int32())); + options.emplace_back(new ScatterOptions()); + options.emplace_back(new ScatterOptions(/*max_index=*/42)); for (size_t i = 0; i < options.size(); i++) { const size_t prev_i = i == 0 ? 
options.size() - 1 : i - 1; diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 7c7b9c8b68d..84b508f5d9b 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -115,6 +115,12 @@ add_arrow_compute_test(vector_selection_test EXTRA_LINK_LIBS arrow_compute_kernels_testing) +add_arrow_compute_test(vector_swizzle_test + SOURCES + vector_swizzle_test.cc + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) + add_arrow_benchmark(vector_hash_benchmark PREFIX "arrow-compute") add_arrow_benchmark(vector_sort_benchmark PREFIX "arrow-compute") add_arrow_benchmark(vector_partition_benchmark PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 594bd1fce0b..2a492f581f5 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -1037,8 +1037,9 @@ ArrayKernelExec GenerateFloatingPoint(detail::GetTypeId get_id) { // Generate a kernel given a templated functor for integer types // // See "Numeric" above for description of the generator functor -template