diff --git a/.asf.yaml b/.asf.yaml index 2c66ce5be63..4596f7ac5d0 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -18,9 +18,15 @@ github: description: "Apache Arrow is a multi-language toolbox for accelerated data interchange and in-memory processing" homepage: https://arrow.apache.org/ + collaborators: + - assignUser + - benibus + - milesgranger + - toddfarmer notifications: commits: commits@arrow.apache.org + issues_status: issues@arrow.apache.org issues: github@arrow.apache.org pullrequests: github@arrow.apache.org jira_options: link label worklog diff --git a/.dockerignore b/.dockerignore index 5d6d171fdec..3791cca95e3 100644 --- a/.dockerignore +++ b/.dockerignore @@ -61,3 +61,5 @@ !rust/datafusion/Cargo.toml !rust/datafusion/benches !rust/integration-testing/Cargo.toml +!go/go.mod +!go/go.sum \ No newline at end of file diff --git a/.env b/.env index 2f06cca474b..d93eab06ffb 100644 --- a/.env +++ b/.env @@ -47,24 +47,27 @@ ULIMIT_CORE=-1 # Default versions for platforms ALMALINUX=8 +ALPINE_LINUX=3.16 DEBIAN=11 FEDORA=35 UBUNTU=20.04 # Default versions for various dependencies -CLANG_TOOLS=12 -CUDA=9.1 +CLANG_TOOLS=14 +CUDA=11.0.3 DASK=latest DOTNET=6.0 GCC_VERSION="" -GO=1.16 +GO=1.17 +STATICCHECK=v0.2.2 HDFS=3.2.1 JDK=8 KARTOTHEK=latest # LLVM 12 and GCC 11 reports -Wmismatched-new-delete. -LLVM=13 +LLVM=14 MAVEN=3.5.4 NODE=16 +NUMBA=latest NUMPY=latest PANDAS=latest PYTHON=3.8 @@ -83,8 +86,8 @@ ARROW_R_DEV=TRUE R_PRUNE_DEPS=FALSE TZ=UTC -# -1 does not attempt to install a devtoolset version, any positive integer will install devtoolset-n -DEVTOOLSET_VERSION=-1 +# Any non-empty string will install devtoolset-${DEVTOOLSET_VERSION} +DEVTOOLSET_VERSION= # Used through docker-compose.yml and serves as the default version for the # ci/scripts/install_vcpkg.sh script. Prefer to use short SHAs to keep the @@ -93,7 +96,8 @@ DEVTOOLSET_VERSION=-1 # Please also update the crossbow configuration in order to keep the github # actions cache up to date for the macOS wheels: # https://github.com/ursacomputing/crossbow/blob/master/.github/workflows/cache_vcpkg.yml -VCPKG="38bb87c" +# vcpkg minimum version "09adfdc8cdad76345b7cc7f3305899e1cbd66297" due to CVE-2022-3786 +VCPKG="2871ddd918cecb9cb642bcb9c56897f397283192" # This must be updated when we update # ci/docker/python-wheel-windows-vs2017.dockerfile. diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index bbabe358579..249f159ec48 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -23,7 +23,7 @@ There are many ways to contribute to Apache Arrow: * Contributing code (we call them "patches") * Writing documentation (another form of code, in a way) -* Participating in discussions on JIRA or the mailing list +* Participating in discussions on GitHub issues or the mailing list * Helping users of the libraries ## Reporting bugs and asking questions diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml new file mode 100644 index 00000000000..bea5b96ada4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yaml @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Bug Report +description: File a bug report +labels: ["Type: bug"] +assignees: [] +body: + - type: textarea + id: description + attributes: + label: Describe the bug, including details regarding any error messages, version, and platform. + description: Please include what you expected. + validations: + required: true + - type: dropdown + id: component + attributes: + label: Component(s) + multiple: true + options: + - Archery + - Benchmarking + - C + - C# + - C++ + - C++ - Gandiva + - C++ - Plasma + - Continuous Integration + - Developer Tools + - Documentation + - FlightRPC + - Format + - GLib + - Go + - GPU + - Integration + - Java + - JavaScript + - MATLAB + - Packaging + - Parquet + - Python + - R + - Release + - Ruby + - Swift + - Website + - Other + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 5a050121362..960a754a28d 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -16,7 +16,4 @@ # under the License. blank_issues_enabled: false -contact_links: - - name: Report an issue - url: https://issues.apache.org/jira/browse/ARROW - about: Please report bugs and request features on JIRA. + diff --git a/.github/ISSUE_TEMPLATE/feature_request.yaml b/.github/ISSUE_TEMPLATE/feature_request.yaml new file mode 100644 index 00000000000..1be5d1191d1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yaml @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Enhancement Request +description: Request an enhancement to the project +labels: ["Type: enhancement"] +assignees: [] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to share your feedback on ways Apache Arrow can be improved! 
+ - type: textarea + id: description + attributes: + label: Describe the enhancement requested + validations: + required: true + - type: dropdown + id: component + attributes: + label: Component(s) + multiple: true + options: + - Archery + - Benchmarking + - C + - C# + - C++ + - C++ - Gandiva + - C++ - Plasma + - Continuous Integration + - Developer Tools + - Documentation + - FlightRPC + - Format + - GLib + - Go + - GPU + - Integration + - Java + - JavaScript + - MATLAB + - Packaging + - Parquet + - Python + - R + - Release + - Ruby + - Swift + - Website + - Other + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md deleted file mode 100644 index 9c4b89c5697..00000000000 --- a/.github/ISSUE_TEMPLATE/question.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -name: Ask a question -about: Please ask questions at user@arrow.apache.org ---- - -STOP! Are you reporting a bug, a possible bug, or requesting a -feature? If so, please report under the ARROW project on the ASF JIRA -server https://issues.apache.org/jira/browse/ARROW. This JIRA server -is free to use and open to the public, but you must create an account -if it is your first time. - -See our contribution guidelines for more information: -http://arrow.apache.org/docs/developers/contributing.html - -We have GitHub issues available as a way for new contributors and -passers-by who are unfamiliar with Apache Software Foundation projects -to ask questions and interact with the project. Do not be surprised if -the first response is to open a JIRA issue or to write an e-mail to -one of the public mailing lists: - -* Development discussions: dev@arrow.apache.org (first subscribe by - sending an e-mail to dev-subscribe@arrow.apache.org). -* User discussions: user@arrow.apache.org (first subscribe by - sending an e-mail to user-subscribe@arrow.apache.org). - -Thank you! diff --git a/.github/ISSUE_TEMPLATE/usage_question.yaml b/.github/ISSUE_TEMPLATE/usage_question.yaml new file mode 100644 index 00000000000..0cec8bf10b4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/usage_question.yaml @@ -0,0 +1,88 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Usage Question +description: Ask a question +labels: ["Type: usage"] +assignees: [] +body: + - type: markdown + attributes: + value: > + While we enable issues as a mechanism for new contributors and passers-by who + are unfamiliar with Apache Software Foundation projects to ask questions and + interact with the project, we encourage users to ask such questions on public + mailing lists: + + * Development discussions: dev@arrow.apache.org (first subscribe by sending an + e-mail to dev-subscribe@arrow.apache.org). 
+ + * User discussions: user@arrow.apache.org (first subscribe by sending an e-mail + to user-subscribe@arrow.apache.org). + + * Mailing list archives: https://arrow.apache.org/community/ + + + Do not be surprised by responses to issues raised here directing you to those + mailing lists, or to report a bug or feature request here. + + + Thank you! + - type: textarea + id: description + attributes: + label: > + Describe the usage question you have. Please include as many useful details as + possible. + validations: + required: true + - type: dropdown + id: component + attributes: + label: Component(s) + multiple: true + options: + - Archery + - Benchmarking + - C + - C# + - C++ + - C++ - Gandiva + - C++ - Plasma + - Continuous Integration + - Developer Tools + - Documentation + - FlightRPC + - Format + - GLib + - Go + - GPU + - Integration + - Java + - JavaScript + - MATLAB + - Packaging + - Parquet + - Python + - R + - Release + - Ruby + - Swift + - Website + - Other + validations: + required: true diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 00000000000..62878045451 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,59 @@ + + + +### Rationale for this change + + + +### What changes are included in this PR? + + + +### Are these changes tested? + + + +### Are there any user-facing changes? + + + + \ No newline at end of file diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml index ce5092c8fee..d337ec797cf 100644 --- a/.github/workflows/archery.yml +++ b/.github/workflows/archery.yml @@ -31,10 +31,16 @@ on: - 'dev/tasks/**' - 'docker-compose.yml' +env: + ARCHERY_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} cancel-in-progress: true +permissions: + contents: read + jobs: test: @@ -49,9 +55,7 @@ jobs: fetch-depth: 0 - name: Git Fixup shell: bash - run: | - DEFAULT_BRANCH=${{ github.event.repository.default_branch }} - git branch $DEFAULT_BRANCH origin/$DEFAULT_BRANCH || true + run: git branch $ARCHERY_DEFAULT_BRANCH origin/$ARCHERY_DEFAULT_BRANCH || true - name: Setup Python uses: actions/setup-python@v4 with: diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml index d473593adfc..0711a23f753 100644 --- a/.github/workflows/comment_bot.yml +++ b/.github/workflows/comment_bot.yml @@ -24,6 +24,10 @@ on: - created - edited +permissions: + contents: read + pull-requests: write + jobs: crossbow: name: Listen! @@ -66,8 +70,9 @@ jobs: DEFAULT_BRANCH=${{ github.event.repository.default_branch }} git remote add upstream https://github.com/apache/arrow git fetch upstream + changed() { - git diff --name-only HEAD..upstream/$DEFAULT_BRANCH | grep -e "$1" >/dev/null 2>&1 + git diff --name-only upstream/$DEFAULT_BRANCH... 
| grep -e "$1" >/dev/null 2>&1 } if changed '^r/.*\.R$'; then echo "R_DOCS=true" >> $GITHUB_ENV @@ -82,7 +87,7 @@ jobs: if changed '^r/src'; then echo "CLANG_FORMAT_R=true" >> $GITHUB_ENV fi - - name: Ensure clang-format has the appropriate versoin + - name: Ensure clang-format has the appropriate version if: env.CMAKE_FORMAT == 'true' || env.CLANG_FORMAT_CPP == 'true' || env.CLANG_FORMAT_R == 'true' || @@ -124,18 +129,16 @@ jobs: run: | source("ci/etc/rprofile") install.packages(c("remotes", "roxygen2")) - # We currently need dev roxygen2 (> 7.1.1) until they release - remotes::install_github("r-lib/roxygen2") remotes::install_deps("r") roxygen2::roxygenize("r") - name: Style R code if: env.R_CODE == 'true' || endsWith(github.event.comment.body, 'everything') shell: Rscript {0} run: | - changed_files <- system("git diff --name-only HEAD..upstream/${{ github.event.repository.default_branch }} 2>&1", intern = TRUE) + changed_files <- system("git diff --name-only upstream/${{ github.event.repository.default_branch }}... 2>&1", intern = TRUE) # only grab the .R files under r/ changed_files <- grep('^r/.*\\.R$', changed_files, value = TRUE) - # remove latin1 which is unstylable due to encoding and codegen.R which is unique + # remove codegen.R and other possible exclusions changed_files <- changed_files[!changed_files %in% file.path("r", source("r/.styler_excludes.R")$value)] source("ci/etc/rprofile") install.packages(c("remotes", "styler")) @@ -171,3 +174,21 @@ jobs: with: repo-token: ${{ secrets.GITHUB_TOKEN }} args: "--force" + + issue_assign: + name: "Assign issue" + permissions: + issues: write + if: github.event.comment.body == 'take' + runs-on: ubuntu-latest + steps: + - uses: actions/github-script@v3 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + github.issues.addAssignees({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.payload.issue.number, + assignees: context.payload.comment.user.login + }); diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index ba95fcd509c..4959197fcd9 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -45,18 +45,19 @@ concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} cancel-in-progress: true +permissions: + contents: read + env: ARROW_ENABLE_TIMING_TESTS: OFF DOCKER_VOLUME_PREFIX: ".docker/" - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} jobs: docker: name: ${{ matrix.title }} runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 60 + timeout-minutes: 75 strategy: fail-fast: false matrix: @@ -67,7 +68,9 @@ jobs: - image: conda-cpp title: AMD64 Conda C++ - image: ubuntu-cpp-sanitizer - title: AMD64 Ubuntu 20.04 C++ ASAN UBSAN + title: AMD64 Ubuntu 22.04 C++ ASAN UBSAN + env: + UBUNTU: "22.04" steps: - name: Checkout Arrow uses: actions/checkout@v3 @@ -75,7 +78,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Cache Docker Volumes - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: .docker key: ${{ matrix.image }}-${{ hashFiles('cpp/**') }} @@ -87,12 +90,18 @@ jobs: - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | sudo sysctl -w kernel.core_pattern="core.%e.%p" ulimit -c unlimited archery docker run ${{ matrix.image }} - name: 
Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} continue-on-error: true run: archery docker push ${{ matrix.image }} @@ -101,8 +110,6 @@ jobs: runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 45 - strategy: - fail-fast: false steps: - name: Checkout Arrow uses: actions/checkout@v3 @@ -119,12 +126,10 @@ jobs: docker-compose run --rm minimal macos: - name: AMD64 MacOS 10.15 C++ + name: AMD64 macOS 12 C++ runs-on: macos-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 60 - strategy: - fail-fast: false + timeout-minutes: 75 env: ARROW_BUILD_TESTS: ON ARROW_DATASET: ON @@ -146,8 +151,7 @@ jobs: ARROW_WITH_SNAPPY: ON ARROW_WITH_ZLIB: ON ARROW_WITH_ZSTD: ON - # System Abseil installed by Homebrew uses C++ 17 - CMAKE_CXX_STANDARD: 17 + GTest_SOURCE: BUNDLED steps: - name: Checkout Arrow uses: actions/checkout@v3 @@ -156,9 +160,12 @@ jobs: submodules: recursive - name: Install Dependencies run: | - rm -f /usr/local/bin/2to3 + rm -f /usr/local/bin/2to3* || : + rm -f /usr/local/bin/idle3* || : + rm -f /usr/local/bin/pydoc3* || : + rm -f /usr/local/bin/python3* || : + rm -f /usr/local/bin/python3-config || : brew update --preinstall - brew install --overwrite git brew bundle --file=cpp/Brewfile - name: Install MinIO run: | @@ -172,15 +179,16 @@ jobs: - name: ccache info id: ccache-info run: | - echo "::set-output name=cache-dir::$(ccache --get-config cache_dir)" + echo "cache-dir=$(ccache --get-config cache_dir)" >> $GITHUB_OUTPUT - name: Cache ccache - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ${{ steps.ccache-info.outputs.cache-dir }} key: cpp-ccache-macos-${{ hashFiles('cpp/**') }} restore-keys: cpp-ccache-macos- - name: Build - run: ci/scripts/cpp_build.sh $(pwd) $(pwd)/build + run: | + ci/scripts/cpp_build.sh $(pwd) $(pwd)/build - name: Test shell: bash run: | @@ -193,7 +201,7 @@ jobs: name: AMD64 ${{ matrix.name }} C++17 runs-on: ${{ matrix.os }} if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 45 + timeout-minutes: 60 strategy: fail-fast: false matrix: @@ -202,20 +210,19 @@ jobs: include: - os: windows-2019 name: Windows 2019 - generator: Visual Studio 16 2019 env: ARROW_BOOST_USE_SHARED: OFF ARROW_BUILD_BENCHMARKS: ON ARROW_BUILD_SHARED: ON ARROW_BUILD_STATIC: OFF ARROW_BUILD_TESTS: ON - ARROW_CXXFLAGS: "/std:c++17" ARROW_DATASET: ON ARROW_FLIGHT: OFF ARROW_HDFS: ON ARROW_HOME: /usr ARROW_JEMALLOC: OFF ARROW_MIMALLOC: ON + ARROW_ORC: ON ARROW_PARQUET: ON ARROW_USE_GLOG: OFF ARROW_VERBOSE_THIRDPARTY_BUILD: OFF @@ -227,11 +234,13 @@ jobs: ARROW_WITH_ZLIB: ON ARROW_WITH_ZSTD: ON BOOST_SOURCE: BUNDLED - CMAKE_ARGS: '-A x64 -DOPENSSL_ROOT_DIR=C:\Program Files\OpenSSL-Win64' - CMAKE_GENERATOR: ${{ matrix.generator }} + CMAKE_CXX_STANDARD: "17" + CMAKE_GENERATOR: Ninja CMAKE_INSTALL_LIBDIR: bin CMAKE_INSTALL_PREFIX: /usr CMAKE_UNITY_BUILD: ON + OPENSSL_ROOT_DIR: >- + C:\Program Files\OpenSSL-Win64 NPROC: 3 steps: - name: Disable Crash Dialogs @@ -254,26 +263,56 @@ jobs: - name: Download Timezone Database shell: bash run: ci/scripts/download_tz_database.sh - - name: Build + - name: Install ccache + shell: bash + run: | + ci/scripts/install_ccache.sh 4.6.3 /usr + - name: Setup ccache + shell: bash + run: | + ci/scripts/ccache_setup.sh + - name: ccache info + id: ccache-info shell: bash - run: 
ci/scripts/cpp_build.sh $(pwd) $(pwd)/build + run: | + echo "cache-dir=$(ccache --get-config cache_dir)" >> $GITHUB_OUTPUT + - name: Cache ccache + uses: actions/cache@v3 + with: + path: ${{ steps.ccache-info.outputs.cache-dir }} + key: cpp-ccache-windows-${{ env.CACHE_VERSION }}-${{ hashFiles('cpp/**') }} + restore-keys: cpp-ccache-windows-${{ env.CACHE_VERSION }}- + env: + # We can invalidate the current cache by updating this. + CACHE_VERSION: "2022-09-13" + - name: Build + shell: cmd + run: | + call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + bash -c "ci/scripts/cpp_build.sh $(pwd) $(pwd)/build" - name: Test shell: bash - run: ci/scripts/cpp_test.sh $(pwd) $(pwd)/build + run: | + # For ORC + export TZDIR=/c/msys64/usr/share/zoneinfo + ci/scripts/cpp_test.sh $(pwd) $(pwd)/build windows-mingw: - name: AMD64 Windows MinGW ${{ matrix.mingw-n-bits }} C++ + name: AMD64 Windows MinGW ${{ matrix.msystem_upper }} C++ runs-on: windows-2019 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - # Build may take 1h+ without cache and installing Google Cloud - # Storage Testbench may take 20m+ without cache. + # Build may take 1h+ without cache. timeout-minutes: 120 strategy: fail-fast: false matrix: - mingw-n-bits: - - 32 - - 64 + include: + - msystem_lower: mingw32 + msystem_upper: MINGW32 + - msystem_lower: mingw64 + msystem_upper: MINGW64 + - msystem_lower: clang64 + msystem_upper: CLANG64 env: ARROW_BUILD_SHARED: ON ARROW_BUILD_STATIC: OFF @@ -285,10 +324,9 @@ jobs: ARROW_GANDIVA: ON ARROW_GCS: ON ARROW_HDFS: OFF - ARROW_HOME: /mingw${{ matrix.mingw-n-bits }} + ARROW_HOME: /${{ matrix.msystem_lower}} ARROW_JEMALLOC: OFF ARROW_PARQUET: ON - ARROW_PYTHON: ON ARROW_S3: ON ARROW_USE_GLOG: OFF ARROW_VERBOSE_THIRDPARTY_BUILD: OFF @@ -303,11 +341,12 @@ jobs: # -DBoost_NO_BOOST_CMAKE=ON BOOST_ROOT: "" CMAKE_ARGS: >- - -DARROW_PACKAGE_PREFIX=/mingw${{ matrix.mingw-n-bits }} + -DARROW_PACKAGE_PREFIX=/${{ matrix.msystem_lower}} -DBoost_NO_BOOST_CMAKE=ON # We can't use unity build because we don't have enough memory on # GitHub Actions. 
# CMAKE_UNITY_BUILD: ON + GTest_SOURCE: BUNDLED steps: - name: Disable Crash Dialogs run: | @@ -324,17 +363,17 @@ jobs: submodules: recursive - uses: msys2/setup-msys2@v2 with: - msystem: MINGW${{ matrix.mingw-n-bits }} + msystem: ${{ matrix.msystem_upper }} update: true - name: Setup MSYS2 shell: msys2 {0} run: ci/scripts/msys2_setup.sh cpp - name: Cache ccache - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ccache - key: cpp-ccache-mingw${{ matrix.mingw-n-bits }}-${{ hashFiles('cpp/**') }} - restore-keys: cpp-ccache-mingw${{ matrix.mingw-n-bits }}- + key: cpp-ccache-${{ matrix.msystem_lower}}-${{ hashFiles('cpp/**') }} + restore-keys: cpp-ccache-${{ matrix.msystem_lower}}- - name: Build shell: msys2 {0} run: | @@ -351,23 +390,13 @@ jobs: --output-document /usr/local/bin/minio.exe \ https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z chmod +x /usr/local/bin/minio.exe - - name: Cache Python wheels - uses: actions/cache@v2 - with: - path: "${{ env.PIP_CACHE_DIR }}" - key: cpp-wheels-mingw${{ matrix.mingw-n-bits }}-${{ hashFiles('ci/scripts/install_gcs_testbench.sh') }} - restore-keys: cpp-wheels-mingw${{ matrix.mingw-n-bits }}- - name: Install Google Cloud Storage Testbench - shell: msys2 {0} + shell: bash run: | ci/scripts/install_gcs_testbench.sh default + echo "PYTHON_BIN_DIR=$(cygpath --windows $(dirname $(which python3.exe)))" >> $GITHUB_ENV - name: Test shell: msys2 {0} run: | - python_version=$(python3 -c "import sys; print('.'.join(map(str, sys.version_info[0:2])))") - export PYTHONHOME="$(cygpath --windows ${MINGW_PREFIX})\lib\python${python_version}" - PYTHONPATH="${PYTHONHOME}" - PYTHONPATH="${PYTHONPATH};${PYTHONHOME}\lib-dynload" - PYTHONPATH="${PYTHONPATH};${PYTHONHOME}\site-packages" - export PYTHONPATH + PATH="$(cygpath --unix ${PYTHON_BIN_DIR}):${PATH}" ci/scripts/cpp_test.sh "$(pwd)" "$(pwd)/build" diff --git a/.github/workflows/csharp.yml b/.github/workflows/csharp.yml index 385c081cc6c..5968dded43c 100644 --- a/.github/workflows/csharp.yml +++ b/.github/workflows/csharp.yml @@ -33,6 +33,9 @@ concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} cancel-in-progress: true +permissions: + contents: read + jobs: ubuntu: @@ -46,7 +49,7 @@ jobs: dotnet: ['6.0.x'] steps: - name: Install C# - uses: actions/setup-dotnet@v1 + uses: actions/setup-dotnet@v2 with: dotnet-version: ${{ matrix.dotnet }} - name: Checkout Arrow @@ -74,7 +77,7 @@ jobs: dotnet: ['6.0.x'] steps: - name: Install C# - uses: actions/setup-dotnet@v1 + uses: actions/setup-dotnet@v2 with: dotnet-version: ${{ matrix.dotnet }} - name: Checkout Arrow @@ -91,7 +94,7 @@ jobs: run: ci/scripts/csharp_test.sh $(pwd) macos: - name: AMD64 MacOS 10.15 C# ${{ matrix.dotnet }} + name: AMD64 macOS 11 C# ${{ matrix.dotnet }} runs-on: macos-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 15 @@ -101,7 +104,7 @@ jobs: dotnet: ['6.0.x'] steps: - name: Install C# - uses: actions/setup-dotnet@v1 + uses: actions/setup-dotnet@v2 with: dotnet-version: ${{ matrix.dotnet }} - name: Checkout Arrow diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 0cff0724424..271a05979a5 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -26,9 +26,8 @@ concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} cancel-in-progress: true -env: - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ 
secrets.DOCKERHUB_TOKEN }} +permissions: + contents: read jobs: @@ -48,12 +47,18 @@ jobs: - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | sudo sysctl -w kernel.core_pattern="core.%e.%p" ulimit -c unlimited archery docker run -e GITHUB_ACTIONS=true ubuntu-lint - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} continue-on-error: true run: archery docker push ubuntu-lint @@ -93,8 +98,11 @@ jobs: - name: Install Dependencies shell: bash run: | + gem install test-unit pip install cython setuptools six pytest jira - name: Run Release Test + env: + ARROW_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} shell: bash run: | ci/scripts/release_test.sh $(pwd) diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index e39dd3f0f4d..1de6cf1b017 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -29,9 +29,15 @@ on: - edited - synchronize -# NOTE: not using the "cancel-in-progress" feature here as the group key -# does not have enough information for linking it to a particular PR +concurrency: + group: ${{ github.workflow }}-${{ github.repository }}-${{ github.event.number }} + cancel-in-progress: true +permissions: + contents: read + pull-requests: write + issues: write + jobs: process: name: Process @@ -41,9 +47,8 @@ jobs: - name: Comment JIRA link if: | - github.event_name == 'pull_request_target' && - (github.event.action == 'opened' || - github.event.action == 'edited') + (github.event.action == 'opened' || + github.event.action == 'edited') uses: actions/github-script@v3 with: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -53,9 +58,8 @@ jobs: - name: Check title if: | - github.event_name == 'pull_request_target' && - (github.event.action == 'opened' || - github.event.action == 'edited') + (github.event.action == 'opened' || + github.event.action == 'edited') uses: actions/github-script@v3 with: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -63,25 +67,23 @@ jobs: const script = require(`${process.env.GITHUB_WORKSPACE}/.github/workflows/dev_pr/title_check.js`); script({github, context}); - - name: Check Jira Issue + - name: Check Issue if: | - github.event_name == 'pull_request_target' && - (github.event.action == 'opened' || - github.event.action == 'edited') + (github.event.action == 'opened' || + github.event.action == 'edited') uses: actions/github-script@v3 with: debug: true github-token: ${{ secrets.GITHUB_TOKEN }} script: | - const script = require(`${process.env.GITHUB_WORKSPACE}/.github/workflows/dev_pr/jira_check.js`); + const script = require(`${process.env.GITHUB_WORKSPACE}/.github/workflows/dev_pr/issue_check.js`); script({github, context}); - name: Assign GitHub labels if: | - github.event_name == 'pull_request_target' && - (github.event.action == 'opened' || - github.event.action == 'synchronize') - uses: actions/labeler@2.2.0 + (github.event.action == 'opened' || + github.event.action == 'synchronize') + uses: actions/labeler@v4 with: repo-token: ${{ secrets.GITHUB_TOKEN }} configuration-path: .github/workflows/dev_pr/labeler.yml diff --git a/.github/workflows/dev_pr/helpers.js b/.github/workflows/dev_pr/helpers.js index d5f275d27f1..634a0cbce8b 100644 --- a/.github/workflows/dev_pr/helpers.js +++ 
b/.github/workflows/dev_pr/helpers.js @@ -18,34 +18,33 @@ const https = require('https'); /** - * Given the title of a PullRequest return the ID of the JIRA issue + * Given the title of a PullRequest return the Issue + * * @param {String} title - * @returns {String} the ID of the associated JIRA issue + * @returns {Issue} or null if no issue detected. + * + * @typedef {Object} Issue + * @property {string} kind - The kind of issue: minor, jira or github + * @property {string} id - The id of the issue: + * PARQUET-XXXX for jira + * The numeric issue id for github */ -function detectJIRAID(title) { +function detectIssue(title) { if (!title) { return null; } - const matched = /^(WIP:?\s*)?((ARROW|PARQUET)-\d+)/.exec(title); - if (!matched) { - return null; + if (title.startsWith("MINOR: ")) { + return {"kind": "minor"}; } - return matched[2]; -} - -/** - * Given the title of a PullRequest checks if it contains a JIRA issue ID - * @param {String} title - * @returns {Boolean} true if it starts with a JIRA ID or MINOR: - */ -function haveJIRAID(title) { - if (!title) { - return false; + const matched_jira = /^(WIP:?\s*)?((PARQUET)-\d+)/.exec(title); + if (matched_jira) { + return {"kind": "jira", "id": matched_jira[2]}; } - if (title.startsWith("MINOR: ")) { - return true; + const matched_gh = /^(WIP:?\s*)?GH-(\d+)/.exec(title); + if (matched_gh) { + return {"kind": "github", "id": matched_gh[2]}; } - return /^(WIP:?\s*)?(ARROW|PARQUET)-\d+/.test(title); + return null; } /** @@ -69,8 +68,27 @@ async function getJiraInfo(jiraID) { }); } +/** + * Retrieves information about a GitHub issue. + * @param {String} issueID + * @returns {Object} the information about a GitHub issue. + */ + async function getGitHubInfo(github, context, issueID, pullRequestNumber) { + try { + const response = await github.issues.get({ + issue_number: issueID, + owner: context.repo.owner, + repo: context.repo.repo, + }) + return response.data + } catch (error) { + console.log(`${error.name}: ${error.code}`); + return false + } +} + module.exports = { - detectJIRAID, - haveJIRAID, - getJiraInfo + detectIssue, + getJiraInfo, + getGitHubInfo }; \ No newline at end of file diff --git a/.github/workflows/dev_pr/jira_check.js b/.github/workflows/dev_pr/issue_check.js similarity index 51% rename from .github/workflows/dev_pr/jira_check.js rename to .github/workflows/dev_pr/issue_check.js index 3c294f8c7a0..3dff23f53ed 100644 --- a/.github/workflows/dev_pr/jira_check.js +++ b/.github/workflows/dev_pr/issue_check.js @@ -17,6 +17,16 @@ const helpers = require("./helpers.js"); +/** + * Performs checks on the JIRA Issue: + * - The issue is started in JIRA. + * - The issue contains components. + * + * @param {Object} github + * @param {Object} context + * @param {String} pullRequestNumber + * @param {String} jiraID + */ async function verifyJIRAIssue(github, context, pullRequestNumber, jiraID) { const ticketInfo = await helpers.getJiraInfo(jiraID); if(!ticketInfo["fields"]["components"].length) { @@ -30,6 +40,13 @@ async function verifyJIRAIssue(github, context, pullRequestNumber, jiraID) { } } +/** + * Adds a comment to add components on the JIRA ticket. 
+ * + * @param {Object} github + * @param {Object} context + * @param {String} pullRequestNumber + */ async function commentMissingComponents(github, context, pullRequestNumber) { const {data: comments} = await github.issues.listComments({ owner: context.repo.owner, @@ -54,6 +71,13 @@ async function commentMissingComponents(github, context, pullRequestNumber) { } } +/** + * Adds a comment to start the ticket in JIRA. + * + * @param {Object} github + * @param {Object} context + * @param {String} pullRequestNumber + */ async function commentNotStartedTicket(github, context, pullRequestNumber) { const {data: comments} = await github.issues.listComments({ owner: context.repo.owner, @@ -78,11 +102,72 @@ async function commentNotStartedTicket(github, context, pullRequestNumber) { } } +/** + * Assigns the Github Issue to the PR creator. + * + * @param {Object} github + * @param {Object} context + * @param {String} pullRequestNumber + * @param {Object} issueInfo + */ +async function assignGitHubIssue(github, context, pullRequestNumber, issueInfo) { + await github.issues.addAssignees({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueInfo.number, + assignees: context.payload.pull_request.user.login + }); + await github.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: pullRequestNumber, + body: ":warning: GitHub issue #" + issueInfo.number + " **has been automatically assigned in GitHub** to PR creator." + }); +} + +/** + * Performs checks on the GitHub Issue: + * - The issue is assigned to someone. If not assign it gets automatically + * assigned to the PR creator. + * - The issue contains any label. + * + * @param {Object} github + * @param {Object} context + * @param {String} pullRequestNumber + * @param {String} issueID + */ +async function verifyGitHubIssue(github, context, pullRequestNumber, issueID) { + const issueInfo = await helpers.getGitHubInfo(github, context, issueID, pullRequestNumber); + if (!issueInfo) { + await github.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: pullRequestNumber, + body: ":x: GitHub issue #" + issueID + " could not be retrieved." + }) + } + if (!issueInfo.assignees.length) { + await assignGitHubIssue(github, context, pullRequestNumber, issueInfo); + } + if(!issueInfo.labels.filter((label) => label.name.startsWith("Component:")).length) { + await github.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: pullRequestNumber, + body: ":warning: GitHub issue #" + issueID + " **has no components**, please add labels for components." + }) + } +} + module.exports = async ({github, context}) => { const pullRequestNumber = context.payload.number; const title = context.payload.pull_request.title; - const jiraID = helpers.detectJIRAID(title); - if (jiraID) { - await verifyJIRAIssue(github, context, pullRequestNumber, jiraID); + const issue = helpers.detectIssue(title) + if (issue){ + if (issue.kind == "jira") { + await verifyJIRAIssue(github, context, pullRequestNumber, issue.id); + } else if(issue.kind == "github") { + await verifyGitHubIssue(github, context, pullRequestNumber, issue.id); + } } }; diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index 05d16486b76..a9a13e82a9d 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -15,53 +15,53 @@ # specific language governing permissions and limitations # under the License. 
-"lang-c++": +"Component: C++": - cpp/**/* -lang-c-glib: +"Component: GLib": - c_glib/**/* -lang-csharp: +"Component: C#": - csharp/**/* -lang-go: +"Component: Go": - go/**/* -lang-java: +"Component: Java": - java/**/* -lang-js: +"Component: JavaScript": - js/**/* -lang-matlab: +"Component: MATLAB": - matlab/**/* -lang-python: +"Component: Python": - python/**/* -lang-R: +"Component: R": - r/**/* -lang-ruby: +"Component: Ruby": - ruby/**/* -flight: +"Component: FlightRPC": - cpp/src/arrow/flight/**/* - r/R/flight.* - python/pyarrow/*flight.* -gandiva: +"Component: C++ - Gandiva": - c_glib/gandiva-glib/**/* - cpp/src/gandiva/**/* - ruby/red-gandiva/**/* - python/pyarrow/gandiva.* -parquet: +"Component: Parquet": - c_glib/parquet-glib/**/* - cpp/src/parquet/**/* - r/R/parquet.* - ruby/red-parquet/**/* -docs: +"Component: Documentation": - docs/**/* - "**/*.{md, rst, Rmd, Rd}" diff --git a/.github/workflows/dev_pr/link.js b/.github/workflows/dev_pr/link.js index 404ff46436f..1fbd0447175 100644 --- a/.github/workflows/dev_pr/link.js +++ b/.github/workflows/dev_pr/link.js @@ -18,7 +18,16 @@ const helpers = require("./helpers.js"); -async function haveComment(github, context, pullRequestNumber, body) { +/** + * Checks whether message is present on Pull Request list of comments. + * + * @param {Object} github + * @param {Object} context + * @param {String} pullRequestNumber + * @param {String} message + * @returns {Boolean} true if message was found. + */ +async function haveComment(github, context, pullRequestNumber, message) { const options = { owner: context.repo.owner, repo: context.repo.repo, @@ -27,7 +36,7 @@ async function haveComment(github, context, pullRequestNumber, body) { }; while (true) { const response = await github.issues.listComments(options); - if (response.data.some(comment => comment.body === body)) { + if (response.data.some(comment => comment.body === message)) { return true; } if (!/;\s*rel="next"/.test(response.headers.link || "")) { @@ -38,24 +47,70 @@ async function haveComment(github, context, pullRequestNumber, body) { return false; } +/** + * Adds a comment on the Pull Request linking the JIRA issue. + * + * @param {Object} github + * @param {Object} context + * @param {String} pullRequestNumber + * @param {String} jiraID + */ async function commentJIRAURL(github, context, pullRequestNumber, jiraID) { + const issueInfo = await helpers.getJiraInfo(jiraID); const jiraURL = `https://issues.apache.org/jira/browse/${jiraID}`; if (await haveComment(github, context, pullRequestNumber, jiraURL)) { return; } - await github.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: pullRequestNumber, - body: jiraURL - }); + if (issueInfo){ + await github.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: pullRequestNumber, + body: jiraURL + }); + } +} + +/** + * Adds a comment on the Pull Request linking the GitHub issue. 
+ * + * @param {Object} github + * @param {Object} context + * @param {String} pullRequestNumber - String containing numeric id of PR + * @param {String} issueID - String containing numeric id of the github issue + */ +async function commentGitHubURL(github, context, pullRequestNumber, issueID) { + // Make the call to ensure issue exists before adding comment + const issueInfo = await helpers.getGitHubInfo(github, context, issueID, pullRequestNumber); + const message = "* Closes: #" + issueInfo.number + if (await haveComment(github, context, pullRequestNumber, message)) { + return; + } + if (issueInfo){ + await github.pulls.update({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: pullRequestNumber, + body: (context.payload.pull_request.body || "") + "\n" + message + }); + await github.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: pullRequestNumber, + body: message + }); + } } module.exports = async ({github, context}) => { const pullRequestNumber = context.payload.number; const title = context.payload.pull_request.title; - const jiraID = helpers.detectJIRAID(title); - if (jiraID) { - await commentJIRAURL(github, context, pullRequestNumber, jiraID); + const issue = helpers.detectIssue(title); + if (issue){ + if (issue.kind == "jira") { + await commentJIRAURL(github, context, pullRequestNumber, issue.id); + } else if (issue.kind == "github") { + await commentGitHubURL(github, context, pullRequestNumber, issue.id); + } } }; diff --git a/.github/workflows/dev_pr/title_check.js b/.github/workflows/dev_pr/title_check.js index 392108269d8..1b7a6c5c888 100644 --- a/.github/workflows/dev_pr/title_check.js +++ b/.github/workflows/dev_pr/title_check.js @@ -18,7 +18,7 @@ const fs = require("fs"); const helpers = require("./helpers.js"); -async function commentOpenJIRAIssue(github, context, pullRequestNumber) { +async function commentOpenGitHubIssue(github, context, pullRequestNumber) { const {data: comments} = await github.issues.listComments({ owner: context.repo.owner, repo: context.repo.repo, @@ -41,7 +41,8 @@ async function commentOpenJIRAIssue(github, context, pullRequestNumber) { module.exports = async ({github, context}) => { const pullRequestNumber = context.payload.number; const title = context.payload.pull_request.title; - if (!helpers.haveJIRAID(title)) { - await commentOpenJIRAIssue(github, context, pullRequestNumber); + const issue = helpers.detectIssue(title) + if (!issue) { + await commentOpenGitHubIssue(github, context, pullRequestNumber); } }; diff --git a/.github/workflows/dev_pr/title_check.md b/.github/workflows/dev_pr/title_check.md index 1db9fcf637b..479a1f76c7d 100644 --- a/.github/workflows/dev_pr/title_check.md +++ b/.github/workflows/dev_pr/title_check.md @@ -19,18 +19,22 @@ Thanks for opening a pull request! -If this is not a [minor PR](https://github.com/apache/arrow/blob/master/CONTRIBUTING.md#Minor-Fixes). Could you open an issue for this pull request on JIRA? https://issues.apache.org/jira/browse/ARROW +If this is not a [minor PR](https://github.com/apache/arrow/blob/master/CONTRIBUTING.md#Minor-Fixes). Could you open an issue for this pull request on GitHub? https://github.com/apache/arrow/issues/new/choose -Opening JIRAs ahead of time contributes to the [Openness](http://theapacheway.com/open/#:~:text=Openness%20allows%20new%20users%20the,must%20happen%20in%20the%20open.) of the Apache Arrow project. 
+Opening GitHub issues ahead of time contributes to the [Openness](http://theapacheway.com/open/#:~:text=Openness%20allows%20new%20users%20the,must%20happen%20in%20the%20open.) of the Apache Arrow project. -Then could you also rename pull request title in the following format? +Then could you also rename the pull request title in the following format? - ARROW-${JIRA_ID}: [${COMPONENT}] ${SUMMARY} + GH-${GITHUB_ISSUE_ID}: [${COMPONENT}] ${SUMMARY} or MINOR: [${COMPONENT}] ${SUMMARY} +In the case of PARQUET issues on JIRA the title also supports: + + PARQUET-${JIRA_ISSUE_ID}: [${COMPONENT}] ${SUMMARY} + See also: * [Other pull requests](https://github.com/apache/arrow/pulls/) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 07fc2968143..27968ad28c8 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -20,11 +20,12 @@ name: Docs on: push: +permissions: + contents: read + env: ARROW_ENABLE_TIMING_TESTS: OFF DOCKER_VOLUME_PREFIX: ".docker/" - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} jobs: @@ -41,7 +42,7 @@ jobs: with: fetch-depth: 0 - name: Cache Docker Volumes - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: .docker key: ubuntu-docs-${{ hashFiles('cpp/**') }} @@ -53,8 +54,14 @@ jobs: - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: archery docker run ubuntu-docs - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} continue-on-error: true run: archery docker push ubuntu-docs diff --git a/.github/workflows/docs_light.yml b/.github/workflows/docs_light.yml index eee778e6999..ed8cd12ca36 100644 --- a/.github/workflows/docs_light.yml +++ b/.github/workflows/docs_light.yml @@ -28,11 +28,12 @@ on: - 'ci/scripts/cpp_build.sh' - 'ci/scripts/python_build.sh' +permissions: + contents: read + env: ARROW_ENABLE_TIMING_TESTS: OFF DOCKER_VOLUME_PREFIX: ".docker/" - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} jobs: @@ -49,7 +50,7 @@ jobs: with: fetch-depth: 0 - name: Cache Docker Volumes - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: .docker key: conda-docs-${{ hashFiles('cpp/**') }} @@ -61,4 +62,7 @@ jobs: - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: archery docker run conda-python-docs diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index cbbe067007d..2e973a2c709 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -36,9 +36,8 @@ concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} cancel-in-progress: true -env: - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} +permissions: + contents: read jobs: @@ -46,13 +45,19 @@ jobs: name: AMD64 Debian 11 Go ${{ matrix.go }} runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 15 + timeout-minutes: 60 strategy: fail-fast: false matrix: - go: [1.16] + go: [1.17, 1.18] + 
include: + - go: 1.17 + staticcheck: v0.2.2 + - go: 1.18 + staticcheck: latest env: GO: ${{ matrix.go }} + STATICCHECK: ${{ matrix.staticcheck }} steps: - name: Checkout Arrow uses: actions/checkout@v3 @@ -62,15 +67,38 @@ jobs: - name: Setup Python uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: '3.10' - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: archery docker run debian-go - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} continue-on-error: true run: archery docker push debian-go + - name: Install Go ${{ matrix.go }} for Benchmarks + if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + uses: actions/setup-go@v3 + with: + go-version: ${{ matrix.go }} + cache: true + cache-dependency-path: go/go.sum + - name: Run Benchmarks + if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + env: + CONBENCH_URL: https://conbench.ursa.dev + CONBENCH_EMAIL: ${{ secrets.CONBENCH_EMAIL }} + CONBENCH_PASSWORD: ${{ secrets.CONBENCH_PASS }} + CONBENCH_REF: ${{ github.ref_name }} + run: | + pip install benchadapt@git+https://github.com/conbench/conbench.git@main#subdirectory=benchadapt/python + python ci/scripts/go_bench_adapt.py docker_cgo: name: AMD64 Debian 11 GO ${{ matrix.go }} - CGO @@ -80,9 +108,15 @@ jobs: strategy: fail-fast: false matrix: - go: [1.16] + go: [1.17, 1.18] + include: + - go: 1.17 + staticcheck: v0.2.2 + - go: 1.18 + staticcheck: latest env: GO: ${{ matrix.go }} + STATICCHECK: ${{ matrix.staticcheck }} steps: - name: Checkout Arrow uses: actions/checkout@v3 @@ -96,9 +130,15 @@ jobs: - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: archery docker run debian-go-cgo - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} continue-on-error: true run: archery docker push debian-go-cgo @@ -111,9 +151,15 @@ jobs: strategy: fail-fast: false matrix: - go: [1.16] + go: [1.17, 1.18] + include: + - go: 1.17 + staticcheck: v0.2.2 + - go: 1.18 + staticcheck: latest env: GO: ${{ matrix.go }} + STATICCHECK: ${{ matrix.staticcheck }} steps: - name: Checkout Arrow uses: actions/checkout@v3 @@ -126,9 +172,15 @@ jobs: - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: archery docker run debian-go-cgo-python - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} continue-on-error: true run: archery docker push debian-go-cgo-python @@ -140,19 +192,26 @@ jobs: strategy: fail-fast: false matrix: - go: [1.16] + go: [1.17, 1.18] + include: + - go: 1.17 + staticcheck: v0.2.2 + - go: 1.18 + staticcheck: latest steps: - - name: Install go - 
uses: actions/setup-go@v1 - with: - go-version: ${{ matrix.go }} - name: Checkout Arrow uses: actions/checkout@v3 with: fetch-depth: 0 submodules: recursive + - name: Install go + uses: actions/setup-go@v3 + with: + go-version: ${{ matrix.go }} + cache: true + cache-dependency-path: go/go.sum - name: Install staticcheck - run: go install honnef.co/go/tools/cmd/staticcheck@v0.2.2 + run: go install honnef.co/go/tools/cmd/staticcheck@${{ matrix.staticcheck }} - name: Build shell: bash run: ci/scripts/go_build.sh $(pwd) @@ -161,59 +220,90 @@ jobs: run: ci/scripts/go_test.sh $(pwd) macos: - name: AMD64 MacOS 10.15 Go ${{ matrix.go }} + name: AMD64 macOS 11 Go ${{ matrix.go }} runs-on: macos-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 15 + timeout-minutes: 60 strategy: fail-fast: false matrix: - go: [1.16] + go: [1.17, 1.18] + include: + - go: 1.17 + staticcheck: v0.2.2 + - go: 1.18 + staticcheck: latest steps: - - name: Install go - uses: actions/setup-go@v1 - with: - go-version: ${{ matrix.go }} - name: Checkout Arrow uses: actions/checkout@v3 with: fetch-depth: 0 submodules: recursive + - name: Install go + uses: actions/setup-go@v3 + with: + go-version: ${{ matrix.go }} + cache: true + cache-dependency-path: go/go.sum - name: Install staticcheck - run: go install honnef.co/go/tools/cmd/staticcheck@v0.2.2 + run: go install honnef.co/go/tools/cmd/staticcheck@${{ matrix.staticcheck }} - name: Build shell: bash run: ci/scripts/go_build.sh $(pwd) - name: Test shell: bash run: ci/scripts/go_test.sh $(pwd) + - name: Setup Python + if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + uses: actions/setup-python@v4 + with: + python-version: '3.10' + - name: Run Benchmarks + if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + shell: bash + env: + CONBENCH_URL: 'https://conbench.ursa.dev' + CONBENCH_EMAIL: ${{ secrets.CONBENCH_EMAIL }} + CONBENCH_PASSWORD: ${{ secrets.CONBENCH_PASS }} + CONBENCH_REF: ${{ github.ref_name }} + run: | + pip install benchadapt@git+https://github.com/conbench/conbench.git@main#subdirectory=benchadapt/python + python ci/scripts/go_bench_adapt.py + macos-cgo: - name: AMD64 MacOS 10.15 Go ${{ matrix.go }} - CGO + name: AMD64 macOS 11 Go ${{ matrix.go }} - CGO runs-on: macos-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 60 strategy: fail-fast: false matrix: - go: [1.16] + go: [1.17, 1.18] + include: + - go: 1.17 + staticcheck: v0.2.2 + - go: 1.18 + staticcheck: latest env: ARROW_GO_TESTCGO: "1" steps: - - name: Install go - uses: actions/setup-go@v1 - with: - go-version: ${{ matrix.go }} - name: Checkout Arrow uses: actions/checkout@v3 with: fetch-depth: 0 submodules: recursive + - name: Install go + uses: actions/setup-go@v3 + with: + go-version: ${{ matrix.go }} + cache: true + cache-dependency-path: go/go.sum - name: Brew Install Arrow shell: bash run: brew install apache-arrow - name: Install staticcheck - run: go install honnef.co/go/tools/cmd/staticcheck@v0.2.2 + run: go install honnef.co/go/tools/cmd/staticcheck@${{ matrix.staticcheck }} - name: Build shell: bash run: ci/scripts/go_build.sh $(pwd) @@ -264,11 +354,13 @@ jobs: echo "CGO_LDFLAGS=-g -O2 -L$(cygpath --windows ${MINGW_PREFIX}/lib) -L$(cygpath --windows ${MINGW_PREFIX}/bin)" >> $GITHUB_ENV echo "MINGW_PREFIX=$(cygpath --windows ${MINGW_PREFIX})" >> $GITHUB_ENV - name: Install go - uses: actions/setup-go@v2 + uses: actions/setup-go@v3 with: - go-version: '1.17' + 
go-version: '1.18' + cache: true + cache-dependency-path: go/go.sum - name: Install staticcheck - run: go install honnef.co/go/tools/cmd/staticcheck@v0.2.2 + run: go install honnef.co/go/tools/cmd/staticcheck@latest - name: Build shell: bash run: ci/scripts/go_build.sh $(pwd) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 566d43db7a9..e94eb764fd6 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -45,10 +45,11 @@ concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} cancel-in-progress: true +permissions: + contents: read + env: DOCKER_VOLUME_PREFIX: ".docker/" - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} jobs: @@ -69,7 +70,7 @@ jobs: repository: apache/arrow-rs path: rust - name: Cache Docker Volumes - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: .docker key: conda-${{ hashFiles('cpp/**') }} @@ -81,8 +82,18 @@ jobs: - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build - run: archery docker run -e ARCHERY_INTEGRATION_WITH_RUST=1 conda-integration + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} + run: > + archery docker run \ + -e ARCHERY_DEFAULT_BRANCH=${{ github.event.repository.default_branch }} \ + -e ARCHERY_INTEGRATION_WITH_RUST=1 \ + conda-integration - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} continue-on-error: true run: archery docker push conda-integration diff --git a/.github/workflows/issue_bot.yml b/.github/workflows/issue_bot.yml new file mode 100644 index 00000000000..d513f79c9fa --- /dev/null +++ b/.github/workflows/issue_bot.yml @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +name: Issue Bot + +on: + issues: + types: + - opened + +permissions: + contents: read + issues: write + +jobs: + label_components: + name: Label Components + if: github.event.issue.pull_request == null + runs-on: ubuntu-latest + steps: + - uses: actions/github-script@v6 + with: + script: | + let split_body = context.payload.issue.body.split('### Component(s)'); + if (split_body.length != 2) throw new Error('No components found!'); + + let component_labels = split_body[1] + .split(',') + .map(component => component.trim()) + .map(component => "Component: " + component); + + let repo_labels = await github.rest.issues.listLabelsForRepo({ + "owner": context.repo.owner, + "repo": context.repo.repo, + }); + + // this removes non-existent labels + component_labels = component_labels.filter( + label => repo_labels.data.some(repo_label => repo_label.name === label) + ); + + if (component_labels.length == 0) throw new Error('No components found!'); + + await github.rest.issues.addLabels({ + "owner": context.repo.owner, + "repo": context.repo.repo, + "issue_number": context.payload.issue.number, + "labels": component_labels, + }); \ No newline at end of file diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index 1cba0104899..86b5799a013 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -39,10 +39,11 @@ concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} cancel-in-progress: true +permissions: + contents: read + env: DOCKER_VOLUME_PREFIX: ".docker/" - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} jobs: @@ -82,7 +83,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Cache Docker Volumes - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: .docker key: maven-${{ hashFiles('java/**') }} @@ -94,14 +95,20 @@ jobs: - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: archery docker run ${{ matrix.image }} - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} continue-on-error: true run: archery docker push ${{ matrix.image }} macos: - name: AMD64 MacOS 10.15 Java JDK ${{ matrix.jdk }} + name: AMD64 macOS 11 Java JDK ${{ matrix.jdk }} runs-on: macos-latest if: github.event_name == 'push' timeout-minutes: 30 @@ -111,9 +118,37 @@ jobs: jdk: [11] steps: - name: Set up Java - uses: actions/setup-java@v1 + uses: actions/setup-java@v3 + with: + distribution: 'zulu' + java-version: ${{ matrix.jdk }} + - name: Checkout Arrow + uses: actions/checkout@v3 + with: + fetch-depth: 0 + submodules: recursive + - name: Build + shell: bash + run: ci/scripts/java_build.sh $(pwd) $(pwd)/build + - name: Test + shell: bash + run: ci/scripts/java_test.sh $(pwd) $(pwd)/build + + windows: + name: AMD64 Windows Server 2022 Java JDK ${{ matrix.jdk }} + runs-on: windows-latest + if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + jdk: [11] + steps: + - name: Set up Java + uses: actions/setup-java@v3 with: java-version: ${{ matrix.jdk }} + distribution: 'temurin' - name: Checkout Arrow uses: actions/checkout@v3 with: diff --git a/.github/workflows/java_jni.yml 
b/.github/workflows/java_jni.yml index 07cc3b12652..ee80c2c9d96 100644 --- a/.github/workflows/java_jni.yml +++ b/.github/workflows/java_jni.yml @@ -39,15 +39,16 @@ concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} cancel-in-progress: true +permissions: + contents: read + env: DOCKER_VOLUME_PREFIX: ".docker/" - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} jobs: docker: - name: AMD64 Debian 9 Java JNI (Gandiva, Plasma, ORC, Dataset) + name: AMD64 manylinux2014 Java JNI runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 90 @@ -58,11 +59,11 @@ jobs: fetch-depth: 0 submodules: recursive - name: Cache Docker Volumes - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: .docker - key: maven-${{ hashFiles('java/**') }} - restore-keys: maven- + key: java-jni-manylinux-2014-${{ hashFiles('cpp/**', 'java/**') }} + restore-keys: java-jni-manylinux-2014- - name: Setup Python uses: actions/setup-python@v4 with: @@ -70,14 +71,20 @@ jobs: - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build - run: archery docker run debian-java-jni + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} + run: archery docker run java-jni-manylinux-2014 - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} continue-on-error: true - run: archery docker push debian-java-jni + run: archery docker push java-jni-manylinux-2014 docker_integration_python: - name: AMD64 Debian 9 Java C Data Interface Integration + name: AMD64 Conda Java C Data Interface Integration runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 90 @@ -88,7 +95,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Cache Docker Volumes - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: .docker key: maven-${{ hashFiles('java/**') }} @@ -100,8 +107,14 @@ jobs: - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: archery docker run conda-python-java-integration - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} continue-on-error: true run: archery docker push conda-python-java-integration diff --git a/.github/workflows/java_nightly.yml b/.github/workflows/java_nightly.yml index 17a49990027..24d8c7c54ee 100644 --- a/.github/workflows/java_nightly.yml +++ b/.github/workflows/java_nightly.yml @@ -24,10 +24,19 @@ on: description: Job prefix to use. required: false default: '' + keep: + description: Number of versions to keep. 
+ required: false + default: 14 schedule: - cron: '0 14 * * *' + +permissions: + contents: read + jobs: upload: + if: github.repository == 'apache/arrow' env: PREFIX: ${{ github.event.inputs.prefix || ''}} CROSSBOW_GITHUB_TOKEN: ${{ github.token }} @@ -64,30 +73,73 @@ jobs: fi echo $PREFIX archery crossbow download-artifacts -f java-jars -t binaries $PREFIX + - name: Cache Repo + uses: actions/cache@v3 + with: + path: repo + key: java-nightly-${{ github.run_id }} + restore-keys: java-nightly + - name: Sync from Remote + uses: ./arrow/.github/actions/sync-nightlies + with: + switches: -avzh --update --delete --progress + local_path: repo + remote_path: ${{ secrets.NIGHTLIES_RSYNC_PATH }}/arrow/java + remote_host: ${{ secrets.NIGHTLIES_RSYNC_HOST }} + remote_port: ${{ secrets.NIGHTLIES_RSYNC_PORT }} + remote_user: ${{ secrets.NIGHTLIES_RSYNC_USER }} + remote_key: ${{ secrets.NIGHTLIES_RSYNC_KEY }} + remote_host_key: ${{ secrets.NIGHTLIES_RSYNC_HOST_KEY }} + - shell: bash + name: Show local repo sync from remote + run: | + for i in `ls -t repo/org/apache/arrow`; do + echo "- $i: $(find repo/org/apache/arrow/$i -mindepth 1 -maxdepth 1 -type d \ + | wc -l \ + | xargs) versions available" + done - shell: bash name: Build Repository run: | + DATE=$(date +%Y-%m-%d) if [ -z $PREFIX ]; then - PREFIX=nightly-packaging-$(date +%Y-%m-%d)-0 + PREFIX=nightly-packaging-${DATE}-0 fi - PATTERN_TO_GET_LIB_AND_VERSION='([a-z].+)-([0-9].[0-9].[0-9].dev[0-9]+)' + PATTERN_TO_GET_LIB_AND_VERSION='([a-z].+)-([0-9]+.[0-9]+.[0-9]+-SNAPSHOT)' mkdir -p repo/org/apache/arrow/ - for LIBRARY in $(ls binaries/$PREFIX/java-jars | grep -E '.jar|.pom' | grep dev); do + for LIBRARY in $(ls binaries/$PREFIX/java-jars | grep -E '.jar|.json|.pom|.xml' | grep SNAPSHOT); do [[ $LIBRARY =~ $PATTERN_TO_GET_LIB_AND_VERSION ]] mkdir -p repo/org/apache/arrow/${BASH_REMATCH[1]}/${BASH_REMATCH[2]} + mkdir -p repo/org/apache/arrow/${BASH_REMATCH[1]}/${DATE} + # Copy twice to maintain a latest snapshot and some earlier versions cp binaries/$PREFIX/java-jars/$LIBRARY repo/org/apache/arrow/${BASH_REMATCH[1]}/${BASH_REMATCH[2]} + touch repo/org/apache/arrow/${BASH_REMATCH[1]}/${BASH_REMATCH[2]} + cp binaries/$PREFIX/java-jars/$LIBRARY repo/org/apache/arrow/${BASH_REMATCH[1]}/${DATE} echo "Artifacts $LIBRARY configured" done + - name: Prune Repository + shell: bash + env: + KEEP: ${{ github.event.inputs.keep || 14 }} + run: | + for i in `ls -t repo/org/apache/arrow`; do + find repo/org/apache/arrow/$i -mindepth 1 -maxdepth 1 -type d -print0 \ + | xargs -0 ls -t -d \ + | tail -n +$((KEEP + 1)) \ + | xargs rm -rf + done - name: Show repo contents run: tree repo - - name: Upload Files + - name: Sync to Remote if: ${{ github.repository == 'apache/arrow' }} - uses: burnett01/rsync-deployments@5.2 + uses: ./arrow/.github/actions/sync-nightlies with: - switches: -avzr - path: repo/* + upload: true + switches: -avzh --update --delete --progress + local_path: repo remote_path: ${{ secrets.NIGHTLIES_RSYNC_PATH }}/arrow/java remote_host: ${{ secrets.NIGHTLIES_RSYNC_HOST }} remote_port: ${{ secrets.NIGHTLIES_RSYNC_PORT }} remote_user: ${{ secrets.NIGHTLIES_RSYNC_USER }} remote_key: ${{ secrets.NIGHTLIES_RSYNC_KEY }} + remote_host_key: ${{ secrets.NIGHTLIES_RSYNC_HOST_KEY }} diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index 18d54c5b4ef..239de36eee8 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -35,9 +35,8 @@ concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow 
}} cancel-in-progress: true -env: - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} +permissions: + contents: read jobs: @@ -58,17 +57,23 @@ jobs: - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | sudo sysctl -w kernel.core_pattern="core.%e.%p" ulimit -c unlimited archery docker run debian-js - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} continue-on-error: true run: archery docker push debian-js macos: - name: AMD64 MacOS 10.15 NodeJS ${{ matrix.node }} + name: AMD64 macOS 11 NodeJS ${{ matrix.node }} runs-on: macos-latest if: github.event_name == 'push' timeout-minutes: 60 @@ -82,7 +87,7 @@ jobs: with: fetch-depth: 0 - name: Install NodeJS - uses: actions/setup-node@v1 + uses: actions/setup-node@v3 with: node-version: ${{ matrix.node }} - name: Build @@ -106,7 +111,7 @@ jobs: with: fetch-depth: 0 - name: Install NodeJS - uses: actions/setup-node@v1 + uses: actions/setup-node@v3 with: node-version: ${{ matrix.node }} - name: Build diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml index 3780ba113ab..541ffcea831 100644 --- a/.github/workflows/matlab.yml +++ b/.github/workflows/matlab.yml @@ -35,6 +35,9 @@ concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} cancel-in-progress: true +permissions: + contents: read + jobs: ubuntu: @@ -69,11 +72,11 @@ jobs: with: select-by-folder: matlab/test macos: - name: AMD64 MacOS 10.15 MATLAB + name: AMD64 macOS 11 MATLAB runs-on: macos-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: - - name: Check out repository + - name: Check out repository uses: actions/checkout@v3 with: fetch-depth: 0 diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index fe834a55e6e..d6ab4006d64 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -35,10 +35,11 @@ concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} cancel-in-progress: true +permissions: + contents: read + env: DOCKER_VOLUME_PREFIX: ".docker/" - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} jobs: @@ -53,7 +54,7 @@ jobs: name: - conda-python-docs - conda-python-3.8-nopandas - - conda-python-3.7-pandas-0.23 + - conda-python-3.7-pandas-1.0 - conda-python-3.9-pandas-latest include: - name: conda-python-docs @@ -66,12 +67,12 @@ jobs: image: conda-python title: AMD64 Conda Python 3.8 Without Pandas python: 3.8 - - name: conda-python-3.7-pandas-0.23 + - name: conda-python-3.7-pandas-1.0 cache: conda-python-3.7 image: conda-python-pandas - title: AMD64 Conda Python 3.7 Pandas 0.23 + title: AMD64 Conda Python 3.7 Pandas 1.0 python: 3.7 - pandas: 0.23 + pandas: 1.0 numpy: 1.16 - name: conda-python-3.9-pandas-latest cache: conda-python-3.9 @@ -91,7 +92,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Cache Docker Volumes - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: .docker key: ${{ matrix.cache }}-${{ hashFiles('cpp/**') }} @@ -103,17 +104,23 @@ jobs: - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build + 
env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | sudo sysctl -w kernel.core_pattern="core.%e.%p" ulimit -c unlimited archery docker run ${{ matrix.image }} - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} continue-on-error: true run: archery docker push ${{ matrix.image }} macos: - name: AMD64 MacOS 10.15 Python 3 + name: AMD64 macOS 12 Python 3 runs-on: macos-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 60 @@ -139,29 +146,38 @@ jobs: ARROW_WITH_SNAPPY: ON ARROW_WITH_BROTLI: ON ARROW_BUILD_TESTS: OFF - CMAKE_ARGS: "-DPython3_EXECUTABLE=/usr/local/bin/python3" PYARROW_TEST_LARGE_MEMORY: ON + # Current oldest supported version according to https://endoflife.date/macos + MACOSX_DEPLOYMENT_TARGET: 10.15 steps: - name: Checkout Arrow uses: actions/checkout@v3 with: fetch-depth: 0 submodules: recursive + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' - name: Install Dependencies shell: bash run: | - rm -f /usr/local/bin/2to3 + rm -f /usr/local/bin/2to3* || : + rm -f /usr/local/bin/idle3* || : + rm -f /usr/local/bin/pydoc3* || : + rm -f /usr/local/bin/python3* || : + rm -f /usr/local/bin/python3-config || : brew update --preinstall brew install --overwrite git brew bundle --file=cpp/Brewfile brew install coreutils - python3 -mpip install \ + python -m pip install \ -r python/requirements-build.txt \ -r python/requirements-test.txt - name: Build shell: bash run: | - export PYTHON=python3 + python -m pip install wheel ci/scripts/cpp_build.sh $(pwd) $(pwd)/build ci/scripts/python_build.sh $(pwd) $(pwd)/build - name: Test diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 4f706e3e5b1..e7b1ee06e97 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -43,10 +43,11 @@ concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} cancel-in-progress: true +permissions: + contents: read + env: DOCKER_VOLUME_PREFIX: ".docker/" - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} jobs: ubuntu: @@ -59,7 +60,7 @@ jobs: matrix: r: ["4.2"] ubuntu: [20.04] - force-tests: ["true", "false"] + force-tests: ["true"] env: R: ${{ matrix.r }} UBUNTU: ${{ matrix.ubuntu }} @@ -68,8 +69,9 @@ jobs: uses: actions/checkout@v3 with: fetch-depth: 0 + submodules: recursive - name: Cache Docker Volumes - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: .docker # As this key is identical on both matrix builds only one will be able to successfully cache, @@ -87,6 +89,9 @@ jobs: - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | sudo sysctl -w kernel.core_pattern="core.%e.%p" ulimit -c unlimited @@ -101,12 +106,15 @@ jobs: if: always() - name: Save the test output if: always() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: test-output path: r/check/arrow.Rcheck/tests/testthat.Rout* - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ 
secrets.DOCKERHUB_TOKEN }} continue-on-error: true run: archery docker push ubuntu-r @@ -119,17 +127,18 @@ jobs: fail-fast: false matrix: config: - - { org: "rstudio", image: "r-base", tag: "4.0-centos7" } - - { org: "rhub", image: "debian-gcc-devel", tag: "latest" } + - { org: "rhub", image: "debian-gcc-devel", tag: "latest", devtoolset: "" } env: R_ORG: ${{ matrix.config.org }} R_IMAGE: ${{ matrix.config.image }} R_TAG: ${{ matrix.config.tag }} + DEVTOOLSET_VERSION: ${{ matrix.config.devtoolset }} steps: - name: Checkout Arrow uses: actions/checkout@v3 with: fetch-depth: 0 + submodules: recursive - name: Setup Python uses: actions/setup-python@v4 with: @@ -137,6 +146,9 @@ jobs: - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | sudo sysctl -w kernel.core_pattern="core.%e.%p" ulimit -c unlimited @@ -152,12 +164,15 @@ jobs: if: always() - name: Save the test output if: always() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: test-output path: r/check/arrow.Rcheck/tests/testthat.Rout* - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} continue-on-error: true run: archery docker push r @@ -170,10 +185,6 @@ jobs: fail-fast: false matrix: config: - - { rtools: 35, arch: 'mingw32' } - - { rtools: 35, arch: 'mingw64' } - - { rtools: 40, arch: 'mingw32' } - - { rtools: 40, arch: 'mingw64' } - { rtools: 40, arch: 'ucrt64' } steps: - run: git config --global core.autocrlf false @@ -186,51 +197,32 @@ jobs: run: | ci/scripts/ccache_setup.sh echo "CCACHE_DIR=$(cygpath --absolute --windows ccache)" >> $GITHUB_ENV - # We must enable actions/cache before r-lib/actions/setup-r to ensure - # using system tar instead of tar provided by Rtools. - # We can use tar provided by Rtools when we drop support for Rtools 3.5. - # Because Rtools 4.0 or later has zstd. actions/cache requires zstd - # when tar is GNU tar. 
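Taken together, the workflow hunks above all apply the same hardening pattern: the Docker Hub credentials are dropped from the workflow-level `env`, a top-level `permissions: contents: read` block restricts the default `GITHUB_TOKEN`, and `ARCHERY_DOCKER_USER`/`ARCHERY_DOCKER_PASSWORD` are injected only on the individual `Execute Docker Build` and `Docker Push` steps that actually talk to Docker Hub (with `actions/cache`, `actions/setup-node`, `actions/upload-artifact` and friends moved to their v3 releases along the way). A condensed sketch of the resulting workflow shape, with a placeholder job and image name rather than any real one from the patch, is:

```yaml
# Condensed illustration of the pattern; "example" and "some-image" are
# placeholders, not names taken from the patch.
permissions:
  contents: read                     # default token is read-only

env:
  DOCKER_VOLUME_PREFIX: ".docker/"   # no registry secrets at workflow scope

jobs:
  example:
    runs-on: ubuntu-latest
    steps:
      # checkout / setup-python / archery install steps omitted for brevity
      - name: Cache Docker Volumes
        uses: actions/cache@v3
        with:
          path: .docker
          key: example-${{ hashFiles('cpp/**') }}
      - name: Execute Docker Build
        env:                         # secrets visible to this step only
          ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
          ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
        run: archery docker run some-image
      - name: Docker Push
        if: success() && github.event_name == 'push' && github.repository == 'apache/arrow'
        continue-on-error: true
        env:
          ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
          ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
        run: archery docker push some-image
```

Because a step-level `env` block is only exported for that one step, the registry credentials no longer reach the checkout, dependency-install, or test steps of these jobs.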
- name: Cache ccache - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ccache key: r-${{ matrix.config.rtools }}-ccache-mingw-${{ matrix.config.arch }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h)') }}-${{ github.run_id }} restore-keys: | r-${{ matrix.config.rtools }}-ccache-mingw-${{ matrix.config.arch }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h)') }}- r-${{ matrix.config.rtools }}-ccache-mingw-${{ matrix.config.arch }}- - # We use the makepkg-mingw setup that is included in rtools40 even when - # we use the rtools35 compilers, so we always install R 4.0/Rtools40 - uses: r-lib/actions/setup-r@v2 with: r-version: "4.1" rtools-version: 40 Ncpus: 2 - - uses: r-lib/actions/setup-r@v2 - if: ${{ matrix.config.rtools == 35 }} - with: - rtools-version: 35 - r-version: "3.6" - Ncpus: 2 - name: Build Arrow C++ shell: bash env: - RTOOLS_VERSION: ${{ matrix.config.rtools }} MINGW_ARCH: ${{ matrix.config.arch }} run: ci/scripts/r_windows_build.sh - name: Rename libarrow.zip # So that they're unique when multiple are downloaded in the next step shell: bash run: mv libarrow.zip libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip - - uses: actions/upload-artifact@v1 + - uses: actions/upload-artifact@v3 with: name: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip path: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip - # We can remove this when we drop support for Rtools 3.5. - - name: Ensure using system tar in actions/cache - run: | - Write-Output "${Env:windir}\System32" | ` - Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append windows-r: needs: [windows-cpp] @@ -242,8 +234,6 @@ jobs: fail-fast: false matrix: config: - - { rtools: 35, rversion: "3.6" } - - { rtools: 40, rversion: "4.1" } - { rtools: 42, rversion: "4.2" } - { rtools: 42, rversion: "devel" } env: @@ -256,21 +246,9 @@ jobs: with: fetch-depth: 0 - run: mkdir r/windows - - name: Download artifacts - if: ${{ matrix.config.rtools != 42 }} - uses: actions/download-artifact@v2 - with: - name: libarrow-rtools${{ matrix.config.rtools }}-mingw32.zip - path: r/windows - - name: Download artifacts - if: ${{ matrix.config.rtools !=42 }} - uses: actions/download-artifact@v2 - with: - name: libarrow-rtools${{ matrix.config.rtools }}-mingw64.zip - path: r/windows - name: Download artifacts if: ${{ matrix.config.rtools == 42 }} - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v3 with: name: libarrow-rtools40-ucrt64.zip path: r/windows @@ -284,8 +262,6 @@ jobs: with: r-version: ${{ matrix.config.rversion }} rtools-version: ${{ matrix.config.rtools }} - # RSPM keeps install times short for 3.6 - use-public-rspm: true Ncpus: 2 - uses: r-lib/actions/setup-r-dependencies@v2 env: @@ -297,6 +273,19 @@ jobs: working-directory: 'r' extra-packages: | any::rcmdcheck + - name: Install MinIO + shell: bash + run: | + mkdir -p "$HOME/.local/bin" + curl \ + --output "$HOME/.local/bin/minio.exe" \ + https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z + chmod +x "$HOME/.local/bin/minio.exe" + echo "$HOME/.local/bin" >> $GITHUB_PATH + # TODO(ARROW-17149): figure out why the GCS tests are hanging on Windows + # - name: Install Google Cloud Storage Testbench + # shell: bash + # run: ci/scripts/install_gcs_testbench.sh default - name: Check shell: Rscript {0} run: | diff --git a/.github/workflows/r_nightly.yml b/.github/workflows/r_nightly.yml index a47f69136f8..8d10bee30d0 100644 --- 
a/.github/workflows/r_nightly.yml +++ b/.github/workflows/r_nightly.yml @@ -20,10 +20,6 @@ name: Upload R Nightly builds # to nightlies.apache.org. Due to authorization requirements, this upload can't be done # from the crossbow repository. -# This removes all permissions from the token -permissions: - contents: none - on: workflow_dispatch: inputs: @@ -40,6 +36,9 @@ on: #Crossbow packaging runs at 0 8 * * * - cron: '0 14 * * *' +permissions: + contents: read + jobs: upload: if: github.repository == 'apache/arrow' diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index bf49376c6fd..453c5a6edd2 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -47,10 +47,11 @@ concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} cancel-in-progress: true +permissions: + contents: read + env: DOCKER_VOLUME_PREFIX: ".docker/" - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} jobs: @@ -73,7 +74,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Cache Docker Volumes - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: .docker key: ubuntu-${{ matrix.ubuntu }}-ruby-${{ hashFiles('cpp/**') }} @@ -85,6 +86,9 @@ jobs: - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | sudo sysctl -w kernel.core_pattern="core.%e.%p" ulimit -c unlimited @@ -97,17 +101,18 @@ jobs: ubuntu-ruby - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} continue-on-error: true shell: bash run: archery docker push ubuntu-ruby macos: - name: AMD64 MacOS 10.15 GLib & Ruby + name: AMD64 macOS 12 GLib & Ruby runs-on: macos-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 60 - strategy: - fail-fast: false env: ARROW_BUILD_STATIC: OFF ARROW_BUILD_TESTS: OFF @@ -137,7 +142,11 @@ jobs: - name: Install Homebrew Dependencies shell: bash run: | - rm -f /usr/local/bin/2to3 + rm -f /usr/local/bin/2to3* || : + rm -f /usr/local/bin/idle3* || : + rm -f /usr/local/bin/pydoc3* || : + rm -f /usr/local/bin/python3* || : + rm -f /usr/local/bin/python3-config || : brew update --preinstall brew install --overwrite git brew bundle --file=cpp/Brewfile @@ -156,9 +165,9 @@ jobs: - name: ccache info id: ccache-info run: | - echo "::set-output name=cache-dir::$(ccache --get-config cache_dir)" + echo "cache-dir=$(ccache --get-config cache_dir)" >> $GITHUB_OUTPUT - name: Cache ccache - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ${{ steps.ccache-info.outputs.cache-dir }} key: ruby-ccache-macos-${{ hashFiles('cpp/**') }} @@ -195,7 +204,8 @@ jobs: ARROW_BUILD_TYPE: release ARROW_FLIGHT: ON ARROW_FLIGHT_SQL: ON - ARROW_GANDIVA: ON + # ARROW-17728: SEGV on MinGW + ARROW_GANDIVA: OFF ARROW_GCS: ON ARROW_HDFS: OFF ARROW_HOME: /ucrt${{ matrix.mingw-n-bits }} @@ -246,7 +256,7 @@ jobs: run: | ridk exec bash ci\scripts\msys2_setup.sh ruby - name: Cache ccache - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ccache key: ruby-ccache-ucrt${{ matrix.mingw-n-bits }}-${{ hashFiles('cpp/**') }} @@ -268,9 +278,10 @@ jobs: - name: RubyGems info id: rubygems-info run: | - Write-Output "::set-output name=gem-dir::$(ridk exec gem env 
gemdir)" + Write-Output "gem-dir=$(ridk exec gem env gemdir)" | ` + Out-File -FilePath $env:GITHUB_OUTPUT -Encoding utf8 -Append - name: Cache RubyGems - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ${{ steps.rubygems-info.outputs.gem-dir }} key: ruby-rubygems-ucrt${{ matrix.mingw-n-bits }}-${{ hashFiles('**/Gemfile', 'ruby/*/*.gemspec') }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7311b5a9a3f..926be8b5175 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -46,3 +46,4 @@ repos: - file - python exclude: vendored + args: [--config, python/setup.cfg] diff --git a/.travis.yml b/.travis.yml index 5038f66181a..6a4c2d3c9fc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -66,6 +66,7 @@ jobs: " # The LLVM's APT repository doesn't provide arm64 binaries. # We should use LLVM provided by Ubuntu. + CLANG_TOOLS: "10" LLVM: "10" UBUNTU: "20.04" @@ -109,6 +110,7 @@ jobs: " # The LLVM's APT repository causes download error for s390x binary # We should use the LLVM provided by the default APT repository + CLANG_TOOLS: "10" LLVM: "10" UBUNTU: "20.04" @@ -158,9 +160,16 @@ jobs: -e Protobuf_SOURCE=BUNDLED -e gRPC_SOURCE=BUNDLED " + # The LLVM's APT repository causes download error for s390x binary + # We should use the LLVM provided by the default APT repository + CLANG_TOOLS: "10" + LLVM: "10" + UBUNTU: "20.04" allow_failures: - name: "Java on s390x" + - name: "C++ on s390x" + - name: "Python on s390x" before_install: - eval "$(python ci/detect-changes.py)" @@ -182,6 +191,7 @@ install: - sudo -H pip3 install -e dev/archery[docker] script: + - export ARCHERY_DEFAULT_BRANCH=$(git rev-parse --abbrev-ref origin/HEAD | sed s@origin/@@) - | archery docker run \ ${DOCKER_RUN_ARGS} \ diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ecdf628355..e7103035204 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,435 @@ +# Apache Arrow 11.0.0 (2023-01-16 08:00:00) + +## New Features and Improvements + +* [ARROW-4709](https://issues.apache.org/jira/browse/ARROW-4709) - [C++] Optimize for ordered JSON fields (#14100) +* [ARROW-11776](https://issues.apache.org/jira/browse/ARROW-11776) - [C++][Java] Support parquet write from ArrowReader to file (#14151) +* [ARROW-13938](https://issues.apache.org/jira/browse/ARROW-13938) - [C++] Date and datetime types should autocast from strings +* [ARROW-13980](https://issues.apache.org/jira/browse/ARROW-13980) - [Go] Implement Scalar ApproxEquals (#14543) +* [ARROW-14161](https://issues.apache.org/jira/browse/ARROW-14161) - [C++][Docs] Improve Parquet C++ docs (#14018) +* [ARROW-14832](https://issues.apache.org/jira/browse/ARROW-14832) - [R] Implement bindings for stringr::str_remove and stringr::str_remove_all (#14644) +* [ARROW-14999](https://issues.apache.org/jira/browse/ARROW-14999) - [C++] Optional field name equality checks for map and list type (#14847) +* [ARROW-15006](https://issues.apache.org/jira/browse/ARROW-15006) - [Python][Doc] Add five more numpydoc checks to CI (#15214) +* [ARROW-15006](https://issues.apache.org/jira/browse/ARROW-15006) - [Python][CI][Doc] Enable numpydoc check PR03 (#13983) +* [ARROW-15206](https://issues.apache.org/jira/browse/ARROW-15206) - [Ruby] Add support for `Arrow::Table.load(uri, schema:)` (#15148) +* [ARROW-15460](https://issues.apache.org/jira/browse/ARROW-15460) - [R] Add as.data.frame.Dataset method (#14461) +* [ARROW-15470](https://issues.apache.org/jira/browse/ARROW-15470) - [R] Set null value in CSV writer (#14679) +* 
[ARROW-15538](https://issues.apache.org/jira/browse/ARROW-15538) - [C++] Expanding coverage of math functions from Substrait to Acero (#14434) +* [ARROW-15592](https://issues.apache.org/jira/browse/ARROW-15592) - [C++] Add support for custom output field names in a substrait::PlanRel (#14292) +* [ARROW-15691](https://issues.apache.org/jira/browse/ARROW-15691) - [Dev] Update archery to work with either master or main as default branch (#14033) +* [ARROW-15732](https://issues.apache.org/jira/browse/ARROW-15732) - [C++] Do not use any CPU threads in execution plan when use_threads is false (#15104) +* [ARROW-15812](https://issues.apache.org/jira/browse/ARROW-15812) - [R] Accept col_names in open_dataset for CSV (#14705) +* [ARROW-16266](https://issues.apache.org/jira/browse/ARROW-16266) - [R] Add StructArray$create() (#14922) +* [ARROW-16337](https://issues.apache.org/jira/browse/ARROW-16337) - [Python] Expose flag to enable/disable storing Arrow schema in Parquet metadata (#13000) +* [ARROW-16430](https://issues.apache.org/jira/browse/ARROW-16430) - [Python] Add support for reading record batch custom metadata API (#13041) +* [ARROW-16480](https://issues.apache.org/jira/browse/ARROW-16480) - [R] Update read_csv_arrow and open_dataset parse_options, read_options, and convert_options to take lists (#15270) +* [ARROW-16616](https://issues.apache.org/jira/browse/ARROW-16616) - [Python] Add lazy Dataset.filter() method (#13409) +* [ARROW-16673](https://issues.apache.org/jira/browse/ARROW-16673) - [Java] Integrate C Data into allocator hierarchy (#14506) +* [ARROW-16728](https://issues.apache.org/jira/browse/ARROW-16728) - [Python] ParquetDataset to still take legacy code path when old filesystem is passed (#15269) +* [ARROW-16728](https://issues.apache.org/jira/browse/ARROW-16728) - [Python] Switch default and deprecate use_legacy_dataset=True in ParquetDataset (#14052) +* [ARROW-16782](https://issues.apache.org/jira/browse/ARROW-16782) - [Format] Add REE definitions to FlatBuffers (#14176) +* [ARROW-17025](https://issues.apache.org/jira/browse/ARROW-17025) - [Dev] Remove github user name links from merge commit message (#14458) +* [ARROW-17144](https://issues.apache.org/jira/browse/ARROW-17144) - [C++][Gandiva] Add sqrt function (#13656) +* [ARROW-17187](https://issues.apache.org/jira/browse/ARROW-17187) - [R] Improve lazy ALTREP implementation for String (#14271) +* [ARROW-17212](https://issues.apache.org/jira/browse/ARROW-17212) - [Python] Support lazy Dataset.filter +* [ARROW-17301](https://issues.apache.org/jira/browse/ARROW-17301) - [C++] Implement compute function "binary_slice" (#14550) +* [ARROW-17302](https://issues.apache.org/jira/browse/ARROW-17302) - [R] Configure curl timeout policy for S3 (#15166) +* [ARROW-17360](https://issues.apache.org/jira/browse/ARROW-17360) - [Python] Order of columns in pyarrow.feather.read_table (#14528) +* [ARROW-17416](https://issues.apache.org/jira/browse/ARROW-17416) - [R] Implement lubridate::with\_tz and lubridate::force\_tz +* [ARROW-17425](https://issues.apache.org/jira/browse/ARROW-17425) - [R] `lubridate::as_datetime()` in dplyr query should be able to handle time in sub seconds (#13890) +* [ARROW-17462](https://issues.apache.org/jira/browse/ARROW-17462) - [R] Cast scalars to type of field in Expression building (#13985) +* [ARROW-17509](https://issues.apache.org/jira/browse/ARROW-17509) - [C++] Simplify async scheduler by removing the need to call End (#14524) +* [ARROW-17520](https://issues.apache.org/jira/browse/ARROW-17520) - [C++] Implement 
SubStrait SetRel (UnionAll) (#14186) +* [ARROW-17610](https://issues.apache.org/jira/browse/ARROW-17610) - [C++] Support additional source types in SourceNode (#14207) +* [ARROW-17613](https://issues.apache.org/jira/browse/ARROW-17613) - [C++] Add function execution API for a preconfigured kernel (#14043) +* [ARROW-17640](https://issues.apache.org/jira/browse/ARROW-17640) - [C++] Add File Handling Test cases for GlobFile handling in Substrait Read (#14132) +* [ARROW-17662](https://issues.apache.org/jira/browse/ARROW-17662) - [R] Facilitate offline installation from binaries (#14086) +* [ARROW-17726](https://issues.apache.org/jira/browse/ARROW-17726) - [CI] Enable sccache on more builds +* [ARROW-17731](https://issues.apache.org/jira/browse/ARROW-17731) - [Website] Add blog post about Flight SQL JDBC driver +* [ARROW-17732](https://issues.apache.org/jira/browse/ARROW-17732) - [Docs][Java] Add minimal JDBC driver docs (#14137) +* [ARROW-17751](https://issues.apache.org/jira/browse/ARROW-17751) - [Go][Benchmarking] Add Go Benchmark Script (#14148) +* [ARROW-17777](https://issues.apache.org/jira/browse/ARROW-17777) - [Dev] Update the pull request merge script to work with master or main +* [ARROW-17798](https://issues.apache.org/jira/browse/ARROW-17798) - [C++][Parquet] Add DELTA_BINARY_PACKED encoder to Parquet writer (#14191) +* [ARROW-17812](https://issues.apache.org/jira/browse/ARROW-17812) - [Gandiva][Docs] Add C++ Gandiva User Guide (#14200) +* [ARROW-17825](https://issues.apache.org/jira/browse/ARROW-17825) - [C++] Allow the possibility to write several tables in ORCFileWriter (#14219) +* [ARROW-17832](https://issues.apache.org/jira/browse/ARROW-17832) - [Python] Construct MapArray from sequence of dicts (instead of list of tuples) (#14547) +* [ARROW-17836](https://issues.apache.org/jira/browse/ARROW-17836) - [C++] Allow specifying alignment of buffers (#14225) +* [ARROW-17837](https://issues.apache.org/jira/browse/ARROW-17837) - [C++][Acero] Create ExecPlan-owned QueryContext that will store a plan's shared data structures (#14227) +* [ARROW-17838](https://issues.apache.org/jira/browse/ARROW-17838) - [Python] Unify CMakeLists.txt in python/ (#14925) +* [ARROW-17859](https://issues.apache.org/jira/browse/ARROW-17859) - [C++] Use self-pipe in signal-receiving StopSource (#14250) +* [ARROW-17867](https://issues.apache.org/jira/browse/ARROW-17867) - [C++][FlightRPC] Expose bulk parameter binding in Flight SQL (#14266) +* [ARROW-17870](https://issues.apache.org/jira/browse/ARROW-17870) - [Go] Add Scalar Binary Arithmetic +* [ARROW-17871](https://issues.apache.org/jira/browse/ARROW-17871) - [Go] initial binary arithmetic implementation (#14255) +* [ARROW-17887](https://issues.apache.org/jira/browse/ARROW-17887) - [R][Doc] Improve readability of the Get Started and README pages (#14514) +* [ARROW-17892](https://issues.apache.org/jira/browse/ARROW-17892) - [CI] Use Python 3.10 in AppVeyor build (#14307) +* [ARROW-17899](https://issues.apache.org/jira/browse/ARROW-17899) - [Go][CSV] Add Decimal support to CSV reader (#14504) +* [ARROW-17932](https://issues.apache.org/jira/browse/ARROW-17932) - [C++] Implement streaming RecordBatchReader for JSON (#14355) +* [ARROW-17949](https://issues.apache.org/jira/browse/ARROW-17949) - [C++][Docs] Remove the use of clcache from Windows dev docs (#14529) +* [ARROW-17953](https://issues.apache.org/jira/browse/ARROW-17953) - [Archery] Add archery docker info command (#14345) +* [ARROW-17960](https://issues.apache.org/jira/browse/ARROW-17960) - [C++][Python] 
Implement list_slice kernel (#14395) +* [ARROW-17966](https://issues.apache.org/jira/browse/ARROW-17966) - [C++] Adjust to new format for Substrait optional arguments (#14415) +* [ARROW-17972](https://issues.apache.org/jira/browse/ARROW-17972) - [CI] Update CUDA docker jobs (#14362) +* [ARROW-17975](https://issues.apache.org/jira/browse/ARROW-17975) - [C++] Create at-fork facility (#14594) +* [ARROW-17980](https://issues.apache.org/jira/browse/ARROW-17980) - [C++] As-of-Join Substrait extension (#14485) +* [ARROW-17989](https://issues.apache.org/jira/browse/ARROW-17989) - [C++][Python] Enable struct_field kernel to accept string field names (#14495) +* [ARROW-18008](https://issues.apache.org/jira/browse/ARROW-18008) - [Python][C++] Add use\_threads to run\_substrait\_query +* [ARROW-18012](https://issues.apache.org/jira/browse/ARROW-18012) - [R] Make map_batches .lazy = TRUE by default (#14521) +* [ARROW-18014](https://issues.apache.org/jira/browse/ARROW-18014) - [Java] Implement copy functions for vectors and Table (#14389) +* [ARROW-18016](https://issues.apache.org/jira/browse/ARROW-18016) - [CI] Add sccache to r jobs (#14570) +* [ARROW-18033](https://issues.apache.org/jira/browse/ARROW-18033) - [CI] Use $GITHUB_OUTPUT instead of set-output (#14409) +* [ARROW-18042](https://issues.apache.org/jira/browse/ARROW-18042) - [Java] Distribute Apple M1 compatible JNI libraries via mavencentral (#14472) +* [ARROW-18043](https://issues.apache.org/jira/browse/ARROW-18043) - [R] Properly instantiate empty arrays of extension types in Table__from_schema (#14519) +* [ARROW-18051](https://issues.apache.org/jira/browse/ARROW-18051) - [C++] Enable tests skipped by ARROW-16392 (#14425) +* [ARROW-18075](https://issues.apache.org/jira/browse/ARROW-18075) - [Website] Update install page for 9.0.0 +* [ARROW-18081](https://issues.apache.org/jira/browse/ARROW-18081) - [Go] Add Scalar Boolean functions (#14442) +* [ARROW-18095](https://issues.apache.org/jira/browse/ARROW-18095) - [CI][C++][MinGW] All tests exited with 0xc0000139 +* [ARROW-18108](https://issues.apache.org/jira/browse/ARROW-18108) - [Go] More scalar binary arithmetic (Multiply and Divide) (#14544) +* [ARROW-18109](https://issues.apache.org/jira/browse/ARROW-18109) - [Go] Initial Unary Arithmetic (#14605) +* [ARROW-18110](https://issues.apache.org/jira/browse/ARROW-18110) - [Go] Scalar Comparisons (#14669) +* [ARROW-18111](https://issues.apache.org/jira/browse/ARROW-18111) - [Go] Remaining scalar binary arithmetic (shifts, power, bitwise) (#14703) +* [ARROW-18112](https://issues.apache.org/jira/browse/ARROW-18112) - [Go] Remaining Scalar Arithmetic (#14777) +* [ARROW-18113](https://issues.apache.org/jira/browse/ARROW-18113) - [C++] Add RandomAccessFile::ReadManyAsync (#14723) +* [ARROW-18120](https://issues.apache.org/jira/browse/ARROW-18120) - [Release][Dev] Automate running binaries/wheels verifications (#14469) +* [ARROW-18121](https://issues.apache.org/jira/browse/ARROW-18121) - [Release][CI] Use Ubuntu 22.04 for verifying binaries (#14470) +* [ARROW-18122](https://issues.apache.org/jira/browse/ARROW-18122) - [Release][Dev] Update expected vote e-mail (#14548) +* [ARROW-18122](https://issues.apache.org/jira/browse/ARROW-18122) - [Release][Dev] Add verification PR URL to vote email (#14471) +* [ARROW-18135](https://issues.apache.org/jira/browse/ARROW-18135) - [C++] Avoid warnings that ExecBatch::length may be uninitialized (#14480) +* [ARROW-18137](https://issues.apache.org/jira/browse/ARROW-18137) - [Python][Docs] adding info about 
TableGroupBy.aggregation with empty list (#14482) +* [ARROW-18144](https://issues.apache.org/jira/browse/ARROW-18144) - [C++] Improve JSONTypeError error message in testing (#14486) +* [ARROW-18147](https://issues.apache.org/jira/browse/ARROW-18147) - [Go] Add Scalar Add/Sub for Decimal types (#14489) +* [ARROW-18151](https://issues.apache.org/jira/browse/ARROW-18151) - [CI] Avoid unnecessary redirect for some conda URLs (#14494) +* [ARROW-18152](https://issues.apache.org/jira/browse/ARROW-18152) - [Python] DataFrame Interchange Protocol for pyarrow Table +* [ARROW-18169](https://issues.apache.org/jira/browse/ARROW-18169) - [Website] Don't run dev docs update on fork repositories +* [ARROW-18173](https://issues.apache.org/jira/browse/ARROW-18173) - [Python] Drop older versions of Pandas (<1.0) (#14631) +* [ARROW-18174](https://issues.apache.org/jira/browse/ARROW-18174) - [R] Fix compile of altrep.cpp on some builds (#14530) +* [ARROW-18177](https://issues.apache.org/jira/browse/ARROW-18177) - [Go] Add Add/Sub for Temporal types (#14532) +* [ARROW-18178](https://issues.apache.org/jira/browse/ARROW-18178) - [Java] ArrowVectorIterator incorrectly closes Vectors (#14534) +* [ARROW-18184](https://issues.apache.org/jira/browse/ARROW-18184) - [C++] Improve JSON parser benchmarks (#14552) +* [ARROW-18203](https://issues.apache.org/jira/browse/ARROW-18203) - [R] Refactor to remove unnecessary uses of build_expr (#14553) +* [ARROW-18206](https://issues.apache.org/jira/browse/ARROW-18206) - [C++][CI] Add a nightly build for C++20 compilation (#14571) +* [ARROW-18220](https://issues.apache.org/jira/browse/ARROW-18220) - [Dev] Remove a magic number for the default parallel level in downloader (#14563) +* [ARROW-18221](https://issues.apache.org/jira/browse/ARROW-18221) - [Release][Dev] Add support for customizing arrow-site dir (#14564) +* [ARROW-18222](https://issues.apache.org/jira/browse/ARROW-18222) - [Release][MSYS2] Detect reverse dependencies automatically (#14565) +* [ARROW-18223](https://issues.apache.org/jira/browse/ARROW-18223) - [Release][Homebrew] Detect reverse dependencies automatically (#14566) +* [ARROW-18224](https://issues.apache.org/jira/browse/ARROW-18224) - [Release][jar] Use temporary directory for download (#14567) +* [ARROW-18230](https://issues.apache.org/jira/browse/ARROW-18230) - [Python] Pass Cmake args to Python CPP +* [ARROW-18233](https://issues.apache.org/jira/browse/ARROW-18233) - [Release][JS] don't install yarn to system (#14577) +* [ARROW-18235](https://issues.apache.org/jira/browse/ARROW-18235) - [C++][Gandiva] Fix the like function implementation for escape chars (#14579) +* [ARROW-18237](https://issues.apache.org/jira/browse/ARROW-18237) - [Java] Extend Table code (#14573) +* [ARROW-18238](https://issues.apache.org/jira/browse/ARROW-18238) - [Docs][Python] Improve docs for S3FileSystem (#14599) +* [ARROW-18240](https://issues.apache.org/jira/browse/ARROW-18240) - [R] head() is crashing on some nightly builds (#14582) +* [ARROW-18243](https://issues.apache.org/jira/browse/ARROW-18243) - [R] Sanitizer nightly failure pointing to mixup between TimestampType and DurationType +* [ARROW-18248](https://issues.apache.org/jira/browse/ARROW-18248) - [CI][Release] Use GitHub token to avoid API rate limit (#14588) +* [ARROW-18249](https://issues.apache.org/jira/browse/ARROW-18249) - [C++] Update vcpkg port to arrow 10.0.0 +* [ARROW-18253](https://issues.apache.org/jira/browse/ARROW-18253) - [C++][Parquet] Add additional bounds safety checks (#14592) +* 
[ARROW-18259](https://issues.apache.org/jira/browse/ARROW-18259) - [C++][CMake] Add support for system Thrift CMake package (#14597) +* [ARROW-18264](https://issues.apache.org/jira/browse/ARROW-18264) - [Python] Add missing value accessor to temporal types (#14746) +* [ARROW-18264](https://issues.apache.org/jira/browse/ARROW-18264) - [Python] Expose time32/time64 scalar values (#14637) +* [ARROW-18270](https://issues.apache.org/jira/browse/ARROW-18270) - [Python] Remove gcc 4.9 compatibility code (#14602) +* [ARROW-18278](https://issues.apache.org/jira/browse/ARROW-18278) - [Java] Adjust path in Maven generate-libs-jni-macos-linux (#14623) +* [ARROW-18280](https://issues.apache.org/jira/browse/ARROW-18280) - [C++][Python] Support slicing to end in list_slice kernel (#14749) +* [ARROW-18282](https://issues.apache.org/jira/browse/ARROW-18282) - [C++][Python] Support step >= 1 in list_slice kernel (#14696) +* [ARROW-18287](https://issues.apache.org/jira/browse/ARROW-18287) - [C++][CMake] Add support for Brotli/utf8proc provided by vcpkg (#14609) +* [ARROW-18289](https://issues.apache.org/jira/browse/ARROW-18289) - [Release][vcpkg] Add a script to update vcpkg's arrow port (#14610) +* [ARROW-18291](https://issues.apache.org/jira/browse/ARROW-18291) - [Release][Docs] Update how to release (#14612) +* [ARROW-18292](https://issues.apache.org/jira/browse/ARROW-18292) - [Release][Python] Upload .wheel/.tar.gz for release not RC (#14708) +* [ARROW-18303](https://issues.apache.org/jira/browse/ARROW-18303) - [Go] Allow easy compute module importing (#14690) +* [ARROW-18306](https://issues.apache.org/jira/browse/ARROW-18306) - [R] Failing test after compute function updates (#14620) +* [ARROW-18318](https://issues.apache.org/jira/browse/ARROW-18318) - [Python] Expose Scalar.validate() (#15149) +* [ARROW-18321](https://issues.apache.org/jira/browse/ARROW-18321) - [R] Add tests for binary_slice kernel (#14647) +* [ARROW-18323](https://issues.apache.org/jira/browse/ARROW-18323) - Enabling issue templates in GitHub issues (#14675) +* [ARROW-18332](https://issues.apache.org/jira/browse/ARROW-18332) - [Go] Cast Dictionary types to value type (#14650) +* [ARROW-18333](https://issues.apache.org/jira/browse/ARROW-18333) - [Go][Docs] Update compute function docs (#14815) +* [ARROW-18336](https://issues.apache.org/jira/browse/ARROW-18336) - [Release][Docs] Don't update versions not in major release (#14653) +* [ARROW-18337](https://issues.apache.org/jira/browse/ARROW-18337) - [R] Possible undesirable handling of POSIXlt objects (#15277) +* [ARROW-18340](https://issues.apache.org/jira/browse/ARROW-18340) - [Python] PyArrow C++ header files no longer always included in installed pyarrow (#14656) +* [ARROW-18341](https://issues.apache.org/jira/browse/ARROW-18341) - [Doc][Python] Update note about bundling Arrow C++ on Windows (#14660) +* [ARROW-18342](https://issues.apache.org/jira/browse/ARROW-18342) - [C++] AsofJoinNode support for Boolean data field (#14658) +* [ARROW-18345](https://issues.apache.org/jira/browse/ARROW-18345) - [R] Create a CRAN-specific packaging checklist that lives in the R package directory (#14678) +* [ARROW-18348](https://issues.apache.org/jira/browse/ARROW-18348) - [CI][Release][Yum] redhat-rpm-config is needed on AlmaLinux 9 (#14661) +* [ARROW-18350](https://issues.apache.org/jira/browse/ARROW-18350) - [C++] Use std::to_chars instead of std::to_string (#14666) +* [ARROW-18358](https://issues.apache.org/jira/browse/ARROW-18358) - [R] Implement new function open\_dataset\_csv with signature 
more closely matching read\_csv\_arrow +* [ARROW-18361](https://issues.apache.org/jira/browse/ARROW-18361) - [CI][Conan] Merge upstream changes (#14671) +* [ARROW-18363](https://issues.apache.org/jira/browse/ARROW-18363) - [Docs] Include warning when viewing old docs (redirecting to stable/dev docs) (#14839) +* [ARROW-18366](https://issues.apache.org/jira/browse/ARROW-18366) - [Packaging][RPM][Gandiva] Fix link error on AlmaLinux 9 (#14680) +* [ARROW-18367](https://issues.apache.org/jira/browse/ARROW-18367) - [C++] Enable the creation of named table relations (#14681) +* [ARROW-18373](https://issues.apache.org/jira/browse/ARROW-18373) - Fix component drop-down, add license text (#14688) +* [ARROW-18377](https://issues.apache.org/jira/browse/ARROW-18377) - MIGRATION: Automate component labels from issue form content (#15245) +* [ARROW-18380](https://issues.apache.org/jira/browse/ARROW-18380) - [Dev] Update dev_pr GitHub workflows to accept both GitHub issues and JIRA (#14731) +* [ARROW-18384](https://issues.apache.org/jira/browse/ARROW-18384) - [Release][MSYS2] Show pull request title (#14709) +* [ARROW-18391](https://issues.apache.org/jira/browse/ARROW-18391) - [R] Fix the version selector dropdown in the dev docs (#14800) +* [ARROW-18395](https://issues.apache.org/jira/browse/ARROW-18395) - [C++] Move select-k implementation into separate module +* [ARROW-18399](https://issues.apache.org/jira/browse/ARROW-18399) - [Python] Reduce warnings during tests (#14729) +* [ARROW-18401](https://issues.apache.org/jira/browse/ARROW-18401) - [R] Failing test on test-r-rhub-ubuntu-gcc-release-latest (#14894) +* [ARROW-18402](https://issues.apache.org/jira/browse/ARROW-18402) - [C++] Expose `DeclarationInfo` (#14765) +* [ARROW-18406](https://issues.apache.org/jira/browse/ARROW-18406) - [C++] Can't build Arrow with Substrait on Ubuntu 20.04 (#14735) +* [ARROW-18407](https://issues.apache.org/jira/browse/ARROW-18407) - [Release][Website] Use UTC for release date (#14737) +* [ARROW-18409](https://issues.apache.org/jira/browse/ARROW-18409) - [GLib][Plasma] Suppress deprecated warning in building plasma-glib (#14739) +* [ARROW-18410](https://issues.apache.org/jira/browse/ARROW-18410) - [Packaging][Ubuntu] Add support for Ubuntu 22.10 (#14740) +* [ARROW-18413](https://issues.apache.org/jira/browse/ARROW-18413) - [C++][Parquet] Expose page index info from ColumnChunkMetaData (#14742) +* [ARROW-18418](https://issues.apache.org/jira/browse/ARROW-18418) - [Website] do not delete /datafusion-python +* [ARROW-18419](https://issues.apache.org/jira/browse/ARROW-18419) - [C++] Update vendored fast_float (#14817) +* [ARROW-18420](https://issues.apache.org/jira/browse/ARROW-18420) - [C++][Parquet] Introduce ColumnIndex & OffsetIndex (#14803) +* [ARROW-18421](https://issues.apache.org/jira/browse/ARROW-18421) - [C++][ORC] Add accessor for stripe information in reader (#14806) +* [ARROW-18423](https://issues.apache.org/jira/browse/ARROW-18423) - [Python] Expose reading a schema from an IPC message (#14831) +* [ARROW-18426](https://issues.apache.org/jira/browse/ARROW-18426) - Update committers and PMC members on website +* [ARROW-18427](https://issues.apache.org/jira/browse/ARROW-18427) - [C++] Support negative tolerance in `AsofJoinNode` (#14934) +* [ARROW-18428](https://issues.apache.org/jira/browse/ARROW-18428) - [Website] Enable github issues on arrow-site repo +* [ARROW-18435](https://issues.apache.org/jira/browse/ARROW-18435) - [C++][Java] Update ORC to 1.8.1 (#14942) +* 
[GH-14474](https://github.com/apache/arrow/issues/14474) - Opportunistically delete R references to shared pointers where possible (#15278) +* [GH-14720](https://github.com/apache/arrow/issues/14720) - [Dev] Update merge_arrow_pr script to accept GitHub issues (#14750) +* [GH-14755](https://github.com/apache/arrow/issues/14755) - [Python] Expose QuotingStyle to Python (#14722) +* [GH-14761](https://github.com/apache/arrow/issues/14761) - [Dev] Update labels on PR labeler to use new Component ones (#14762) +* [GH-14778](https://github.com/apache/arrow/issues/14778) - [Python] Add (Chunked)Array sort() method (#14781) +* [GH-14784](https://github.com/apache/arrow/issues/14784) - [Dev] Add possibility to autoassign on GitHub issue comment (#14785) +* [GH-14786](https://github.com/apache/arrow/issues/14786) - [Java][Doc] Replace in-folder documentation (#14789) +* [GH-14787](https://github.com/apache/arrow/issues/14787) - [Java][Doc] Update table.rst (#14794) +* [GH-14809](https://github.com/apache/arrow/issues/14809) - [Dev] Add created GitHub issues to issues@arrow.apache.org (#14811) +* [GH-14816](https://github.com/apache/arrow/issues/14816) - [Release] Make dev/release/06-java-upload.sh reusable from other project (#14830) +* [GH-14824](https://github.com/apache/arrow/issues/14824) - [CI] r-binary-packages should only upload artifacts if all tests succeed (#14841) +* [GH-14844](https://github.com/apache/arrow/issues/14844) - [Java] Short circuit null checks when comparing non null field types (#15106) +* [GH-14846](https://github.com/apache/arrow/issues/14846) - [Dev] Support GitHub Releases in download_rc_binaries.py (#14848) +* [GH-14854](https://github.com/apache/arrow/issues/14854) - Make changes to .md pages (#14852) +* [GH-14869](https://github.com/apache/arrow/issues/14869) - [C++] Add Cflags.private defining _STATIC to .pc.in. 
(#14900) +* [GH-14873](https://github.com/apache/arrow/issues/14873) - [Java] DictionaryEncoder can decode without building a DictionaryHashTable (#14874) +* [GH-14885](https://github.com/apache/arrow/issues/14885) - [Docs] Make changes to the New Contrib Guide (Jira -> GitHub) (#14889) +* [GH-14901](https://github.com/apache/arrow/issues/14901) - [Java] ListSubfieldEncoder and StructSubfieldEncoder can decode without DictionaryHashTable (#14902) +* [GH-14918](https://github.com/apache/arrow/issues/14918) - [Docs] Make changes to developers section of the docs (Jira -> GitHub) (#14919) +* [GH-14920](https://github.com/apache/arrow/issues/14920) - [C++][CMake] Add missing -latomic to Arrow CMake package (#15251) +* [GH-14937](https://github.com/apache/arrow/issues/14937) - [C++] Add rank kernel benchmarks (#14938) +* [GH-14951](https://github.com/apache/arrow/issues/14951) - [C++][Parquet] Add benchmarks for DELTA_BINARY_PACKED encoding (#15140) +* [GH-14961](https://github.com/apache/arrow/issues/14961) - [Ruby] Use newer extpp for C++17 (#14962) +* [GH-14975](https://github.com/apache/arrow/issues/14975) - [Python] Dataset.sort_by (#14976) +* [GH-14976](https://github.com/apache/arrow/issues/14976) - [Python] Avoid dependency on exec plan in Table.sort_by to fix minimal tests (#15268) +* [GH-14977](https://github.com/apache/arrow/issues/14977) - [Dev][CI] Add notify-token-expiration to archery (#14978) +* [GH-14981](https://github.com/apache/arrow/issues/14981) - [R] Forward compatibility with dplyr::join_by() (#33664) +* [GH-14986](https://github.com/apache/arrow/issues/14986) - [Release] Don't detect previous version on maint-X.Y.Z branch (#14987) +* [GH-14992](https://github.com/apache/arrow/issues/14992) - [Packaging] Make dev/release/binary-task.rb reusable from other project (#14994) +* [GH-14997](https://github.com/apache/arrow/issues/14997) - [Release] Ensure archery release tasks works with both new style GitHub issues and old style JIRA issues (#33615) +* [GH-14999](https://github.com/apache/arrow/issues/14999) - [Release][Archery] Update archery release changelog to support GitHub issues +* [GH-15002](https://github.com/apache/arrow/issues/15002) - [Release][Archery] Update archery release cherry-pick to support GitHub issues +* [GH-15005](https://github.com/apache/arrow/issues/15005) - [Go] Add scalar.Append to append scalars to builder (#15006) +* [GH-15009](https://github.com/apache/arrow/issues/15009) - [R] stringr 1.5.0 with the str_like function is already released (#15010) +* [GH-15012](https://github.com/apache/arrow/issues/15012) - [Packaging][deb] Use system Protobuf for Debian GNU/Linux bookworm (#15013) +* [GH-15035](https://github.com/apache/arrow/issues/15035) - [CI] Remove unsupported turbodbc jobs and scripts from CI (#15036) +* [GH-15050](https://github.com/apache/arrow/issues/15050) - [Java][Docs] Update and consolidate Memory documentation (#15051) +* [GH-15072](https://github.com/apache/arrow/issues/15072) - [C++] Move the round functionality into a separate module (#15073) +* [GH-15074](https://github.com/apache/arrow/issues/15074) - [Parquet][C++] change 16-bit page_ordinal to 32-bit (#15182) +* [GH-15081](https://github.com/apache/arrow/issues/15081) - [Release] Add support for using custom artifacts directory in dev/release/05-binary-upload.sh (#15082) +* [GH-15084](https://github.com/apache/arrow/issues/15084) - [Ruby] Use common keys when keys.nil? 
in Table#join (#15088) +* [GH-15085](https://github.com/apache/arrow/issues/15085) - [Ruby] Add ColumnContainable#column_names (#15089) +* [GH-15087](https://github.com/apache/arrow/issues/15087) - [Release] Slow down downloading RC binaries from GitHub (#15090) +* [GH-15096](https://github.com/apache/arrow/issues/15096) - [C++] Substrait ProjectRel Emit Optimization (#15097) +* [GH-15100](https://github.com/apache/arrow/issues/15100) - [C++][Parquet] Add benchmark for reading strings from Parquet (#15101) +* [GH-15119](https://github.com/apache/arrow/issues/15119) - [Release][Docs][R] Update version information in patch release (#15120) +* [GH-15134](https://github.com/apache/arrow/issues/15134) - [Ruby] Specify -mmacox-version-min=10.14 explicitly for old Xcode (#15135) +* [GH-15146](https://github.com/apache/arrow/issues/15146) - [GLib] Add `GADatasetFinishOptions` (#15147) +* [GH-15151](https://github.com/apache/arrow/issues/15151) - [C++] Adding RecordBatchReaderSource to solve an issue in R API (#15183) +* [GH-15168](https://github.com/apache/arrow/issues/15168) - [GLib] Add support for half float (#15169) +* [GH-15174](https://github.com/apache/arrow/issues/15174) - [Go][FlightRPC] Expose Flight Server Desc and RegisterFlightService (#15177) +* [GH-15185](https://github.com/apache/arrow/issues/15185) - [C++][Parquet] Improve documentation for Parquet Reader column_indices (#15184) +* [GH-15199](https://github.com/apache/arrow/issues/15199) - [C++][Substrait] Allow AGGREGATION_INVOCATION_UNSPECIFIED as valid invocation (#15198) +* [GH-15200](https://github.com/apache/arrow/issues/15200) - [C++] Created benchmarks for round kernels. (#15201) +* [GH-15205](https://github.com/apache/arrow/issues/15205) - [R] Fix a parquet-fixture finding in R tests (#15207) +* [GH-15216](https://github.com/apache/arrow/issues/15216) - [C++][Parquet] Parquet writer accepts RecordBatch (#15240) +* [GH-15218](https://github.com/apache/arrow/issues/15218) - [Python] Remove auto generated pyarrow_api.h and pyarrow_lib.h (#15219) +* [GH-15226](https://github.com/apache/arrow/issues/15226) - [C++] Add DurationType to hash kernels (#33685) +* [GH-15237](https://github.com/apache/arrow/issues/15237) - [C++] Add ::arrow::Unreachable() using std::string_view (#15238) +* [GH-15239](https://github.com/apache/arrow/issues/15239) - [C++][Parquet] Parquet writer writes decimal as int32/64 (#15244) +* [GH-15249](https://github.com/apache/arrow/issues/15249) - [Documentation] Add PR template (#15250) +* [GH-15257](https://github.com/apache/arrow/issues/15257) - [GLib][Dataset] Add GADatasetHivePartitioning (#15272) +* [GH-15265](https://github.com/apache/arrow/issues/15265) - [Java] Publish SBOM artifacts (#15267) +* [GH-15289](https://github.com/apache/arrow/issues/15289) - [Ruby] Return self when saving Table to csv (#33653) +* [GH-15290](https://github.com/apache/arrow/issues/15290) - [C++][Compute] Optimize IfElse kernel AAS/ASA case when the scalar is null (#15291) +* [GH-33607](https://github.com/apache/arrow/issues/33607) - [C++] Support optional additional arguments for inline visit functions (#33608) +* [GH-33610](https://github.com/apache/arrow/issues/33610) - [Dev] Do not allow ARROW prefixed tickets to be merged nor used on PR titles (#33611) +* [GH-33619](https://github.com/apache/arrow/issues/33619) - [Documentation] Update PR template (#33620) +* [GH-33657](https://github.com/apache/arrow/issues/33657) - [C++] arrow-dataset.pc doesn't depend on parquet.pc without ARROW_PARQUET=ON (#33665) +* 
[GH-33670](https://github.com/apache/arrow/issues/33670) - [GLib] Add `GArrowProjectNodeOptions` (#33677) +* [GH-33671](https://github.com/apache/arrow/issues/33671) - [GLib] Add `garrow_chunked_array_new_empty()` (#33675) +* [PARQUET-2179](https://issues.apache.org/jira/browse/PARQUET-2179) - [C++][Parquet] Add a test for skipping repeated fields (#14366) +* [PARQUET-2188](https://issues.apache.org/jira/browse/PARQUET-2188) - [parquet-cpp] Add SkipRecords API to RecordReader (#14142) +* [PARQUET-2204](https://issues.apache.org/jira/browse/PARQUET-2204) - [parquet-cpp] TypedColumnReaderImpl::Skip should reuse scratch space (#14509) +* [PARQUET-2206](https://issues.apache.org/jira/browse/PARQUET-2206) - [parquet-cpp] Microbenchmark for ColumnReader ReadBatch and Skip (#14523) +* [PARQUET-2209](https://issues.apache.org/jira/browse/PARQUET-2209) - [parquet-cpp] Optimize skip for the case that number of values to skip equals page size (#14545) +* [PARQUET-2210](https://issues.apache.org/jira/browse/PARQUET-2210) - [C++][Parquet] Skip pages based on header metadata using a callback (#14603) +* [PARQUET-2211](https://issues.apache.org/jira/browse/PARQUET-2211) - [C++] Print ColumnMetaData.encoding_stats field (#14556) + + +## Bug Fixes + +* [ARROW-11631](https://issues.apache.org/jira/browse/ARROW-11631) - [R] Implement RPrimitiveConverter for Decimal type +* [ARROW-15026](https://issues.apache.org/jira/browse/ARROW-15026) - [Python] Error if datetime.timedelta to pyarrow.duration conversion overflows (#13718) +* [ARROW-15328](https://issues.apache.org/jira/browse/ARROW-15328) - [C++][Docs] Streaming CSV reader missing from documentation (#14452) +* [ARROW-15822](https://issues.apache.org/jira/browse/ARROW-15822) - [C++] Cast duration to string (thus CSV writing) not supported (#14450) +* [ARROW-16464](https://issues.apache.org/jira/browse/ARROW-16464) - [C++][CI][GPU] Add CUDA CI (#14497) +* [ARROW-16471](https://issues.apache.org/jira/browse/ARROW-16471) - [Go] RecordBuilder UnmarshalJSON handle complex values (#14560) +* [ARROW-16547](https://issues.apache.org/jira/browse/ARROW-16547) - [Python] to_pandas fails with FixedOffset timezones when timestamp_as_object is used (#14448) +* [ARROW-16795](https://issues.apache.org/jira/browse/ARROW-16795) - [C#][Flight] Nightly verify-rc-source-csharp-macos-arm64 fails (#15235) +* [ARROW-16817](https://issues.apache.org/jira/browse/ARROW-16817) - [C++] Test ORC writer errors with invalid types (#14638) +* [ARROW-17054](https://issues.apache.org/jira/browse/ARROW-17054) - [R] Creating an Array from an object bigger than 2^31 results in an Array of length 0 (#14929) +* [ARROW-17192](https://issues.apache.org/jira/browse/ARROW-17192) - [Python] Pass **kwargs in read_feather to to_pandas() (#14492) +* [ARROW-17332](https://issues.apache.org/jira/browse/ARROW-17332) - [R] error parsing folder path with accent ('c:/Público') in read_csv_arrow (#14930) +* [ARROW-17361](https://issues.apache.org/jira/browse/ARROW-17361) - [R] dplyr::summarize fails with division when divisor is a variable (#14933) +* [ARROW-17374](https://issues.apache.org/jira/browse/ARROW-17374) - [C++] Snappy package may be built without CMAKE_BUILD_TYPE (#14818) +* [ARROW-17458](https://issues.apache.org/jira/browse/ARROW-17458) - [C++] Cast between decimal and string (#14232) +* [ARROW-17538](https://issues.apache.org/jira/browse/ARROW-17538) - [C++] Import schema when importing array stream (#15037) +* [ARROW-17637](https://issues.apache.org/jira/browse/ARROW-17637) - [R][us][s] 
(#14935) +* [ARROW-17692](https://issues.apache.org/jira/browse/ARROW-17692) - [R] Add support for building with system AWS SDK C++ (#14235) +* [ARROW-17772](https://issues.apache.org/jira/browse/ARROW-17772) - [Doc] Sphinx / reST markup error +* [ARROW-17774](https://issues.apache.org/jira/browse/ARROW-17774) - [Python] Add python test for decimals to csv (#14525) +* [ARROW-17858](https://issues.apache.org/jira/browse/ARROW-17858) - [C++] Compilating warning in arrow/csv/parser.h (#14445) +* [ARROW-17893](https://issues.apache.org/jira/browse/ARROW-17893) - [Python] Test that reading of timedelta is stable (read_feather/to_pandas) (#14531) +* [ARROW-17985](https://issues.apache.org/jira/browse/ARROW-17985) - [C++][Python] Improve s3fs error message when wrong region (#14601) +* [ARROW-17991](https://issues.apache.org/jira/browse/ARROW-17991) - [Python][C++] Adding support for IpcWriteOptions to the dataset ipc file writer (#14414) +* [ARROW-18052](https://issues.apache.org/jira/browse/ARROW-18052) - [Python] Support passing create_dir thru pq.write_to_dataset (#14459) +* [ARROW-18068](https://issues.apache.org/jira/browse/ARROW-18068) - [Dev][Archery][Crossbow] Comment bot only waits for task if link is not available (#14429) +* [ARROW-18070](https://issues.apache.org/jira/browse/ARROW-18070) - [C++] Invoke google::protobuf::ShutdownProtobufLibrary for substrait tests (#14508) +* [ARROW-18086](https://issues.apache.org/jira/browse/ARROW-18086) - [Ruby] Add support for HalfFloat (#15204) +* [ARROW-18087](https://issues.apache.org/jira/browse/ARROW-18087) - [C++] RecordBatch::Equals should not ignore field names (#14451) +* [ARROW-18088](https://issues.apache.org/jira/browse/ARROW-18088) - [CI][Python] Fix pandas master/nightly build failure related to timedelta (#14460) +* [ARROW-18101](https://issues.apache.org/jira/browse/ARROW-18101) - [R] RecordBatchReaderHead from ExecPlan with UDF cannot be read (#14518) +* [ARROW-18106](https://issues.apache.org/jira/browse/ARROW-18106) - [C++] JSON reader ignores explicit schema with default unexpected_field_behavior="infer" (#14741) +* [ARROW-18117](https://issues.apache.org/jira/browse/ARROW-18117) - [C++] Fix static bundle build (#14465) +* [ARROW-18118](https://issues.apache.org/jira/browse/ARROW-18118) - [Release][Dev] Fix problems in 02-source.sh/03-binary-submit.sh for 10.0.0-rc0 (#14468) +* [ARROW-18123](https://issues.apache.org/jira/browse/ARROW-18123) - [Python] Fix writing files with multi-byte characters in file name (#14764) +* [ARROW-18125](https://issues.apache.org/jira/browse/ARROW-18125) - [Python] Handle pytest 8 deprecations about pytest.warns(None) +* [ARROW-18126](https://issues.apache.org/jira/browse/ARROW-18126) - [Python] Remove ARROW_BUILD_DIR in building pyarrow C++ (#14498) +* [ARROW-18128](https://issues.apache.org/jira/browse/ARROW-18128) - [Java][CI] Update timestamp of Java Nightlies X.Y.Z-SNAPSHOT folder (#14496) +* [ARROW-18149](https://issues.apache.org/jira/browse/ARROW-18149) - [C++] fix build failure of `join_example` (#14490) +* [ARROW-18157](https://issues.apache.org/jira/browse/ARROW-18157) - [Dev][Archery] "archery docker run" sets env var to None when inherited (#14501) +* [ARROW-18158](https://issues.apache.org/jira/browse/ARROW-18158) - [CI] Use default Python version when installing conda cpp environment to fix conda builds (#14500) +* [ARROW-18159](https://issues.apache.org/jira/browse/ARROW-18159) - [Go][Release] Add `go install` to verify-release script (#14503) +* 
[ARROW-18161](https://issues.apache.org/jira/browse/ARROW-18161) - [Ruby] Refer source input in sub objects (#15217) +* [ARROW-18164](https://issues.apache.org/jira/browse/ARROW-18164) - [Python] Honor default memory pool in Dataset scanning (#14516) +* [ARROW-18167](https://issues.apache.org/jira/browse/ARROW-18167) - [Go][Release] update go.work with release (#14522) +* [ARROW-18172](https://issues.apache.org/jira/browse/ARROW-18172) - [CI][Release] Source Release and Merge Script jobs fail on master +* [ARROW-18183](https://issues.apache.org/jira/browse/ARROW-18183) - [C++] cpp-micro benchmarks are failing on mac arm machine (#14562) +* [ARROW-18188](https://issues.apache.org/jira/browse/ARROW-18188) - [CI] CUDA nightly docker upload fails due to wrong tag (#14538) +* [ARROW-18195](https://issues.apache.org/jira/browse/ARROW-18195) - [C++] Fix case_when produces bad data when condition has nulls (#15131) +* [ARROW-18202](https://issues.apache.org/jira/browse/ARROW-18202) - [C++] Reallow regexp replace on empty string (#15132) +* [ARROW-18205](https://issues.apache.org/jira/browse/ARROW-18205) - [C++] Substrait consumer is not converting right side references correctly on joins (#14558) +* [ARROW-18207](https://issues.apache.org/jira/browse/ARROW-18207) - [Ruby] RubyGems for 10.0.0 aren't updated yet +* [ARROW-18209](https://issues.apache.org/jira/browse/ARROW-18209) - [Java] Make ComplexCopier agnostic of specific implementation of MapWriter (UnionMapWriter) (#14557) +* [ARROW-18212](https://issues.apache.org/jira/browse/ARROW-18212) - [C++] NumericBuilder::Reset() doesn't reset all members (#14559) +* [ARROW-18225](https://issues.apache.org/jira/browse/ARROW-18225) - [Python] Fully support filesystem in parquet.write_metadata (#14574) +* [ARROW-18227](https://issues.apache.org/jira/browse/ARROW-18227) - [CI][Packaging] Do not fail conda-clean if conda search raises PackagesNotFound (#14569) +* [ARROW-18229](https://issues.apache.org/jira/browse/ARROW-18229) - [Python] Check schema argument type in RecordBatchReader.from_batches (#14583) +* [ARROW-18231](https://issues.apache.org/jira/browse/ARROW-18231) - [C++][CMake] Add support for overriding optimization level (#15022) +* [ARROW-18246](https://issues.apache.org/jira/browse/ARROW-18246) - [Python][Docs] PyArrow table join docstring typos for left and right suffix arguments (#14591) +* [ARROW-18247](https://issues.apache.org/jira/browse/ARROW-18247) - [JS] fix: RangeError crash in Vector.toArray() (#14587) +* [ARROW-18256](https://issues.apache.org/jira/browse/ARROW-18256) - [C++][Windows] Use IMPORTED_IMPLIB for external shared Thrift (#14595) +* [ARROW-18257](https://issues.apache.org/jira/browse/ARROW-18257) - [Python] pass back time types with correct type class (#14633) +* [ARROW-18269](https://issues.apache.org/jira/browse/ARROW-18269) - [C++] Handle slash character in Hive-style partition values (#14646) +* [ARROW-18272](https://issues.apache.org/jira/browse/ARROW-18272) - [Python] Support filesystem parameter in ParquetFile (#14717) +* [ARROW-18284](https://issues.apache.org/jira/browse/ARROW-18284) - [Python][Docs] Add missing CMAKE_PREFIX_PATH to allow setup.py CMake invocations to find Arrow CMake package (#14586) +* [ARROW-18290](https://issues.apache.org/jira/browse/ARROW-18290) - [C++] Escape all special chars in URI-encoding (#14645) +* [ARROW-18309](https://issues.apache.org/jira/browse/ARROW-18309) - [Go] Fix delta bit packing decode panic (#14649) +* [ARROW-18320](https://issues.apache.org/jira/browse/ARROW-18320) - 
[C++][FlightRPC] Fix improper Status/Result conversion in Flight client (#14859) +* [ARROW-18334](https://issues.apache.org/jira/browse/ARROW-18334) - [C++] Handle potential non-commutativity by rebinding (#14659) +* [ARROW-18339](https://issues.apache.org/jira/browse/ARROW-18339) - [Python][CI] Add DYLD_LIBRARY_PATH to avoid requiring PYARROW_BUNDLE_ARROW_CPP on macOS job (#14643) +* [ARROW-18343](https://issues.apache.org/jira/browse/ARROW-18343) - [C++] Remove AllocateBitmap() with out parameter (#14657) +* [ARROW-18351](https://issues.apache.org/jira/browse/ARROW-18351) - [C++][FlightRPC] Fix crash in DoExchange with UCX (#15031) +* [ARROW-18353](https://issues.apache.org/jira/browse/ARROW-18353) - [C++][FlightRPC] Prevent concurrent Finish in UCX (#15034) +* [ARROW-18360](https://issues.apache.org/jira/browse/ARROW-18360) - [Python] Don't crash when schema=None in FlightClient.do_put (#14698) +* [ARROW-18374](https://issues.apache.org/jira/browse/ARROW-18374) - [Go][CI][Benchmarking] Fix Go benchmark github info (#14691) +* [ARROW-18374](https://issues.apache.org/jira/browse/ARROW-18374) - [Go][CI][Benchmarking] Fix Go Bench Script after Conbench change (#14689) +* [ARROW-18379](https://issues.apache.org/jira/browse/ARROW-18379) - [Python] Change warnings to _warnings in _plasma_store_entry_point (#14695) +* [ARROW-18382](https://issues.apache.org/jira/browse/ARROW-18382) - [C++] Set ADDRESS_SANITIZER in fuzzing builds (#14702) +* [ARROW-18383](https://issues.apache.org/jira/browse/ARROW-18383) - [C++] Avoid global variables for thread pools and at-fork handlers (#14704) +* [ARROW-18389](https://issues.apache.org/jira/browse/ARROW-18389) - [CI][Python] Update nightly test-conda-python-3.7-pandas-0.24 to pandas >= 1.0 (#14714) +* [ARROW-18390](https://issues.apache.org/jira/browse/ARROW-18390) - [CI][Python] Update spark test modules to match spark master (#14715) +* [ARROW-18392](https://issues.apache.org/jira/browse/ARROW-18392) - [Python] Fix test_s3fs_wrong_region; set anonymous=True (#14716) +* [ARROW-18394](https://issues.apache.org/jira/browse/ARROW-18394) - [Python][CI] Fix nightly job using pandas dev (temporarily skip tests) (#15048) +* [ARROW-18397](https://issues.apache.org/jira/browse/ARROW-18397) - [C++] Clear S3 region resolver client at S3 shutdown (#14718) +* [ARROW-18400](https://issues.apache.org/jira/browse/ARROW-18400) - [Python] Quadratic memory usage of Table.to\_pandas with nested data +* [ARROW-18405](https://issues.apache.org/jira/browse/ARROW-18405) - [Ruby] Avoid rebuilding chunked arrays in Arrow::Table.new (#14738) +* [ARROW-18412](https://issues.apache.org/jira/browse/ARROW-18412) - [C++][R] Windows build fails because of missing ChunkResolver symbols (#14774) +* [ARROW-18424](https://issues.apache.org/jira/browse/ARROW-18424) - [C++] Fix Doxygen error on ARROW_ENGINE_EXPORT (#14845) +* [ARROW-18429](https://issues.apache.org/jira/browse/ARROW-18429) - [R] : Bump dev version following 10.0.1 patch release (#14887) +* [ARROW-18436](https://issues.apache.org/jira/browse/ARROW-18436) - [C++] Ensure correct (un)escaping of special characters in URI paths (#14974) +* [ARROW-18437](https://issues.apache.org/jira/browse/ARROW-18437) - [C++][Parquet] Fix encoder for DELTA_BINARY_PACKED when flushing more than once (#14959) +* [GH-14745](https://github.com/apache/arrow/issues/14745) - [R] {rlang} dependency must be at least version 1.0.0 because of check_dots_empty (#14744) +* [GH-14775](https://github.com/apache/arrow/issues/14775) - [Go] Fix UnionBuilder.Len 
implementations (#14776) +* [GH-14780](https://github.com/apache/arrow/issues/14780) - [Go] Fix issues with IPC writing of sliced map/list arrays (#14793) +* [GH-14791](https://github.com/apache/arrow/issues/14791) - [JS] Fix BitmapBufferBuilder size truncation (#14881) +* [GH-14805](https://github.com/apache/arrow/issues/14805) - [Format] C Data Interface: clarify nullability of buffer pointers (#14808) +* [GH-14819](https://github.com/apache/arrow/issues/14819) - [CI][RPM] Add workaround for build failure on CentOS 9 Stream (#14820) +* [GH-14828](https://github.com/apache/arrow/issues/14828) - [CI][Conda] Sync with conda-forge, fix nightly jobs (#14832) +* [GH-14842](https://github.com/apache/arrow/issues/14842) - [C++] Propagate some errors in JSON chunker (#14843) +* [GH-14849](https://github.com/apache/arrow/issues/14849) - [CI] R install-local builds sometimes fail because sccache times out (#14850) +* [GH-14855](https://github.com/apache/arrow/issues/14855) - [C++] Support importing zero-case unions (#14857) +* [GH-14856](https://github.com/apache/arrow/issues/14856) - [CI] Azure builds fail with docker permission error (#14858) +* [GH-14865](https://github.com/apache/arrow/issues/14865) - [Go][Parquet] Address several memory leaks of buffers in pqarrow (#14878) +* [GH-14872](https://github.com/apache/arrow/issues/14872) - [R] arrow returns wrong variable content when multiple group_by/summarise statements are used (#14905) +* [GH-14875](https://github.com/apache/arrow/issues/14875) - [C++] C Data Interface: check imported buffer for non-null (#14814) +* [GH-14876](https://github.com/apache/arrow/issues/14876) - [Go] Handling Crashes in C Data interface (#14877) +* [GH-14883](https://github.com/apache/arrow/issues/14883) - [Go] Fix IPC encoding empty maps (#14904) +* [GH-14883](https://github.com/apache/arrow/issues/14883) - [Go] ipc.Writer leaks memory when compressing body (#14892) +* [GH-14884](https://github.com/apache/arrow/issues/14884) - [CI] R install resource may got 404 (#14893) +* [GH-14890](https://github.com/apache/arrow/issues/14890) - [Java] Fix memory leak of DictionaryEncoder when exception thrown (#14891) +* [GH-14907](https://github.com/apache/arrow/issues/14907) - [R] right_join() function does not produce the expected outcome (#15077) +* [GH-14909](https://github.com/apache/arrow/issues/14909) - [Java] Prevent potential memory leak of ListSubfieldEncoder and StructSubfieldEncoder (#14910) +* [GH-14916](https://github.com/apache/arrow/issues/14916) - [C++] Remove the API declaration about "ConcatenateBuffers" (#14915) +* [GH-14927](https://github.com/apache/arrow/issues/14927) - [Dev] Crossbow submit does not work with fine grained PATs (#14928) +* [GH-14940](https://github.com/apache/arrow/issues/14940) - [Go][Parquet] Fix Encryption Column writing (#14954) +* [GH-14943](https://github.com/apache/arrow/issues/14943) - [Python] Fix pyarrow.get_libraries() order (#14944) +* [GH-14945](https://github.com/apache/arrow/issues/14945) - [Ruby] Add support for macOS 12 / Xcode 14 (#14960) +* [GH-14947](https://github.com/apache/arrow/issues/14947) - [R] Compatibility with dplyr 1.1.0 (#14948) +* [GH-14949](https://github.com/apache/arrow/issues/14949) - [CI][Release] Output script's stdout on failure (#14957) +* [GH-14967](https://github.com/apache/arrow/issues/14967) - [R] Minimal nightly builds are failing (#14972) +* [GH-14968](https://github.com/apache/arrow/issues/14968) - [Python] Fix segfault for dataset ORC write (#15049) +* 
[GH-14990](https://github.com/apache/arrow/issues/14990) - [C++][Skyhook] Follow FileFormat API change (#15086) +* [GH-14993](https://github.com/apache/arrow/issues/14993) - [CI][Conda] Fix missing RECIPE_ROOT variable now expected by conda build (#15014) +* [GH-14995](https://github.com/apache/arrow/issues/14995) - [Go][FlightSQL] Fix Supported Unions Constant (#15003) +* [GH-15001](https://github.com/apache/arrow/issues/15001) - [R] Fix Parquet datatype test failure (#15197) +* [GH-15007](https://github.com/apache/arrow/issues/15007) - [CI][RPM] Ignore import failed key (#15008) +* [GH-15023](https://github.com/apache/arrow/issues/15023) - [CI][Packaging][Java] Force to use libz3.a with Homebrew (#15024) +* [GH-15025](https://github.com/apache/arrow/issues/15025) - [CI][C++][Homebrew] Ensure removing Python related commands (#15026) +* [GH-15028](https://github.com/apache/arrow/issues/15028) - [R][Docs] `NOT_CRAN` should be `"true"` instead of `TRUE` in R (#15029) +* [GH-15040](https://github.com/apache/arrow/issues/15040) - [C++] Improve pkg-config support for ARROW_BUILD_SHARED=OFF (#15075) +* [GH-15042](https://github.com/apache/arrow/issues/15042) - [C++][Parquet] Update stats on subsequent batches of dictionaries (#15179) +* [GH-15043](https://github.com/apache/arrow/issues/15043) - [Python][Docs] Update docstring for pyarrow.decompress (#15061) +* [GH-15052](https://github.com/apache/arrow/issues/15052) - [C++][Parquet] Fix DELTA_BINARY_PACKED decoder when reading only one value (#15124) +* [GH-15062](https://github.com/apache/arrow/issues/15062) - [C++] Simplify EnumParser behavior (#15063) +* [GH-15064](https://github.com/apache/arrow/issues/15064) - [Python][CI] Dask nightly tests are failing due to fsspec bug (#15065) +* [GH-15069](https://github.com/apache/arrow/issues/15069) - [C++][Python][FlightRPC] Make DoAction truly streaming (#15118) +* [GH-15080](https://github.com/apache/arrow/issues/15080) - [CI][R] Re-enable binary package job for R 4.1 on Windows (#25359) +* [GH-15092](https://github.com/apache/arrow/issues/15092) - [CI][C++][Homebrew] Ensure removing Python related commands (again) (#15093) +* [GH-15094](https://github.com/apache/arrow/issues/15094) - [CI][Release][Ruby] Install Bundler by APT (#15095) +* [GH-15110](https://github.com/apache/arrow/issues/15110) - [R][CI] Windows build fails in packaging job (#15111) +* [GH-15114](https://github.com/apache/arrow/issues/15114) - [R][C++][CI] Homebrew can't install Python 3.11 on GHA runners (#15116) +* [GH-15115](https://github.com/apache/arrow/issues/15115) - [R][CI] pyarrow tests fail on macos 10.13 due to missing pyarrow wheel (#15117) +* [GH-15122](https://github.com/apache/arrow/issues/15122) - [Benchmarking][Python] Set ARROW_INSTALL_NAME_RPATH=ON for benchmark builds (#15123) +* [GH-15126](https://github.com/apache/arrow/issues/15126) - [R] purrr::rerun was deprecated in purrr 1.0.0 (#15127) +* [GH-15136](https://github.com/apache/arrow/issues/15136) - [Python][macOS] Use `@rpath` for libarrow_python.dylib (#15143) +* [GH-15141](https://github.com/apache/arrow/issues/15141) - [C++] fix for unstable test due to unstable sort (#15142) +* [GH-15150](https://github.com/apache/arrow/issues/15150) - [C++][FlightRPC] Wait for side effects in DoAction (#15152) +* [GH-15156](https://github.com/apache/arrow/issues/15156) - [JS] Fix can't find variable: BigInt64Array (#15157) +* [GH-15172](https://github.com/apache/arrow/issues/15172) - [Python] Docstring test failure (#15186) +* 
[GH-15176](https://github.com/apache/arrow/issues/15176) - Fix various issues introduced in the asof-join benchmark by ARROW-17980 and ARROW-15732 (#15190) +* [GH-15189](https://github.com/apache/arrow/issues/15189) - [R] Skip S3 tests on MacOS 10.13 (#33613) +* [GH-15243](https://github.com/apache/arrow/issues/15243) - [C++] fix for potential deadlock in the group-by node (#33700) +* [GH-15254](https://github.com/apache/arrow/issues/15254) - [GLib] garrow_execute_plain_wait() checks the finished status (#15255) +* [GH-15259](https://github.com/apache/arrow/issues/15259) - [CI] component assignment fails due to typo (#15260) +* [GH-15264](https://github.com/apache/arrow/issues/15264) - [C++] Add scanner tests for disabling readahead and fix relevant bugs (#29185) +* [GH-15274](https://github.com/apache/arrow/issues/15274) - [Java][FlightRPC] handle null keystore password (#15276) +* [GH-15282](https://github.com/apache/arrow/issues/15282) - [CI][C++] add CLANG_TOOLS variable in .travis.yaml (#32972) +* [GH-15292](https://github.com/apache/arrow/issues/15292) - [C++] Typeclass alias is missing in ExtensionArray (#15293) +* [GH-25633](https://github.com/apache/arrow/issues/25633) - [CI][Java][macOS] Ensure using bundled RE2 (#33711) +* [GH-26209](https://github.com/apache/arrow/issues/26209) - [Ruby] Add support for Ruby 2.5 (#33602) +* [GH-26394](https://github.com/apache/arrow/issues/26394) - [Python] Don't use target_include_directories() for imported target (#33606) +* [GH-33626](https://github.com/apache/arrow/issues/33626) - [Packaging][RPM] Don't remove metadata for non-target arch (#33672) +* [GH-33638](https://github.com/apache/arrow/issues/33638) - [C++] Removing ExecPlan::Make deprecation warning (#33658) +* [GH-33643](https://github.com/apache/arrow/issues/33643) - [C++] Remove implicit = capture of this which is not valid in c++20 (#33644) +* [GH-33666](https://github.com/apache/arrow/issues/33666) - [R] Remove extraneous argument to semi_join (#33693) +* [GH-33667](https://github.com/apache/arrow/issues/33667) - [C++][CI] Use Ubuntu 22.04 for ASAN (#33669) +* [GH-33687](https://github.com/apache/arrow/issues/33687) - [Dev] Fix commit message generation in merge script (#33691) +* [GH-33705](https://github.com/apache/arrow/issues/33705) - [R] Fix link on README (#33706) + + + # Apache Arrow 6.0.1 (2021-11-18) ## Bug Fixes diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 55e9891945d..a1c473a24bf 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -21,42 +21,35 @@ ## Did you find a bug? -The Arrow project uses JIRA as a bug tracker. To report a bug, you'll have -to first create an account on the -[Apache Foundation JIRA](https://issues.apache.org/jira/). The JIRA server -hosts bugs and issues for multiple Apache projects. The JIRA project name -for Arrow is "ARROW". - -To be assigned to an issue, ask an Arrow JIRA admin to go to -[Arrow Roles](https://issues.apache.org/jira/plugins/servlet/project-config/ARROW/roles), -click "Add users to a role," and add you to the "Contributor" role. Most -committers are authorized to do this; if you're a committer and aren't -able to load that project admin page, have someone else add you to the -necessary role. - -Before you create a new bug entry, we recommend you first -[search](https://issues.apache.org/jira/projects/ARROW/issues/ARROW-5140?filter=allopenissues) -among existing Arrow issues. - -When you create a new JIRA entry, please don't forget to fill the "Component" -field. 
Arrow has many subcomponents and this helps triaging and filtering -tremendously. Also, we conventionally prefix the issue title with the component -name in brackets, such as "[C++] Crash in Array::Frobnicate()", so as to make -lists more easy to navigate, and we'd be grateful if you did the same. +The Arrow project uses GitHub as a bug tracker. To report a bug, sign in to +your GitHub account, navigate to [GitHub issues](https://github.com/apache/arrow/issues) +and click on **New issue**. + +To be assigned to an issue, add a comment "take" to that issue. + +Before you create a new bug entry, we recommend you first search among existing +Arrow issues in +[Jira](https://issues.apache.org/jira/issues/?jql=project%20%3D%20ARROW%20AND%20status%20%3D%20Open) +or [GitHub](https://github.com/apache/arrow/issues). + +We conventionally prefix the issue title with the component +name in brackets, such as "[C++][Python] Ensure no validity bitmap in +UnionArray::SetData", so as to make lists easier to navigate, and +we'd be grateful if you did the same. ## Did you write a patch that fixes a bug or brings an improvement? -First create a JIRA entry as described above. Then, submit your changes -as a GitHub Pull Request. We'll ask you to prefix the pull request title -with the JIRA issue number and the component name in brackets. -(for example: "ARROW-2345: [C++] Fix crash in Array::Frobnicate()"). -Respecting this convention makes it easier for us to process the backlog -of submitted Pull Requests. +First create a GitHub issue as described above, selecting **Bug Report** or +**Enhancement Request**. Then, submit your changes as a GitHub Pull Request. +We'll ask you to prefix the pull request title with the GitHub issue number +and the component name in brackets. (for example: "GH-14736: [C++][Python] +Ensure no validity bitmap in UnionArray::SetData"). Respecting this convention +makes it easier for us to process the backlog of submitted Pull Requests. ### Minor Fixes -Any functionality change should have a JIRA opened. For minor changes that -affect documentation, you do not need to open up a JIRA. Instead you can +Any functionality change should have a GitHub issue opened. For minor changes that +affect documentation, you do not need to open up a GitHub issue. Instead you can prefix the title of your PR with "MINOR: " if it meets the following guidelines: * Grammar, usage and spelling fixes that affect no more than 2 files diff --git a/LICENSE.txt b/LICENSE.txt index 843cf4f6a5e..86cfaf546ca 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -653,34 +653,6 @@ SOFTWARE. 
-------------------------------------------------------------------------------- -The file cpp/src/arrow/vendored/string_view.hpp has the following license - -Boost Software License - Version 1.0 - August 17th, 2003 - -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. - --------------------------------------------------------------------------------- - The files in cpp/src/arrow/vendored/xxhash/ have the following license (BSD 2-Clause License) @@ -1990,12 +1962,14 @@ for PyArrow. Ibis is released under the Apache License, Version 2.0. This project includes code from the autobrew project. -* r/tools/autobrew and dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb - are based on code from the autobrew project. +The following files are based on code from the autobrew project: +* r/tools/autobrew +* dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb +* dev/tasks/homebrew-formulae/autobrew/apache-arrow-static.rb Copyright (c) 2019, Jeroen Ooms License: MIT -Homepage: https://github.com/jeroen/autobrew +Homepage: https://github.com/autobrew/ -------------------------------------------------------------------------------- @@ -2057,34 +2031,6 @@ René Nyffenegger rene.nyffenegger@adp-gmbh.ch -------------------------------------------------------------------------------- -The file cpp/src/arrow/vendored/optional.hpp has the following license - -Boost Software License - Version 1.0 - August 17th, 2003 - -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. - --------------------------------------------------------------------------------- - This project includes code from Folly. * cpp/src/arrow/vendored/ProducerConsumerQueue.h diff --git a/README.md b/README.md index 7d10b81c6e4..9c4c143e6ca 100644 --- a/README.md +++ b/README.md @@ -93,12 +93,12 @@ integrations in other projects, we'd be happy to have you involved: - Join the mailing list: send an email to [dev-subscribe@arrow.apache.org][1]. Share your ideas and use cases for the project. -- [Follow our activity on JIRA][3] +- Follow our activity on [GitHub issues][3] - [Learn the format][2] - Contribute code to one of the reference implementations [1]: mailto:dev-subscribe@arrow.apache.org [2]: https://github.com/apache/arrow/tree/master/format -[3]: https://issues.apache.org/jira/browse/ARROW +[3]: https://github.com/apache/arrow/issues [4]: https://github.com/apache/arrow [5]: https://github.com/apache/arrow/blob/master/docs/source/developers/contributing.rst diff --git a/appveyor.yml b/appveyor.yml index 03a3597c9b7..fafc6952d87 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -16,7 +16,7 @@ # under the License. # Operating system (build VM template) -os: Visual Studio 2017 +os: Visual Studio 2019 only_commits: # Skip commits not related to Python or C++ @@ -29,42 +29,24 @@ only_commits: - python/ cache: - - C:\Users\Appveyor\clcache1 + - C:\Users\appveyor\AppData\Local\ccache matrix: fast_finish: true environment: global: - # Make these variables visible in all jobs and build steps - MSVC_DEFAULT_OPTIONS: ON APPVEYOR_SAVE_CACHE_ON_ERROR: true - # Change the clcache dir to reset caches everywhere when a setting - # is changed incompatibly (e.g. CLCACHE_COMPRESS). - CLCACHE_DIR: C:\Users\Appveyor\clcache1 - CLCACHE_SERVER: 1 - CLCACHE_COMPRESS: 1 - CLCACHE_COMPRESSLEVEL: 6 - ARROW_BUILD_FLIGHT: "OFF" - ARROW_BUILD_FLIGHT_SQL: "OFF" - ARROW_BUILD_GANDIVA: "OFF" - ARROW_LLVM_VERSION: "7.0.*" - ARROW_S3: "OFF" - PYTHON: "3.8" - ARCH: "64" + MSVC_DEFAULT_OPTIONS: ON - matrix: - # NOTE: clcache seems to work best with Ninja and worst with msbuild - # (as generated by cmake) - - JOB: "Toolchain" - GENERATOR: Ninja - ARROW_GCS: "ON" - ARROW_S3: "ON" - ARROW_BUILD_FLIGHT: "ON" - ARROW_BUILD_FLIGHT_SQL: "ON" - ARROW_BUILD_GANDIVA: "ON" - - JOB: "Build_Debug" - GENERATOR: Ninja + ARCH: "64" + ARROW_BUILD_FLIGHT: "ON" + ARROW_BUILD_FLIGHT_SQL: "ON" + ARROW_BUILD_GANDIVA: "ON" + ARROW_GCS: "ON" + ARROW_S3: "ON" + GENERATOR: Ninja + PYTHON: "3.10" before_build: - call ci\appveyor-cpp-setup.bat @@ -76,4 +58,4 @@ build_script: test: off after_build: - - clcache -s + - ccache -s diff --git a/c_glib/Brewfile b/c_glib/Brewfile index b743508f400..5ab50203696 100644 --- a/c_glib/Brewfile +++ b/c_glib/Brewfile @@ -15,8 +15,6 @@ # specific language governing permissions and limitations # under the License. 
-brew "autoconf-archive" -brew "glib-utils" brew "gobject-introspection" brew "gtk-doc" brew "libtool" diff --git a/c_glib/arrow-dataset-glib/dataset-factory.cpp b/c_glib/arrow-dataset-glib/dataset-factory.cpp index 1e532760a27..97cab555420 100644 --- a/c_glib/arrow-dataset-glib/dataset-factory.cpp +++ b/c_glib/arrow-dataset-glib/dataset-factory.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -33,6 +34,8 @@ G_BEGIN_DECLS * @title: Dataset factory related classes * @include: arrow-dataset-glib/arrow-dataset-glib.h * + * #GADatasetFinishOptions is a class for gadataset_factory_finish(). + * * #GADatasetDatasetFactory is a base class for dataset factories. * * #GADatasetFileSystemDatasetFactory is a class for @@ -41,6 +44,203 @@ G_BEGIN_DECLS * Since: 5.0.0 */ +struct GADatasetFinishOptionsPrivate { + arrow::dataset::FinishOptions options; + GArrowSchema *schema; +}; + +enum { + PROP_FINISH_OPTIONS = 1, + PROP_SCHEMA, + PROP_INSPECT_N_FRAGMENTS, + PROP_VALIDATE_FRAGMENTS, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFinishOptions, + gadataset_finish_options, + G_TYPE_OBJECT) + +#define GADATASET_FINISH_OPTIONS_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_finish_options_get_instance_private( \ + GADATASET_FINISH_OPTIONS(obj))) + +static void +gadataset_finish_options_finalize(GObject *object) +{ + auto priv = GADATASET_FINISH_OPTIONS_GET_PRIVATE(object); + priv->options.~FinishOptions(); + G_OBJECT_CLASS(gadataset_finish_options_parent_class)->finalize(object); +} + +static void +gadataset_finish_options_dispose(GObject *object) +{ + auto priv = GADATASET_FINISH_OPTIONS_GET_PRIVATE(object); + if (priv->schema) { + g_object_unref(priv->schema); + priv->schema = nullptr; + } + G_OBJECT_CLASS(gadataset_finish_options_parent_class)->dispose(object); +} + +static void +gadataset_finish_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FINISH_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FINISH_OPTIONS: + { + auto arrow_finish_options = + static_cast(g_value_get_pointer(value)); + if (arrow_finish_options) { + priv->options = *arrow_finish_options; + if (priv->options.schema) { + priv->schema = garrow_schema_new_raw(&(priv->options.schema)); + } + } + } + break; + case PROP_SCHEMA: + if (priv->schema != g_value_get_object(value)) { + auto schema_previous = priv->schema; + auto schema = g_value_dup_object(value); + if (schema) { + priv->schema = GARROW_SCHEMA(schema); + priv->options.schema = garrow_schema_get_raw(priv->schema); + } else { + priv->schema = nullptr; + priv->options.schema = nullptr; + } + if (schema_previous) { + g_object_unref(schema_previous); + } + } + break; + case PROP_INSPECT_N_FRAGMENTS: + priv->options.inspect_options.fragments = g_value_get_int(value); + break; + case PROP_VALIDATE_FRAGMENTS: + priv->options.validate_fragments = g_value_get_boolean(value); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_finish_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FINISH_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_SCHEMA: + g_value_set_object(value, priv->schema); + break; + case PROP_INSPECT_N_FRAGMENTS: + g_value_set_int(value, priv->options.inspect_options.fragments); + break; + case PROP_VALIDATE_FRAGMENTS: + g_value_set_boolean(value, priv->options.validate_fragments); + break; + default: + 
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_finish_options_init(GADatasetFinishOptions *object) +{ + auto priv = GADATASET_FINISH_OPTIONS_GET_PRIVATE(object); + new(&priv->options) arrow::dataset::FinishOptions; +} + +static void +gadataset_finish_options_class_init(GADatasetFinishOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = gadataset_finish_options_finalize; + gobject_class->dispose = gadataset_finish_options_dispose; + gobject_class->set_property = gadataset_finish_options_set_property; + gobject_class->get_property = gadataset_finish_options_get_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("finish-options", + "Finish options", + "The raw arrow::dataset::FinishOptions *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FINISH_OPTIONS, spec); + + /** + * GADatasetFinishOptions:schema: + * + * The schema to finalize the dataset's schema. + * + * Since: 11.0.0 + */ + spec = g_param_spec_object("schema", + "Schema", + "The schema to finalize the dataset's schema", + GARROW_TYPE_SCHEMA, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_SCHEMA, spec); + + arrow::dataset::FinishOptions finish_options; + /** + * GADatasetFinishOptions:inspect-n-fragments: + * + * The number of fragments to be used to inspect schema. + * + * Since: 11.0.0 + */ + spec = g_param_spec_int("inspect-n-fragments", + "Inspect N fragments", + "The number of fragments to be used to inspect schema", + arrow::dataset::InspectOptions::kInspectAllFragments, + G_MAXINT, + finish_options.inspect_options.fragments, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_INSPECT_N_FRAGMENTS, spec); + + /** + * GADatasetFinishOptions:validate-fragments: + * + * Whether validate fragments against the given schema or not. + * + * Since: 11.0.0 + */ + spec = g_param_spec_boolean("validate-fragments", + "Validate fragments", + "Whether validate fragments or not", + finish_options.validate_fragments, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_VALIDATE_FRAGMENTS, spec); +} + +/** + * gadataset_finish_options_new: + * + * Returns: A newly created #GADatasetDataset. + * + * Since: 11.0.0 + */ +GADatasetFinishOptions * +gadataset_finish_options_new(void) +{ + return gadataset_finish_options_new_raw(nullptr); +} + + typedef struct GADatasetDatasetFactoryPrivate_ { std::shared_ptr factory; } GADatasetDatasetFactoryPrivate; @@ -118,6 +318,7 @@ gadataset_dataset_factory_class_init(GADatasetDatasetFactoryClass *klass) /** * gadataset_dataset_factory_finish: * @factory: A #GADatasetDatasetFactory. + * @options: (nullable): A #GADatasetFinishOptions. * @error: (nullable): Return location for a #GError or %NULL. 
* * Returns: (transfer full) (nullable): @@ -127,10 +328,15 @@ gadataset_dataset_factory_class_init(GADatasetDatasetFactoryClass *klass) */ GADatasetDataset * gadataset_dataset_factory_finish(GADatasetDatasetFactory *factory, + GADatasetFinishOptions *options, GError **error) { auto arrow_factory = gadataset_dataset_factory_get_raw(factory); - auto arrow_dataset_result = arrow_factory->Finish(); + arrow::dataset::FinishOptions arrow_options; + if (options) { + arrow_options = *gadataset_finish_options_get_raw(options); + } + auto arrow_dataset_result = arrow_factory->Finish(arrow_options); if (garrow::check(error, arrow_dataset_result, "[dataset-factory][finish]")) { auto arrow_dataset = *arrow_dataset_result; return gadataset_dataset_new_raw(&arrow_dataset); @@ -474,6 +680,7 @@ gadataset_file_system_dataset_factory_add_path( /** * gadataset_file_system_dataset_factory_finish: * @factory: A #GADatasetFileSystemDatasetFactory. + * @options: (nullable): A #GADatasetFinishOptions. * @error: (nullable): Return location for a #GError or %NULL. * * Returns: (transfer full) (nullable): @@ -484,6 +691,7 @@ gadataset_file_system_dataset_factory_add_path( GADatasetFileSystemDataset * gadataset_file_system_dataset_factory_finish( GADatasetFileSystemDatasetFactory *factory, + GADatasetFinishOptions *options, GError **error) { const gchar *context = "[file-system-dataset-factory][finish]"; @@ -527,7 +735,11 @@ gadataset_file_system_dataset_factory_finish( if (!garrow::check(error, arrow_factory_result, context)) { return NULL; } - auto arrow_dataset_result = (*arrow_factory_result)->Finish(); + arrow::dataset::FinishOptions arrow_options; + if (options) { + arrow_options = *gadataset_finish_options_get_raw(options); + } + auto arrow_dataset_result = (*arrow_factory_result)->Finish(arrow_options); if (!garrow::check(error, arrow_dataset_result, context)) { return NULL; } @@ -544,6 +756,21 @@ gadataset_file_system_dataset_factory_finish( G_END_DECLS +GADatasetFinishOptions * +gadataset_finish_options_new_raw(arrow::dataset::FinishOptions *options) +{ + return GADATASET_FINISH_OPTIONS(g_object_new(GADATASET_TYPE_FINISH_OPTIONS, + "finish-options", options, + NULL)); +} + +arrow::dataset::FinishOptions * +gadataset_finish_options_get_raw(GADatasetFinishOptions *options) +{ + auto priv = GADATASET_FINISH_OPTIONS_GET_PRIVATE(options); + return &(priv->options); +} + std::shared_ptr gadataset_dataset_factory_get_raw(GADatasetDatasetFactory *factory) { diff --git a/c_glib/arrow-dataset-glib/dataset-factory.h b/c_glib/arrow-dataset-glib/dataset-factory.h index e2ee3ed9806..292a9ca70dd 100644 --- a/c_glib/arrow-dataset-glib/dataset-factory.h +++ b/c_glib/arrow-dataset-glib/dataset-factory.h @@ -23,6 +23,21 @@ G_BEGIN_DECLS +#define GADATASET_TYPE_FINISH_OPTIONS (gadataset_finish_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFinishOptions, + gadataset_finish_options, + GADATASET, + FINISH_OPTIONS, + GObject) +struct _GADatasetFinishOptionsClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_11_0 +GADatasetFinishOptions * +gadataset_finish_options_new(void); + #define GADATASET_TYPE_DATASET_FACTORY (gadataset_dataset_factory_get_type()) G_DECLARE_DERIVABLE_TYPE(GADatasetDatasetFactory, gadataset_dataset_factory, @@ -37,6 +52,7 @@ struct _GADatasetDatasetFactoryClass GARROW_AVAILABLE_IN_5_0 GADatasetDataset * gadataset_dataset_factory_finish(GADatasetDatasetFactory *factory, + GADatasetFinishOptions *options, GError **error); @@ -92,6 +108,7 @@ GARROW_AVAILABLE_IN_5_0 GADatasetFileSystemDataset 
* gadataset_file_system_dataset_factory_finish( GADatasetFileSystemDatasetFactory *factory, + GADatasetFinishOptions *options, GError **error); diff --git a/c_glib/arrow-dataset-glib/dataset-factory.hpp b/c_glib/arrow-dataset-glib/dataset-factory.hpp index 114db35bc59..6ff68945ad1 100644 --- a/c_glib/arrow-dataset-glib/dataset-factory.hpp +++ b/c_glib/arrow-dataset-glib/dataset-factory.hpp @@ -23,5 +23,10 @@ #include +GADatasetFinishOptions * +gadataset_finish_options_new_raw(arrow::dataset::FinishOptions *arrow_options); +arrow::dataset::FinishOptions * +gadataset_finish_options_get_raw(GADatasetFinishOptions *options); + std::shared_ptr gadataset_dataset_factory_get_raw(GADatasetDatasetFactory *factory); diff --git a/c_glib/arrow-dataset-glib/partitioning.cpp b/c_glib/arrow-dataset-glib/partitioning.cpp index bce33671a35..296895ebaab 100644 --- a/c_glib/arrow-dataset-glib/partitioning.cpp +++ b/c_glib/arrow-dataset-glib/partitioning.cpp @@ -32,67 +32,80 @@ G_BEGIN_DECLS * @title: Partitioning classes * @include: arrow-dataset-glib/arrow-dataset-glib.h * - * #GADatasetPartitioningOptions is a class for partitioning options. + * #GADatasetPartitioningFactoryOptions is a class for partitioning + * factory options. * * #GADatasetPartitioning is a base class for partitioning classes * such as #GADatasetDirectoryPartitioning. * + * #GADatasetDefaultPartitioning is a class for partitioning that + * doesn't partition. + * + * #GADatasetKeyValuePartitioningOptions is a class for key-value + * partitioning options. + * * #GADatasetKeyValuePartitioning is a base class for key-value style * partitioning classes such as #GADatasetDirectoryPartitioning. * * #GADatasetDirectoryPartitioning is a class for partitioning that * uses directory structure. * + * #GADatasetHivePartitioningOptions is a class for Hive-style + * partitioning options. + * + * #GADatasetHivePartitioning is a class for partitioning that + * uses Hive-style partitioning. 
+ * * Since: 6.0.0 */ -typedef struct GADatasetPartitioningOptionsPrivate_ { +struct GADatasetPartitioningFactoryOptionsPrivate { gboolean infer_dictionary; GArrowSchema *schema; GADatasetSegmentEncoding segment_encoding; -} GADatasetPartitioningOptionsPrivate; +}; enum { - PROP_INFER_DICTIONARY = 1, - PROP_SCHEMA, - PROP_SEGMENT_ENCODING, + PROP_FACTORY_OPTIONS_INFER_DICTIONARY = 1, + PROP_FACTORY_OPTIONS_SCHEMA, + PROP_FACTORY_OPTIONS_SEGMENT_ENCODING, }; -G_DEFINE_TYPE_WITH_PRIVATE(GADatasetPartitioningOptions, - gadataset_partitioning_options, +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetPartitioningFactoryOptions, + gadataset_partitioning_factory_options, G_TYPE_OBJECT) -#define GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(obj) \ - static_cast( \ - gadataset_partitioning_options_get_instance_private( \ - GADATASET_PARTITIONING_OPTIONS(obj))) +#define GADATASET_PARTITIONING_FACTORY_OPTIONS_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_partitioning_factory_options_get_instance_private( \ + GADATASET_PARTITIONING_FACTORY_OPTIONS(obj))) static void -gadataset_partitioning_options_dispose(GObject *object) +gadataset_partitioning_factory_options_dispose(GObject *object) { - auto priv = GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(object); + auto priv = GADATASET_PARTITIONING_FACTORY_OPTIONS_GET_PRIVATE(object); if (priv->schema) { g_object_unref(priv->schema); priv->schema = nullptr; } - G_OBJECT_CLASS(gadataset_partitioning_options_parent_class)->dispose(object); + G_OBJECT_CLASS(gadataset_partitioning_factory_options_parent_class)->dispose(object); } static void -gadataset_partitioning_options_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) +gadataset_partitioning_factory_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) { - auto priv = GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(object); + auto priv = GADATASET_PARTITIONING_FACTORY_OPTIONS_GET_PRIVATE(object); switch (prop_id) { - case PROP_INFER_DICTIONARY: + case PROP_FACTORY_OPTIONS_INFER_DICTIONARY: priv->infer_dictionary = g_value_get_boolean(value); break; - case PROP_SCHEMA: + case PROP_FACTORY_OPTIONS_SCHEMA: { auto schema = g_value_get_object(value); if (priv->schema == schema) { @@ -103,14 +116,14 @@ gadataset_partitioning_options_set_property(GObject *object, g_object_ref(schema); priv->schema = GARROW_SCHEMA(schema); } else { - priv->schema = NULL; + priv->schema = nullptr; } if (old_schema) { g_object_unref(old_schema); } } break; - case PROP_SEGMENT_ENCODING: + case PROP_FACTORY_OPTIONS_SEGMENT_ENCODING: priv->segment_encoding = static_cast(g_value_get_enum(value)); break; @@ -121,21 +134,21 @@ gadataset_partitioning_options_set_property(GObject *object, } static void -gadataset_partitioning_options_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) +gadataset_partitioning_factory_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) { - auto priv = GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(object); + auto priv = GADATASET_PARTITIONING_FACTORY_OPTIONS_GET_PRIVATE(object); switch (prop_id) { - case PROP_INFER_DICTIONARY: + case PROP_FACTORY_OPTIONS_INFER_DICTIONARY: g_value_set_boolean(value, priv->infer_dictionary); break; - case PROP_SCHEMA: + case PROP_FACTORY_OPTIONS_SCHEMA: g_value_set_object(value, priv->schema); break; - case PROP_SEGMENT_ENCODING: + case PROP_FACTORY_OPTIONS_SEGMENT_ENCODING: g_value_set_enum(value, priv->segment_encoding); break; default: @@ 
-145,24 +158,27 @@ gadataset_partitioning_options_get_property(GObject *object, } static void -gadataset_partitioning_options_init(GADatasetPartitioningOptions *object) +gadataset_partitioning_factory_options_init( + GADatasetPartitioningFactoryOptions *object) { } static void -gadataset_partitioning_options_class_init( - GADatasetPartitioningOptionsClass *klass) +gadataset_partitioning_factory_options_class_init( + GADatasetPartitioningFactoryOptionsClass *klass) { auto gobject_class = G_OBJECT_CLASS(klass); - gobject_class->dispose = gadataset_partitioning_options_dispose; - gobject_class->set_property = gadataset_partitioning_options_set_property; - gobject_class->get_property = gadataset_partitioning_options_get_property; + gobject_class->dispose = gadataset_partitioning_factory_options_dispose; + gobject_class->set_property = + gadataset_partitioning_factory_options_set_property; + gobject_class->get_property = + gadataset_partitioning_factory_options_get_property; arrow::dataset::PartitioningFactoryOptions default_options; GParamSpec *spec; /** - * GADatasetPartitioningOptions:infer-dictionary: + * GADatasetPartitioningFactoryOptions:infer-dictionary: * * When inferring a schema for partition fields, yield dictionary * encoded types instead of plain. This can be more efficient when @@ -170,7 +186,7 @@ gadataset_partitioning_options_class_init( * finished Partitioning will include dictionaries of all unique * inspected values for each field. * - * Since: 6.0.0 + * Since: 11.0.0 */ spec = g_param_spec_boolean("infer-dictionary", "Infer dictionary", @@ -178,16 +194,18 @@ gadataset_partitioning_options_class_init( "dictionary", default_options.infer_dictionary, static_cast(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, PROP_INFER_DICTIONARY, spec); + g_object_class_install_property(gobject_class, + PROP_FACTORY_OPTIONS_INFER_DICTIONARY, + spec); /** - * GADatasetPartitioningOptions:schema: + * GADatasetPartitioningFactoryOptions:schema: * * Optionally, an expected schema can be provided, in which case * inference will only check discovered fields against the schema * and update internal state (such as dictionaries). * - * Since: 6.0.0 + * Since: 11.0.0 */ spec = g_param_spec_object("schema", "Schema", @@ -195,15 +213,17 @@ gadataset_partitioning_options_class_init( "against the schema and update internal state", GARROW_TYPE_SCHEMA, static_cast(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, PROP_SCHEMA, spec); + g_object_class_install_property(gobject_class, + PROP_FACTORY_OPTIONS_SCHEMA, + spec); /** - * GADatasetPartitioningOptions:segment-encoding: + * GADatasetPartitioningFactoryOptions:segment-encoding: * * After splitting a path into components, decode the path * components before parsing according to this scheme. * - * Since: 6.0.0 + * Since: 11.0.0 */ spec = g_param_spec_enum("segment-encoding", "Segment encoding", @@ -214,36 +234,38 @@ gadataset_partitioning_options_class_init( static_cast( default_options.segment_encoding), static_cast(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, PROP_SEGMENT_ENCODING, spec); + g_object_class_install_property(gobject_class, + PROP_FACTORY_OPTIONS_SEGMENT_ENCODING, + spec); } /** - * gadataset_partitioning_options_new: + * gadataset_partitioning_factory_options_new: * - * Returns: The newly created #GADatasetPartitioningOptions. + * Returns: The newly created #GADatasetPartitioningFactoryOptions. 
* - * Since: 6.0.0 + * Since: 11.0.0 */ -GADatasetPartitioningOptions * -gadataset_partitioning_options_new(void) +GADatasetPartitioningFactoryOptions * +gadataset_partitioning_factory_options_new(void) { - return GADATASET_PARTITIONING_OPTIONS( - g_object_new(GADATASET_TYPE_PARTITIONING_OPTIONS, - NULL)); + return GADATASET_PARTITIONING_FACTORY_OPTIONS( + g_object_new(GADATASET_TYPE_PARTITIONING_FACTORY_OPTIONS, + nullptr)); } -typedef struct GADatasetPartitioningPrivate_ { +struct GADatasetPartitioningPrivate { std::shared_ptr partitioning; -} GADatasetPartitioningPrivate; +}; enum { PROP_PARTITIONING = 1, }; -G_DEFINE_TYPE_WITH_PRIVATE(GADatasetPartitioning, - gadataset_partitioning, - G_TYPE_OBJECT) +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADatasetPartitioning, + gadataset_partitioning, + G_TYPE_OBJECT) #define GADATASET_PARTITIONING_GET_PRIVATE(obj) \ static_cast( \ @@ -303,24 +325,6 @@ gadataset_partitioning_class_init(GADatasetPartitioningClass *klass) g_object_class_install_property(gobject_class, PROP_PARTITIONING, spec); } -/** - * gadataset_partitioning_new: - * - * Returns: The newly created #GADatasetPartitioning that doesn't - * partition. - * - * Since: 6.0.0 - */ -GADatasetPartitioning * -gadataset_partitioning_new(void) -{ - auto arrow_partitioning = arrow::dataset::Partitioning::Default(); - return GADATASET_PARTITIONING( - g_object_new(GADATASET_TYPE_PARTITIONING, - "partitioning", &arrow_partitioning, - NULL)); -} - /** * gadataset_partitioning_get_type_name: * @partitioning: A #GADatasetPartitioning. @@ -341,10 +345,153 @@ gadataset_partitioning_get_type_name(GADatasetPartitioning *partitioning) } -G_DEFINE_TYPE(GADatasetKeyValuePartitioning, - gadataset_key_value_partitioning, +G_DEFINE_TYPE(GADatasetDefaultPartitioning, + gadataset_default_partitioning, GADATASET_TYPE_PARTITIONING) +static void +gadataset_default_partitioning_init(GADatasetDefaultPartitioning *object) +{ +} + +static void +gadataset_default_partitioning_class_init( + GADatasetDefaultPartitioningClass *klass) +{ +} + +/** + * gadataset_default_partitioning_new: + * + * Returns: The newly created #GADatasetDefaultPartitioning that + * doesn't partition. 
+ * + * Since: 11.0.0 + */ +GADatasetDefaultPartitioning * +gadataset_default_partitioning_new(void) +{ + auto arrow_partitioning = arrow::dataset::Partitioning::Default(); + return GADATASET_DEFAULT_PARTITIONING( + gadataset_partitioning_new_raw(&arrow_partitioning)); +} + + +struct GADatasetKeyValuePartitioningOptionsPrivate { + GADatasetSegmentEncoding segment_encoding; +}; + +enum { + PROP_OPTIONS_SEGMENT_ENCODING = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetKeyValuePartitioningOptions, + gadataset_key_value_partitioning_options, + G_TYPE_OBJECT) + +#define GADATASET_KEY_VALUE_PARTITIONING_OPTIONS_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_key_value_partitioning_options_get_instance_private( \ + GADATASET_KEY_VALUE_PARTITIONING_OPTIONS(obj))) + +static void +gadataset_key_value_partitioning_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_KEY_VALUE_PARTITIONING_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_OPTIONS_SEGMENT_ENCODING: + priv->segment_encoding = + static_cast(g_value_get_enum(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_key_value_partitioning_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_KEY_VALUE_PARTITIONING_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_OPTIONS_SEGMENT_ENCODING: + g_value_set_enum(value, priv->segment_encoding); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_key_value_partitioning_options_init( + GADatasetKeyValuePartitioningOptions *object) +{ +} + +static void +gadataset_key_value_partitioning_options_class_init( + GADatasetKeyValuePartitioningOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->set_property = + gadataset_key_value_partitioning_options_set_property; + gobject_class->get_property = + gadataset_key_value_partitioning_options_get_property; + + arrow::dataset::KeyValuePartitioningOptions default_options; + GParamSpec *spec; + /** + * GADatasetKeyValuePartitioningOptions:segment-encoding: + * + * After splitting a path into components, decode the path + * components before parsing according to this scheme. + * + * Since: 11.0.0 + */ + spec = g_param_spec_enum("segment-encoding", + "Segment encoding", + "After splitting a path into components, " + "decode the path components before " + "parsing according to this scheme", + GADATASET_TYPE_SEGMENT_ENCODING, + static_cast( + default_options.segment_encoding), + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, + PROP_OPTIONS_SEGMENT_ENCODING, + spec); +} + +/** + * gadataset_key_value_partitioning_options_new: + * + * Returns: The newly created #GADatasetKeyValuePartitioningOptions. 
+ * + * Since: 11.0.0 + */ +GADatasetKeyValuePartitioningOptions * +gadataset_key_value_partitioning_options_new(void) +{ + return GADATASET_KEY_VALUE_PARTITIONING_OPTIONS( + g_object_new(GADATASET_TYPE_KEY_VALUE_PARTITIONING_OPTIONS, + nullptr)); +} + + +G_DEFINE_ABSTRACT_TYPE(GADatasetKeyValuePartitioning, + gadataset_key_value_partitioning, + GADATASET_TYPE_PARTITIONING) + static void gadataset_key_value_partitioning_init(GADatasetKeyValuePartitioning *object) { @@ -356,6 +503,34 @@ gadataset_key_value_partitioning_class_init( { } +G_END_DECLS +template +GADatasetPartitioning * +garrow_key_value_partitioning_new( + GArrowSchema *schema, + GList *dictionaries, + PartitioningOptions &arrow_options, + GError **error) +{ + auto arrow_schema = garrow_schema_get_raw(schema); + std::vector> arrow_dictionaries; + for (auto node = dictionaries; node; node = node->next) { + auto dictionary = GARROW_ARRAY(node->data); + if (dictionary) { + arrow_dictionaries.push_back(garrow_array_get_raw(dictionary)); + } else { + arrow_dictionaries.push_back(nullptr); + } + } + auto arrow_partitioning = + std::static_pointer_cast( + std::make_shared( + arrow_schema, + arrow_dictionaries, + arrow_options)); + return gadataset_partitioning_new_raw(&arrow_partitioning); +} +G_BEGIN_DECLS G_DEFINE_TYPE(GADatasetDirectoryPartitioning, gadataset_directory_partitioning, @@ -377,7 +552,7 @@ gadataset_directory_partitioning_class_init( * @schema: A #GArrowSchema that describes all partitioned segments. * @dictionaries: (nullable) (element-type GArrowArray): A list of #GArrowArray * for dictionary data types in @schema. - * @options: (nullable): A #GADatasetPartitioningOptions. + * @options: (nullable): A #GADatasetKeyValuePartitioningOptions. * @error: (nullable): Return location for a #GError or %NULL. 
* * Returns: The newly created #GADatasetDirectoryPartitioning on success, @@ -386,52 +561,269 @@ gadataset_directory_partitioning_class_init( * Since: 6.0.0 */ GADatasetDirectoryPartitioning * -gadataset_directory_partitioning_new(GArrowSchema *schema, - GList *dictionaries, - GADatasetPartitioningOptions *options, - GError **error) +gadataset_directory_partitioning_new( + GArrowSchema *schema, + GList *dictionaries, + GADatasetKeyValuePartitioningOptions *options, + GError **error) { - auto arrow_schema = garrow_schema_get_raw(schema); - std::vector> arrow_dictionaries; - for (auto node = dictionaries; node; node = node->next) { - auto dictionary = GARROW_ARRAY(node->data); - if (dictionary) { - arrow_dictionaries.push_back(garrow_array_get_raw(dictionary)); - } else { - arrow_dictionaries.push_back(nullptr); + arrow::dataset::KeyValuePartitioningOptions arrow_options; + if (options) { + arrow_options = gadataset_key_value_partitioning_options_get_raw(options); + } + return GADATASET_DIRECTORY_PARTITIONING( + garrow_key_value_partitioning_new( + schema, dictionaries, arrow_options, error)); +} + + +struct GADatasetHivePartitioningOptionsPrivate { + gchar *null_fallback; +}; + +enum { + PROP_OPTIONS_NULL_FALLBACK = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetHivePartitioningOptions, + gadataset_hive_partitioning_options, + GADATASET_TYPE_KEY_VALUE_PARTITIONING_OPTIONS) + +#define GADATASET_HIVE_PARTITIONING_OPTIONS_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_hive_partitioning_options_get_instance_private( \ + GADATASET_HIVE_PARTITIONING_OPTIONS(obj))) + +static void +gadataset_hive_partitioning_options_finalize(GObject *object) +{ + auto priv = GADATASET_HIVE_PARTITIONING_OPTIONS_GET_PRIVATE(object); + + if (priv->null_fallback) { + g_free(priv->null_fallback); + priv->null_fallback = nullptr; + } + + G_OBJECT_CLASS(gadataset_hive_partitioning_options_parent_class)->finalize(object); +} + +static void +gadataset_hive_partitioning_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_HIVE_PARTITIONING_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_OPTIONS_NULL_FALLBACK: + if (priv->null_fallback == g_value_get_string(value)) { + break; } + if (priv->null_fallback) { + g_free(priv->null_fallback); + } + priv->null_fallback = g_value_dup_string(value); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; } - arrow::dataset::KeyValuePartitioningOptions arrow_options; +} + +static void +gadataset_hive_partitioning_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_HIVE_PARTITIONING_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_OPTIONS_NULL_FALLBACK: + g_value_set_string(value, priv->null_fallback); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_hive_partitioning_options_init( + GADatasetHivePartitioningOptions *object) +{ +} + +static void +gadataset_hive_partitioning_options_class_init( + GADatasetHivePartitioningOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gadataset_hive_partitioning_options_finalize; + gobject_class->set_property = gadataset_hive_partitioning_options_set_property; + gobject_class->get_property = gadataset_hive_partitioning_options_get_property; + + arrow::dataset::HivePartitioningOptions default_options; + GParamSpec *spec; + /** 
+ * GADatasetHivePartitioningOptions:null-fallback: + * + * The fallback string for null. This is used only by + * #GADatasetHivePartitioning. + * + * Since: 11.0.0 + */ + spec = g_param_spec_string("null-fallback", + "Null fallback", + "The fallback string for null", + default_options.null_fallback.c_str(), + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, + PROP_OPTIONS_NULL_FALLBACK, + spec); +} + +/** + * gadataset_hive_partitioning_options_new: + * + * Returns: The newly created #GADatasetHivePartitioningOptions. + * + * Since: 11.0.0 + */ +GADatasetHivePartitioningOptions * +gadataset_hive_partitioning_options_new(void) +{ + return GADATASET_HIVE_PARTITIONING_OPTIONS( + g_object_new(GADATASET_TYPE_HIVE_PARTITIONING_OPTIONS, + nullptr)); +} + + +G_DEFINE_TYPE(GADatasetHivePartitioning, + gadataset_hive_partitioning, + GADATASET_TYPE_KEY_VALUE_PARTITIONING) + +static void +gadataset_hive_partitioning_init(GADatasetHivePartitioning *object) +{ +} + +static void +gadataset_hive_partitioning_class_init( + GADatasetHivePartitioningClass *klass) +{ +} + +/** + * gadataset_hive_partitioning_new: + * @schema: A #GArrowSchema that describes all partitioned segments. + * @dictionaries: (nullable) (element-type GArrowArray): A list of #GArrowArray + * for dictionary data types in @schema. + * @options: (nullable): A #GADatasetHivePartitioningOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: The newly created #GADatasetHivePartitioning on success, + * %NULL on error. + * + * Since: 11.0.0 + */ +GADatasetHivePartitioning * +gadataset_hive_partitioning_new(GArrowSchema *schema, + GList *dictionaries, + GADatasetHivePartitioningOptions *options, + GError **error) +{ + arrow::dataset::HivePartitioningOptions arrow_options; if (options) { - arrow_options = - gadataset_partitioning_options_get_raw_key_value_partitioning_options( - options); + arrow_options = gadataset_hive_partitioning_options_get_raw(options); } + return GADATASET_HIVE_PARTITIONING( + garrow_key_value_partitioning_new( + schema, dictionaries, arrow_options, error)); +} + +/** + * gadataset_hive_partitioning_get_null_fallback: + * + * Returns: The fallback string for null. + * + * It should be freed with g_free() when no longer needed. 
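A minimal usage sketch of the new hive partitioning API introduced above (not part of the patch itself): the schema field, variable names, and error handling are illustrative, while the functions and the "null-fallback" property come from the bindings added here. The same pattern works for gadataset_directory_partitioning_new() with a plain GADatasetKeyValuePartitioningOptions.

#include <arrow-glib/arrow-glib.h>
#include <arrow-dataset-glib/arrow-dataset-glib.h>

GError *error = NULL;

/* Schema with a single partition column, mirroring the Ruby test. */
GArrowDataType *year_type = GARROW_DATA_TYPE(garrow_uint16_data_type_new());
GArrowField *year_field = garrow_field_new("year", year_type);
GList *fields = g_list_append(NULL, year_field);
GArrowSchema *schema = garrow_schema_new(fields);

GADatasetHivePartitioningOptions *options =
  gadataset_hive_partitioning_options_new();
g_object_set(options,
             "null-fallback", "NULL", /* directory segment used for nulls */
             NULL);

GADatasetHivePartitioning *partitioning =
  gadataset_hive_partitioning_new(schema, NULL, options, &error);
if (!partitioning) {
  g_error("failed to create hive partitioning: %s", error->message);
}

gchar *null_fallback =
  gadataset_hive_partitioning_get_null_fallback(partitioning);
g_print("null fallback: %s\n", null_fallback);
g_free(null_fallback);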
+ * + * Since: 11.0.0 + */ +gchar * +gadataset_hive_partitioning_get_null_fallback( + GADatasetHivePartitioning *partitioning) +{ auto arrow_partitioning = - std::make_shared( - arrow_schema, - arrow_dictionaries, - arrow_options); - return GADATASET_DIRECTORY_PARTITIONING( - g_object_new(GADATASET_TYPE_DIRECTORY_PARTITIONING, - "partitioning", &arrow_partitioning, - NULL)); + std::static_pointer_cast( + gadataset_partitioning_get_raw(GADATASET_PARTITIONING(partitioning))); + return g_strdup(arrow_partitioning->null_fallback().c_str()); } G_END_DECLS +arrow::dataset::PartitioningFactoryOptions +gadataset_partitioning_factory_options_get_raw( + GADatasetPartitioningFactoryOptions *options) +{ + auto priv = GADATASET_PARTITIONING_FACTORY_OPTIONS_GET_PRIVATE(options); + arrow::dataset::PartitioningFactoryOptions arrow_options; + arrow_options.infer_dictionary = priv->infer_dictionary; + if (priv->schema) { + arrow_options.schema = garrow_schema_get_raw(priv->schema); + } + arrow_options.segment_encoding = + static_cast(priv->segment_encoding); + return arrow_options; +} + arrow::dataset::KeyValuePartitioningOptions -gadataset_partitioning_options_get_raw_key_value_partitioning_options( - GADatasetPartitioningOptions *options) +gadataset_key_value_partitioning_options_get_raw( + GADatasetKeyValuePartitioningOptions *options) { - auto priv = GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(options); + auto priv = GADATASET_KEY_VALUE_PARTITIONING_OPTIONS_GET_PRIVATE(options); arrow::dataset::KeyValuePartitioningOptions arrow_options; arrow_options.segment_encoding = static_cast(priv->segment_encoding); return arrow_options; } +arrow::dataset::HivePartitioningOptions +gadataset_hive_partitioning_options_get_raw( + GADatasetHivePartitioningOptions *options) +{ + auto priv = GADATASET_HIVE_PARTITIONING_OPTIONS_GET_PRIVATE(options); + auto arrow_key_value_options = + gadataset_key_value_partitioning_options_get_raw( + GADATASET_KEY_VALUE_PARTITIONING_OPTIONS(options)); + arrow::dataset::HivePartitioningOptions arrow_options; + arrow_options.segment_encoding = arrow_key_value_options.segment_encoding; + arrow_options.null_fallback = priv->null_fallback; + return arrow_options; +} + +GADatasetPartitioning * +gadataset_partitioning_new_raw( + std::shared_ptr *arrow_partitioning) +{ + GType type = GADATASET_TYPE_PARTITIONING; + const auto arrow_type_name = (*arrow_partitioning)->type_name(); + if (arrow_type_name == "default") { + type = GADATASET_TYPE_DEFAULT_PARTITIONING; + } else if (arrow_type_name == "directory") { + type = GADATASET_TYPE_DIRECTORY_PARTITIONING; + } else if (arrow_type_name == "hive") { + type = GADATASET_TYPE_HIVE_PARTITIONING; + } + return GADATASET_PARTITIONING(g_object_new(type, + "partitioning", arrow_partitioning, + nullptr)); +} + std::shared_ptr gadataset_partitioning_get_raw(GADatasetPartitioning *partitioning) { diff --git a/c_glib/arrow-dataset-glib/partitioning.h b/c_glib/arrow-dataset-glib/partitioning.h index d408d9bd502..5872735d202 100644 --- a/c_glib/arrow-dataset-glib/partitioning.h +++ b/c_glib/arrow-dataset-glib/partitioning.h @@ -38,21 +38,21 @@ typedef enum { } GADatasetSegmentEncoding; -#define GADATASET_TYPE_PARTITIONING_OPTIONS \ - (gadataset_partitioning_options_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADatasetPartitioningOptions, - gadataset_partitioning_options, +#define GADATASET_TYPE_PARTITIONING_FACTORY_OPTIONS \ + (gadataset_partitioning_factory_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetPartitioningFactoryOptions, + 
gadataset_partitioning_factory_options, GADATASET, - PARTITIONING_OPTIONS, + PARTITIONING_FACTORY_OPTIONS, GObject) -struct _GADatasetPartitioningOptionsClass +struct _GADatasetPartitioningFactoryOptionsClass { GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_6_0 -GADatasetPartitioningOptions * -gadataset_partitioning_options_new(void); +GARROW_AVAILABLE_IN_11_0 +GADatasetPartitioningFactoryOptions * +gadataset_partitioning_factory_options_new(void); #define GADATASET_TYPE_PARTITIONING (gadataset_partitioning_get_type()) @@ -66,14 +66,45 @@ struct _GADatasetPartitioningClass GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_6_0 -GADatasetPartitioning * -gadataset_partitioning_new(void); GARROW_AVAILABLE_IN_6_0 gchar * gadataset_partitioning_get_type_name(GADatasetPartitioning *partitioning); +#define GADATASET_TYPE_DEFAULT_PARTITIONING \ + (gadataset_default_partitioning_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetDefaultPartitioning, + gadataset_default_partitioning, + GADATASET, + DEFAULT_PARTITIONING, + GADatasetPartitioning) +struct _GADatasetDefaultPartitioningClass +{ + GADatasetPartitioningClass parent_class; +}; + +GARROW_AVAILABLE_IN_11_0 +GADatasetDefaultPartitioning * +gadataset_default_partitioning_new(void); + + +#define GADATASET_TYPE_KEY_VALUE_PARTITIONING_OPTIONS \ + (gadataset_key_value_partitioning_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetKeyValuePartitioningOptions, + gadataset_key_value_partitioning_options, + GADATASET, + KEY_VALUE_PARTITIONING_OPTIONS, + GObject) +struct _GADatasetKeyValuePartitioningOptionsClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_11_0 +GADatasetKeyValuePartitioningOptions * +gadataset_key_value_partitioning_options_new(void); + + #define GADATASET_TYPE_KEY_VALUE_PARTITIONING \ (gadataset_key_value_partitioning_get_type()) G_DECLARE_DERIVABLE_TYPE(GADatasetKeyValuePartitioning, @@ -101,10 +132,52 @@ struct _GADatasetDirectoryPartitioningClass GARROW_AVAILABLE_IN_6_0 GADatasetDirectoryPartitioning * -gadataset_directory_partitioning_new(GArrowSchema *schema, - GList *dictionaries, - GADatasetPartitioningOptions *options, - GError **error); +gadataset_directory_partitioning_new( + GArrowSchema *schema, + GList *dictionaries, + GADatasetKeyValuePartitioningOptions *options, + GError **error); + + +#define GADATASET_TYPE_HIVE_PARTITIONING_OPTIONS \ + (gadataset_hive_partitioning_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetHivePartitioningOptions, + gadataset_hive_partitioning_options, + GADATASET, + HIVE_PARTITIONING_OPTIONS, + GADatasetKeyValuePartitioningOptions) +struct _GADatasetHivePartitioningOptionsClass +{ + GADatasetKeyValuePartitioningOptionsClass parent_class; +}; + +GARROW_AVAILABLE_IN_11_0 +GADatasetHivePartitioningOptions * +gadataset_hive_partitioning_options_new(void); + + +#define GADATASET_TYPE_HIVE_PARTITIONING \ + (gadataset_hive_partitioning_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetHivePartitioning, + gadataset_hive_partitioning, + GADATASET, + HIVE_PARTITIONING, + GADatasetKeyValuePartitioning) +struct _GADatasetHivePartitioningClass +{ + GADatasetKeyValuePartitioningClass parent_class; +}; + +GARROW_AVAILABLE_IN_11_0 +GADatasetHivePartitioning * +gadataset_hive_partitioning_new(GArrowSchema *schema, + GList *dictionaries, + GADatasetHivePartitioningOptions *options, + GError **error); +GARROW_AVAILABLE_IN_11_0 +gchar * +gadataset_hive_partitioning_get_null_fallback( + GADatasetHivePartitioning *partitioning); G_END_DECLS diff --git 
a/c_glib/arrow-dataset-glib/partitioning.hpp b/c_glib/arrow-dataset-glib/partitioning.hpp index 2481ecb3340..4ce8667e789 100644 --- a/c_glib/arrow-dataset-glib/partitioning.hpp +++ b/c_glib/arrow-dataset-glib/partitioning.hpp @@ -23,9 +23,21 @@ #include +arrow::dataset::PartitioningFactoryOptions +gadataset_partitioning_factory_options_get_raw( + GADatasetPartitioningFactoryOptions *options); + arrow::dataset::KeyValuePartitioningOptions -gadataset_partitioning_options_get_raw_key_value_partitioning_options( - GADatasetPartitioningOptions *options); +gadataset_key_value_partitioning_options_get_raw( + GADatasetKeyValuePartitioningOptions *options); + +arrow::dataset::HivePartitioningOptions +gadataset_hive_partitioning_options_get_raw( + GADatasetHivePartitioningOptions *options); + +GADatasetPartitioning * +gadataset_partitioning_new_raw( + std::shared_ptr *arrow_partitioning); std::shared_ptr gadataset_partitioning_get_raw(GADatasetPartitioning *partitioning); diff --git a/c_glib/arrow-flight-glib/server.cpp b/c_glib/arrow-flight-glib/server.cpp index 40bad8b496f..4af1bf60d47 100644 --- a/c_glib/arrow-flight-glib/server.cpp +++ b/c_glib/arrow-flight-glib/server.cpp @@ -17,7 +17,7 @@ * under the License. */ -#include +#include #include @@ -239,7 +239,7 @@ gaflight_record_batch_stream_new(GArrowRecordBatchReader *reader, } else { arrow_options = &arrow_options_default; } - auto stream = arrow::internal::make_unique< + auto stream = std::make_unique< arrow::flight::RecordBatchStream>(arrow_reader, *arrow_options); return static_cast( g_object_new(GAFLIGHT_TYPE_RECORD_BATCH_STREAM, @@ -484,7 +484,7 @@ namespace gaflight { g_object_unref(gaflight); } g_list_free(gaflights); - *listing = arrow::internal::make_unique< + *listing = std::make_unique< arrow::flight::SimpleFlightListing>(flights); return arrow::Status::OK(); } @@ -507,7 +507,7 @@ namespace gaflight { arrow::StatusCode::UnknownError, "[flight-server][get-flight-info]"); } - *info = arrow::internal::make_unique( + *info = std::make_unique( *gaflight_info_get_raw(gainfo)); g_object_unref(gainfo); return arrow::Status::OK(); @@ -531,7 +531,7 @@ namespace gaflight { arrow::StatusCode::UnknownError, "[flight-server][do-get]"); } - *stream = arrow::internal::make_unique(gastream); + *stream = std::make_unique(gastream); return arrow::Status::OK(); } diff --git a/c_glib/arrow-flight-sql-glib/server.cpp b/c_glib/arrow-flight-sql-glib/server.cpp index 32fdc85e9bf..51cdb22ab5d 100644 --- a/c_glib/arrow-flight-sql-glib/server.cpp +++ b/c_glib/arrow-flight-sql-glib/server.cpp @@ -17,7 +17,7 @@ * under the License. */ -#include +#include #include #include @@ -225,7 +225,7 @@ namespace gaflightsql { arrow::StatusCode::UnknownError, context); } - return arrow::internal::make_unique( + return std::make_unique( *gaflight_info_get_raw(gainfo)); } @@ -247,7 +247,7 @@ namespace gaflightsql { arrow::StatusCode::UnknownError, "[flight-sql-server][do-get-statement]"); } - return arrow::internal::make_unique(gastream); + return std::make_unique(gastream); } private: diff --git a/c_glib/arrow-glib/array-builder.cpp b/c_glib/arrow-glib/array-builder.cpp index 3483dfe2095..aa7bee20e98 100644 --- a/c_glib/arrow-glib/array-builder.cpp +++ b/c_glib/arrow-glib/array-builder.cpp @@ -402,6 +402,9 @@ G_BEGIN_DECLS * #GArrowUInt64ArrayBuilder is the class to create a new * #GArrowUInt64Array. * + * #GArrowHalfFloatArrayBuilder is the class to creating a new + * #GArrowHalfFloatArray. + * * #GArrowFloatArrayBuilder is the class to creating a new * #GArrowFloatArray. 
* @@ -2599,6 +2602,99 @@ garrow_uint64_array_builder_append_nulls(GArrowUInt64ArrayBuilder *builder, } +G_DEFINE_TYPE(GArrowHalfFloatArrayBuilder, + garrow_half_float_array_builder, + GARROW_TYPE_ARRAY_BUILDER) + +static void +garrow_half_float_array_builder_init(GArrowHalfFloatArrayBuilder *builder) +{ +} + +static void +garrow_half_float_array_builder_class_init( + GArrowHalfFloatArrayBuilderClass *klass) +{ +} + +/** + * garrow_half_float_array_builder_new: + * + * Returns: A newly created #GArrowHalfFloatArrayBuilder. + * + * Since: 11.0.0 + */ +GArrowHalfFloatArrayBuilder * +garrow_half_float_array_builder_new(void) +{ + auto builder = garrow_array_builder_new(arrow::float16(), + nullptr, + "[half-float-array-builder][new]"); + return GARROW_HALF_FLOAT_ARRAY_BUILDER(builder); +} + +/** + * garrow_half_float_array_builder_append_value: + * @builder: A #GArrowHalfFloatArrayBuilder. + * @value: A 16-bit float value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 11.0.0 + */ +gboolean +garrow_half_float_array_builder_append_value( + GArrowHalfFloatArrayBuilder *builder, + guint16 value, + GError **error) +{ + return garrow_array_builder_append_value + (GARROW_ARRAY_BUILDER(builder), + value, + error, + "[half-float-array-builder][append-value]"); +} + +/** + * garrow_half_float_array_builder_append_values: + * @builder: A #GArrowHalfFloatArrayBuilder. + * @values: (array length=values_length): The array of 16-bit float. + * @values_length: The length of `values`. + * @is_valids: (nullable) (array length=is_valids_length): The array of + * boolean that shows whether the Nth value is valid or not. If the + * Nth `is_valids` is %TRUE, the Nth `values` is valid value. Otherwise + * the Nth value is null value. + * @is_valids_length: The length of `is_valids`. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Append multiple values at once. It's more efficient than multiple + * `append` and `append_null` calls. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
+ * + * Since: 11.0.0 + */ +gboolean +garrow_half_float_array_builder_append_values( + GArrowHalfFloatArrayBuilder *builder, + const guint16 *values, + gint64 values_length, + const gboolean *is_valids, + gint64 is_valids_length, + GError **error) +{ + return garrow_array_builder_append_values + (GARROW_ARRAY_BUILDER(builder), + values, + values_length, + is_valids, + is_valids_length, + error, + "[half-float-array-builder][append-values]"); +} + + G_DEFINE_TYPE(GArrowFloatArrayBuilder, garrow_float_array_builder, GARROW_TYPE_ARRAY_BUILDER) @@ -6494,6 +6590,9 @@ garrow_array_builder_new_raw(arrow::ArrayBuilder *arrow_builder, case arrow::Type::type::INT64: type = GARROW_TYPE_INT64_ARRAY_BUILDER; break; + case arrow::Type::type::HALF_FLOAT: + type = GARROW_TYPE_HALF_FLOAT_ARRAY_BUILDER; + break; case arrow::Type::type::FLOAT: type = GARROW_TYPE_FLOAT_ARRAY_BUILDER; break; diff --git a/c_glib/arrow-glib/array-builder.h b/c_glib/arrow-glib/array-builder.h index aa7d36cfbe1..741390739f7 100644 --- a/c_glib/arrow-glib/array-builder.h +++ b/c_glib/arrow-glib/array-builder.h @@ -550,6 +550,38 @@ gboolean garrow_uint64_array_builder_append_nulls(GArrowUInt64ArrayBuilder *buil #endif +#define GARROW_TYPE_HALF_FLOAT_ARRAY_BUILDER \ + (garrow_half_float_array_builder_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowHalfFloatArrayBuilder, + garrow_half_float_array_builder, + GARROW, + HALF_FLOAT_ARRAY_BUILDER, + GArrowArrayBuilder) +struct _GArrowHalfFloatArrayBuilderClass +{ + GArrowArrayBuilderClass parent_class; +}; + +GARROW_AVAILABLE_IN_11_0 +GArrowHalfFloatArrayBuilder * +garrow_half_float_array_builder_new(void); + +GARROW_AVAILABLE_IN_11_0 +gboolean +garrow_half_float_array_builder_append_value( + GArrowHalfFloatArrayBuilder *builder, + guint16 value, + GError **error); +GARROW_AVAILABLE_IN_11_0 +gboolean garrow_half_float_array_builder_append_values( + GArrowHalfFloatArrayBuilder *builder, + const guint16 *values, + gint64 values_length, + const gboolean *is_valids, + gint64 is_valids_length, + GError **error); + + #define GARROW_TYPE_FLOAT_ARRAY_BUILDER (garrow_float_array_builder_get_type()) G_DECLARE_DERIVABLE_TYPE(GArrowFloatArrayBuilder, garrow_float_array_builder, diff --git a/c_glib/arrow-glib/basic-array.cpp b/c_glib/arrow-glib/basic-array.cpp index ee2197fad69..388f5cc168c 100644 --- a/c_glib/arrow-glib/basic-array.cpp +++ b/c_glib/arrow-glib/basic-array.cpp @@ -1844,6 +1844,83 @@ garrow_uint64_array_get_values(GArrowUInt64Array *array, } +G_DEFINE_TYPE(GArrowHalfFloatArray, + garrow_half_float_array, + GARROW_TYPE_NUMERIC_ARRAY) + +static void +garrow_half_float_array_init(GArrowHalfFloatArray *object) +{ +} + +static void +garrow_half_float_array_class_init(GArrowHalfFloatArrayClass *klass) +{ +} + +/** + * garrow_half_float_array_new: + * @length: The number of elements. + * @data: The binary data in Arrow format of the array. + * @null_bitmap: (nullable): The bitmap that shows null elements. The + * N-th element is null when the N-th bit is 0, not null otherwise. + * If the array has no null elements, the bitmap must be %NULL and + * @n_nulls is 0. + * @n_nulls: The number of null elements. If -1 is specified, the + * number of nulls are computed from @null_bitmap. + * + * Returns: A newly created #GArrowHalfFloatArray. 
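A short sketch of the new half-float builder (assumed setup, not from the patch): values are raw IEEE 754 binary16 bit patterns, so 0x3c00 is 1.0 and 0x4000 is 2.0; the pre-existing garrow_array_builder_finish() turns the builder into an array.

#include <arrow-glib/arrow-glib.h>

GError *error = NULL;
GArrowHalfFloatArrayBuilder *builder = garrow_half_float_array_builder_new();

const guint16 values[] = {0x3c00, 0x4000, 0x0000}; /* 1.0, 2.0, 0.0 */
const gboolean is_valids[] = {TRUE, TRUE, FALSE};  /* last element becomes null */
if (!garrow_half_float_array_builder_append_values(builder,
                                                   values, 3,
                                                   is_valids, 3,
                                                   &error)) {
  g_error("append failed: %s", error->message);
}

GArrowArray *array =
  garrow_array_builder_finish(GARROW_ARRAY_BUILDER(builder), &error);
/* array is now a GArrowHalfFloatArray with two valid values and one null. */
g_object_unref(builder);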
+ * + * Since: 11.0.0 + */ +GArrowHalfFloatArray * +garrow_half_float_array_new(gint64 length, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls) +{ + auto array = garrow_primitive_array_new(length, + data, + null_bitmap, + n_nulls); + return GARROW_HALF_FLOAT_ARRAY(array); +} + +/** + * garrow_half_float_array_get_value: + * @array: A #GArrowHalfFloatArray. + * @i: The index of the target value. + * + * Returns: The @i-th value. + * + * Since: 11.0.0 + */ +guint16 +garrow_half_float_array_get_value(GArrowHalfFloatArray *array, + gint64 i) +{ + auto arrow_array = garrow_array_get_raw(GARROW_ARRAY(array)); + return std::static_pointer_cast(arrow_array)->Value(i); +} + +/** + * garrow_half_float_array_get_values: + * @array: A #GArrowHalfFloatArray. + * @length: (out): The number of values. + * + * Returns: (array length=length): The raw values. + * + * Since: 11.0.0 + */ +const guint16 * +garrow_half_float_array_get_values(GArrowHalfFloatArray *array, + gint64 *length) +{ + auto arrow_array = garrow_array_get_raw(GARROW_ARRAY(array)); + return garrow_array_get_values_raw(arrow_array, length); +} + + G_DEFINE_TYPE(GArrowFloatArray, garrow_float_array, GARROW_TYPE_NUMERIC_ARRAY) @@ -3490,6 +3567,9 @@ garrow_array_new_raw_valist(std::shared_ptr *arrow_array, case arrow::Type::type::INT64: type = GARROW_TYPE_INT64_ARRAY; break; + case arrow::Type::type::HALF_FLOAT: + type = GARROW_TYPE_HALF_FLOAT_ARRAY; + break; case arrow::Type::type::FLOAT: type = GARROW_TYPE_FLOAT_ARRAY; break; diff --git a/c_glib/arrow-glib/basic-array.h b/c_glib/arrow-glib/basic-array.h index d8104ddb245..1a846c2320a 100644 --- a/c_glib/arrow-glib/basic-array.h +++ b/c_glib/arrow-glib/basic-array.h @@ -345,6 +345,35 @@ const guint64 *garrow_uint64_array_get_values(GArrowUInt64Array *array, gint64 *length); +#define GARROW_TYPE_HALF_FLOAT_ARRAY (garrow_half_float_array_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowHalfFloatArray, + garrow_half_float_array, + GARROW, + HALF_FLOAT_ARRAY, + GArrowNumericArray) +struct _GArrowHalfFloatArrayClass +{ + GArrowNumericArrayClass parent_class; +}; + + +GARROW_AVAILABLE_IN_11_0 +GArrowHalfFloatArray * +garrow_half_float_array_new(gint64 length, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +GARROW_AVAILABLE_IN_11_0 +guint16 +garrow_half_float_array_get_value(GArrowHalfFloatArray *array, + gint64 i); +GARROW_AVAILABLE_IN_11_0 +const guint16* +garrow_half_float_array_get_values(GArrowHalfFloatArray *array, + gint64 *length); + + #define GARROW_TYPE_FLOAT_ARRAY (garrow_float_array_get_type()) G_DECLARE_DERIVABLE_TYPE(GArrowFloatArray, garrow_float_array, diff --git a/c_glib/arrow-glib/basic-data-type.cpp b/c_glib/arrow-glib/basic-data-type.cpp index a94759f79c9..7e4841032fd 100644 --- a/c_glib/arrow-glib/basic-data-type.cpp +++ b/c_glib/arrow-glib/basic-data-type.cpp @@ -58,6 +58,9 @@ G_BEGIN_DECLS * * #GArrowUInt64DataType is a class for the 64-bit unsigned integer data type. * + * #GArrowHalfFloatDataType is a class for the 16-bit floating point + * data type. + * * #GArrowFloatDataType is a class for the 32-bit floating point data * type. 
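To complement the builder, a sketch of reading the new array type back (the helper function name and printing are illustrative; garrow_array_get_length() and garrow_array_is_null() are pre-existing arrow-glib APIs):

#include <arrow-glib/arrow-glib.h>

void
print_half_floats(GArrowHalfFloatArray *array)
{
  gint64 n = garrow_array_get_length(GARROW_ARRAY(array));
  for (gint64 i = 0; i < n; i++) {
    if (garrow_array_is_null(GARROW_ARRAY(array), i)) {
      g_print("[%" G_GINT64_FORMAT "] null\n", i);
    } else {
      guint16 bits = garrow_half_float_array_get_value(array, i);
      g_print("[%" G_GINT64_FORMAT "] 0x%04x\n", i, bits);
    }
  }

  /* Zero-copy access to the raw binary16 buffer is also available. */
  gint64 length;
  const guint16 *raw_values = garrow_half_float_array_get_values(array, &length);
  (void)raw_values;
}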
* @@ -741,6 +744,39 @@ garrow_floating_point_data_type_class_init(GArrowFloatingPointDataTypeClass *kla } +G_DEFINE_TYPE(GArrowHalfFloatDataType, + garrow_half_float_data_type, + GARROW_TYPE_FLOATING_POINT_DATA_TYPE) + +static void +garrow_half_float_data_type_init(GArrowHalfFloatDataType *object) +{ +} + +static void +garrow_half_float_data_type_class_init(GArrowHalfFloatDataTypeClass *klass) +{ +} + +/** + * garrow_half_float_data_type_new: + * + * Returns: The newly created half float data type. + * + * Since: 11.0.0 + */ +GArrowHalfFloatDataType * +garrow_half_float_data_type_new(void) +{ + auto arrow_data_type = arrow::float16(); + auto data_type = + GARROW_HALF_FLOAT_DATA_TYPE(g_object_new(GARROW_TYPE_HALF_FLOAT_DATA_TYPE, + "data-type", &arrow_data_type, + NULL)); + return data_type; +} + + G_DEFINE_TYPE(GArrowFloatDataType, garrow_float_data_type, GARROW_TYPE_FLOATING_POINT_DATA_TYPE) @@ -2114,6 +2150,9 @@ garrow_data_type_new_raw(std::shared_ptr *arrow_data_type) case arrow::Type::type::INT64: type = GARROW_TYPE_INT64_DATA_TYPE; break; + case arrow::Type::type::HALF_FLOAT: + type = GARROW_TYPE_HALF_FLOAT_DATA_TYPE; + break; case arrow::Type::type::FLOAT: type = GARROW_TYPE_FLOAT_DATA_TYPE; break; diff --git a/c_glib/arrow-glib/basic-data-type.h b/c_glib/arrow-glib/basic-data-type.h index 82fe251d31d..affbfcf13c2 100644 --- a/c_glib/arrow-glib/basic-data-type.h +++ b/c_glib/arrow-glib/basic-data-type.h @@ -254,6 +254,21 @@ struct _GArrowFloatingPointDataTypeClass }; +#define GARROW_TYPE_HALF_FLOAT_DATA_TYPE (garrow_half_float_data_type_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowHalfFloatDataType, + garrow_half_float_data_type, + GARROW, + HALF_FLOAT_DATA_TYPE, + GArrowFloatingPointDataType) +struct _GArrowHalfFloatDataTypeClass +{ + GArrowFloatingPointDataTypeClass parent_class; +}; + +GARROW_AVAILABLE_IN_11_0 +GArrowHalfFloatDataType *garrow_half_float_data_type_new(void); + + #define GARROW_TYPE_FLOAT_DATA_TYPE (garrow_float_data_type_get_type()) G_DECLARE_DERIVABLE_TYPE(GArrowFloatDataType, garrow_float_data_type, diff --git a/c_glib/arrow-glib/chunked-array.cpp b/c_glib/arrow-glib/chunked-array.cpp index 51ca416938a..6e627239728 100644 --- a/c_glib/arrow-glib/chunked-array.cpp +++ b/c_glib/arrow-glib/chunked-array.cpp @@ -35,13 +35,14 @@ G_BEGIN_DECLS * makes a list of #GArrowArrays one logical large array. 
*/ -typedef struct GArrowChunkedArrayPrivate_ { +struct GArrowChunkedArrayPrivate { std::shared_ptr chunked_array; -} GArrowChunkedArrayPrivate; + GArrowDataType *data_type; +}; enum { - PROP_0, - PROP_CHUNKED_ARRAY + PROP_CHUNKED_ARRAY = 1, + PROP_DATA_TYPE, }; G_DEFINE_TYPE_WITH_PRIVATE(GArrowChunkedArray, @@ -53,6 +54,19 @@ G_DEFINE_TYPE_WITH_PRIVATE(GArrowChunkedArray, garrow_chunked_array_get_instance_private( \ GARROW_CHUNKED_ARRAY(obj))) +static void +garrow_chunked_array_dispose(GObject *object) +{ + auto priv = GARROW_CHUNKED_ARRAY_GET_PRIVATE(object); + + if (priv->data_type) { + g_object_unref(priv->data_type); + priv->data_type = nullptr; + } + + G_OBJECT_CLASS(garrow_chunked_array_parent_class)->dispose(object); +} + static void garrow_chunked_array_finalize(GObject *object) { @@ -76,6 +90,9 @@ garrow_chunked_array_set_property(GObject *object, priv->chunked_array = *static_cast *>(g_value_get_pointer(value)); break; + case PROP_DATA_TYPE: + priv->data_type = GARROW_DATA_TYPE(g_value_dup_object(value)); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -110,6 +127,7 @@ garrow_chunked_array_class_init(GArrowChunkedArrayClass *klass) gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = garrow_chunked_array_dispose; gobject_class->finalize = garrow_chunked_array_finalize; gobject_class->set_property = garrow_chunked_array_set_property; gobject_class->get_property = garrow_chunked_array_get_property; @@ -120,16 +138,26 @@ garrow_chunked_array_class_init(GArrowChunkedArrayClass *klass) static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_CHUNKED_ARRAY, spec); + + spec = g_param_spec_object("data-type", + "Data type", + "The data type of this chunked array", + GARROW_TYPE_DATA_TYPE, + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DATA_TYPE, spec); } /** * garrow_chunked_array_new: * @chunks: (element-type GArrowArray): The array chunks. + * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: A newly created #GArrowChunkedArray. + * Returns: (nullable): + * A newly created #GArrowChunkedArray or %NULL on error. */ GArrowChunkedArray * -garrow_chunked_array_new(GList *chunks) +garrow_chunked_array_new(GList *chunks, GError **error) { std::vector> arrow_chunks; for (GList *node = chunks; node; node = node->next) { @@ -137,9 +165,37 @@ garrow_chunked_array_new(GList *chunks) arrow_chunks.push_back(garrow_array_get_raw(chunk)); } - auto arrow_chunked_array = - std::make_shared(arrow_chunks); - return garrow_chunked_array_new_raw(&arrow_chunked_array); + auto arrow_chunked_array_result = arrow::ChunkedArray::Make(arrow_chunks); + if (garrow::check(error, arrow_chunked_array_result, "[chunked-array][new]")) { + auto arrow_chunked_array = *arrow_chunked_array_result; + return garrow_chunked_array_new_raw(&arrow_chunked_array); + } else { + return nullptr; + } +} + +/** + * garrow_chunked_array_new_empty: + * @data_type: The #GArrowDataType of this chunked array. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): + * A newly created empty #GArrowChunkedArray or %NULL on error. 
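A minimal sketch of the revised chunked-array constructors (illustrative setup, not from the patch): garrow_chunked_array_new() now takes a GError so chunk type mismatches are reported instead of aborting, and garrow_chunked_array_new_empty() builds a zero-chunk array from just a data type.

#include <arrow-glib/arrow-glib.h>

GError *error = NULL;

GArrowDataType *data_type = GARROW_DATA_TYPE(garrow_boolean_data_type_new());
GArrowChunkedArray *empty =
  garrow_chunked_array_new_empty(data_type, &error);
if (!empty) {
  g_error("failed to create empty chunked array: %s", error->message);
}

/* The value data type is now cached internally; the caller still receives
 * its own reference and must unref it. */
GArrowDataType *value_data_type =
  garrow_chunked_array_get_value_data_type(empty);
g_object_unref(value_data_type);
g_object_unref(empty);
g_object_unref(data_type);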
+ * + * Since: 11.0.0 + */ +GArrowChunkedArray * +garrow_chunked_array_new_empty(GArrowDataType *data_type, GError **error) +{ + auto arrow_data_type = garrow_data_type_get_raw(data_type); + auto arrow_chunked_array_result = + arrow::ChunkedArray::MakeEmpty(arrow_data_type); + if (garrow::check(error, arrow_chunked_array_result, "[chunked-array][new]")) { + auto arrow_chunked_array = *arrow_chunked_array_result; + return garrow_chunked_array_new_raw(&arrow_chunked_array); + } else { + return nullptr; + } } /** @@ -174,9 +230,14 @@ garrow_chunked_array_equal(GArrowChunkedArray *chunked_array, GArrowDataType * garrow_chunked_array_get_value_data_type(GArrowChunkedArray *chunked_array) { - auto arrow_chunked_array = garrow_chunked_array_get_raw(chunked_array); - auto arrow_type = arrow_chunked_array->type(); - return garrow_data_type_new_raw(&arrow_type); + auto priv = GARROW_CHUNKED_ARRAY_GET_PRIVATE(chunked_array); + if (!priv->data_type) { + auto arrow_chunked_array = garrow_chunked_array_get_raw(chunked_array); + auto arrow_type = arrow_chunked_array->type(); + priv->data_type = garrow_data_type_new_raw(&arrow_type); + } + g_object_ref(priv->data_type); + return priv->data_type; } /** @@ -353,11 +414,21 @@ garrow_chunked_array_combine(GArrowChunkedArray *chunked_array, GError **error) G_END_DECLS GArrowChunkedArray * -garrow_chunked_array_new_raw(std::shared_ptr *arrow_chunked_array) +garrow_chunked_array_new_raw( + std::shared_ptr *arrow_chunked_array) +{ + return garrow_chunked_array_new_raw(arrow_chunked_array, nullptr); +} + +GArrowChunkedArray * +garrow_chunked_array_new_raw( + std::shared_ptr *arrow_chunked_array, + GArrowDataType *data_type) { auto chunked_array = GARROW_CHUNKED_ARRAY(g_object_new(GARROW_TYPE_CHUNKED_ARRAY, "chunked-array", arrow_chunked_array, + "data-type", data_type, NULL)); return chunked_array; } diff --git a/c_glib/arrow-glib/chunked-array.h b/c_glib/arrow-glib/chunked-array.h index 528be28ad3e..e8a2df931f4 100644 --- a/c_glib/arrow-glib/chunked-array.h +++ b/c_glib/arrow-glib/chunked-array.h @@ -24,7 +24,13 @@ G_BEGIN_DECLS -GArrowChunkedArray *garrow_chunked_array_new(GList *chunks); +GArrowChunkedArray * +garrow_chunked_array_new(GList *chunks, + GError **error); +GARROW_AVAILABLE_IN_11_0 +GArrowChunkedArray * +garrow_chunked_array_new_empty(GArrowDataType *data_type, + GError **error); gboolean garrow_chunked_array_equal(GArrowChunkedArray *chunked_array, GArrowChunkedArray *other_chunked_array); diff --git a/c_glib/arrow-glib/chunked-array.hpp b/c_glib/arrow-glib/chunked-array.hpp index ec5068adc07..06802366ec1 100644 --- a/c_glib/arrow-glib/chunked-array.hpp +++ b/c_glib/arrow-glib/chunked-array.hpp @@ -23,5 +23,12 @@ #include -GArrowChunkedArray *garrow_chunked_array_new_raw(std::shared_ptr *arrow_chunked_array); -std::shared_ptr garrow_chunked_array_get_raw(GArrowChunkedArray *chunked_array); +GArrowChunkedArray * +garrow_chunked_array_new_raw( + std::shared_ptr *arrow_chunked_array); +GArrowChunkedArray * +garrow_chunked_array_new_raw( + std::shared_ptr *arrow_chunked_array, + GArrowDataType *data_type); +std::shared_ptr +garrow_chunked_array_get_raw(GArrowChunkedArray *chunked_array); diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index f3a29be5e43..27e49b0027d 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -109,7 +110,6 @@ namespace { return (sort_key.target == other_sort_key.target) && (sort_key.order == 
other_sort_key.order); - } } @@ -136,6 +136,8 @@ G_BEGIN_DECLS * * #GArrowSourceNodeOptions is a class to customize a source node. * + * #GArrowProjectNodeOptions is a class to customize a project node. + * * #GArrowAggregation is a class to specify how to aggregate. * * #GArrowAggregateNodeOptions is a class to customize an aggregate node. @@ -938,7 +940,7 @@ garrow_source_node_options_new_record_batch_reader( arrow_reader->schema(), [arrow_reader]() { using ExecBatch = arrow::compute::ExecBatch; - using ExecBatchOptional = arrow::util::optional; + using ExecBatchOptional = std::optional; auto arrow_record_batch_result = arrow_reader->Next(); if (!arrow_record_batch_result.ok()) { return arrow::AsyncGeneratorEnd(); @@ -979,7 +981,7 @@ garrow_source_node_options_new_record_batch(GArrowRecordBatch *record_batch) state->record_batch->schema(), [state]() { using ExecBatch = arrow::compute::ExecBatch; - using ExecBatchOptional = arrow::util::optional; + using ExecBatchOptional = std::optional; if (!state->generated) { state->generated = true; return arrow::Future::MakeFinished( @@ -1014,6 +1016,61 @@ garrow_source_node_options_new_table(GArrowTable *table) } +G_DEFINE_TYPE(GArrowProjectNodeOptions, + garrow_project_node_options, + GARROW_TYPE_EXECUTE_NODE_OPTIONS) + +static void +garrow_project_node_options_init(GArrowProjectNodeOptions *object) +{ +} + +static void +garrow_project_node_options_class_init(GArrowProjectNodeOptionsClass *klass) +{ +} + +/** + * garrow_project_node_options_new: + * @expressions: (element-type GArrowExpression): + * A list of #GArrowExpression to be executed. + * @names: (nullable) (array length=n_names): + * A list of output column names of @expressions. If @names is %NULL, + * the string representations of @expressions will be used. + * @n_names: The number of @names. + * + * Returns: A newly created #GArrowProjectNodeOptions. 
+ * + * Since: 11.0.0 + */ +GArrowProjectNodeOptions * +garrow_project_node_options_new(GList *expressions, + gchar **names, + gsize n_names) +{ + std::vector arrow_expressions; + std::vector arrow_names; + for (auto node = expressions; node; node = g_list_next(node)) { + auto expression = GARROW_EXPRESSION(node->data); + arrow_expressions.push_back(*garrow_expression_get_raw(expression)); + } + for (gsize i = 0; i < n_names; ++i) { + arrow_names.emplace_back(names[i]); + } + if (!arrow_names.empty()) { + for (size_t i = arrow_names.size(); i < arrow_expressions.size(); ++i) { + arrow_names.push_back(arrow_expressions[i].ToString()); + } + } + auto arrow_options = + new arrow::compute::ProjectNodeOptions(arrow_expressions, arrow_names); + auto options = g_object_new(GARROW_TYPE_PROJECT_NODE_OPTIONS, + "options", arrow_options, + NULL); + return GARROW_PROJECT_NODE_OPTIONS(options); +} + + typedef struct GArrowAggregationPrivate_ { gchar *function; GArrowFunctionOptions *options; @@ -1296,7 +1353,7 @@ garrow_aggregate_node_options_new(GList *aggregations, typedef struct GArrowSinkNodeOptionsPrivate_ { - arrow::AsyncGenerator> generator; + arrow::AsyncGenerator> generator; GArrowRecordBatchReader *reader; } GArrowSinkNodeOptionsPrivate; @@ -1333,7 +1390,7 @@ garrow_sink_node_options_init(GArrowSinkNodeOptions *object) { auto priv = GARROW_SINK_NODE_OPTIONS_GET_PRIVATE(object); new(&(priv->generator)) - arrow::AsyncGenerator>(); + arrow::AsyncGenerator>(); } static void @@ -1771,6 +1828,39 @@ garrow_execute_plan_build_source_node(GArrowExecutePlan *plan, error); } +/** + * garrow_execute_plan_build_project_node: + * @plan: A #GArrowExecutePlan. + * @input: A #GArrowExecuteNode. + * @options: A #GArrowProjectNodeOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * This is a shortcut of garrow_execute_plan_build_node() for project + * node. + * + * Returns: (transfer full): A newly built and added #GArrowExecuteNode + * for project on success, %NULL on error. + * + * Since: 11.0.0 + */ +GArrowExecuteNode * +garrow_execute_plan_build_project_node(GArrowExecutePlan *plan, + GArrowExecuteNode *input, + GArrowProjectNodeOptions *options, + GError **error) +{ + GList *inputs = nullptr; + inputs = g_list_prepend(inputs, input); + auto node = + garrow_execute_plan_build_node(plan, + "project", + inputs, + GARROW_EXECUTE_NODE_OPTIONS(options), + error); + g_list_free(inputs); + return node; +} + /** * garrow_execute_plan_build_aggregate_node: * @plan: A #GArrowExecutePlan. @@ -1931,16 +2021,21 @@ garrow_execute_plan_stop(GArrowExecutePlan *plan) /** * garrow_execute_plan_wait: * @plan: A #GArrowExecutePlan. + * @error: (nullable): Return location for a #GError or %NULL. * * Waits for finishing this plan. * + * Returns: %TRUE on success, %FALSE on error. 
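A sketch tying the new project node and the gboolean-returning garrow_execute_plan_wait() together. The helper name, column names, and the assumption that @plan and @source_node were already built (for example with garrow_execute_plan_new() and garrow_execute_plan_build_source_node() over a table) are illustrative; the flow mirrors the new test-project-node.rb added later in this patch.

#include <arrow-glib/arrow-glib.h>

gboolean
project_and_wait(GArrowExecutePlan *plan,
                 GArrowExecuteNode *source_node,
                 GError **error)
{
  /* Output "number" as-is and "number + number" renamed to "doubled";
   * error handling on the expression constructors is elided for brevity. */
  GList *arguments = NULL;
  arguments = g_list_append(arguments,
                            garrow_field_expression_new("number", NULL));
  arguments = g_list_append(arguments,
                            garrow_field_expression_new("number", NULL));
  GList *expressions = NULL;
  expressions = g_list_append(expressions,
                              garrow_field_expression_new("number", NULL));
  expressions = g_list_append(expressions,
                              garrow_call_expression_new("add", arguments, NULL));
  gchar *names[] = {(gchar *)"number", (gchar *)"doubled"};
  GArrowProjectNodeOptions *project_options =
    garrow_project_node_options_new(expressions, names, 2);

  GArrowExecuteNode *project_node =
    garrow_execute_plan_build_project_node(plan,
                                           source_node,
                                           project_options,
                                           error);
  if (!project_node) {
    return FALSE;
  }

  GArrowSinkNodeOptions *sink_options = garrow_sink_node_options_new();
  if (!garrow_execute_plan_build_sink_node(plan,
                                           project_node,
                                           sink_options,
                                           error)) {
    return FALSE;
  }

  if (!garrow_execute_plan_validate(plan, error)) {
    return FALSE;
  }
  garrow_execute_plan_start(plan);
  /* garrow_execute_plan_wait() now reports plan failures through @error
   * instead of silently swallowing them. */
  gboolean success = garrow_execute_plan_wait(plan, error);
  garrow_execute_plan_stop(plan);
  return success;
}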
+ * * Since: 6.0.0 */ -void -garrow_execute_plan_wait(GArrowExecutePlan *plan) +gboolean +garrow_execute_plan_wait(GArrowExecutePlan *plan, GError **error) { auto arrow_plan = garrow_execute_plan_get_raw(plan); arrow_plan->finished().Wait(); + return garrow::check(error, arrow_plan->finished().status(), + "[execute-plan][wait]"); } @@ -5121,7 +5216,7 @@ GArrowFunctionOptions * garrow_function_options_new_raw( const arrow::compute::FunctionOptions *arrow_options) { - arrow::util::string_view arrow_type_name(arrow_options->type_name()); + std::string_view arrow_type_name(arrow_options->type_name()); if (arrow_type_name == "CastOptions") { auto arrow_cast_options = static_cast(arrow_options); diff --git a/c_glib/arrow-glib/compute.h b/c_glib/arrow-glib/compute.h index a9ba6c2af94..1ac1d05258c 100644 --- a/c_glib/arrow-glib/compute.h +++ b/c_glib/arrow-glib/compute.h @@ -156,6 +156,24 @@ GArrowSourceNodeOptions * garrow_source_node_options_new_table(GArrowTable *table); +#define GARROW_TYPE_PROJECT_NODE_OPTIONS (garrow_project_node_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowProjectNodeOptions, + garrow_project_node_options, + GARROW, + PROJECT_NODE_OPTIONS, + GArrowExecuteNodeOptions) +struct _GArrowProjectNodeOptionsClass +{ + GArrowExecuteNodeOptionsClass parent_class; +}; + +GARROW_AVAILABLE_IN_11_0 +GArrowProjectNodeOptions * +garrow_project_node_options_new(GList *expressions, + gchar **names, + gsize n_names); + + #define GARROW_TYPE_AGGREGATION (garrow_aggregation_get_type()) G_DECLARE_DERIVABLE_TYPE(GArrowAggregation, garrow_aggregation, @@ -321,6 +339,12 @@ GArrowExecuteNode * garrow_execute_plan_build_source_node(GArrowExecutePlan *plan, GArrowSourceNodeOptions *options, GError **error); +GARROW_AVAILABLE_IN_11_0 +GArrowExecuteNode * +garrow_execute_plan_build_project_node(GArrowExecutePlan *plan, + GArrowExecuteNode *input, + GArrowProjectNodeOptions *options, + GError **error); GARROW_AVAILABLE_IN_6_0 GArrowExecuteNode * garrow_execute_plan_build_aggregate_node(GArrowExecutePlan *plan, @@ -352,8 +376,9 @@ GARROW_AVAILABLE_IN_6_0 void garrow_execute_plan_stop(GArrowExecutePlan *plan); GARROW_AVAILABLE_IN_6_0 -void -garrow_execute_plan_wait(GArrowExecutePlan *plan); +gboolean +garrow_execute_plan_wait(GArrowExecutePlan *plan, + GError **error); GArrowCastOptions *garrow_cast_options_new(void); diff --git a/c_glib/arrow-glib/input-stream.cpp b/c_glib/arrow-glib/input-stream.cpp index e1e46c7df10..844c83d629b 100644 --- a/c_glib/arrow-glib/input-stream.cpp +++ b/c_glib/arrow-glib/input-stream.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -34,6 +33,7 @@ #include #include +#include G_BEGIN_DECLS @@ -855,7 +855,7 @@ namespace garrow { } } - arrow::Result Peek(int64_t nbytes) override { + arrow::Result Peek(int64_t nbytes) override { if (!G_IS_BUFFERED_INPUT_STREAM(input_stream_)) { std::string message("[gio-input-stream][peek] " "not peekable input stream: <"); @@ -882,8 +882,7 @@ namespace garrow { if (data_size > static_cast(nbytes)) { data_size = nbytes; } - return arrow::util::string_view(static_cast(data), - data_size); + return std::string_view(static_cast(data), data_size); } arrow::Status Seek(int64_t position) override { diff --git a/c_glib/arrow-glib/scalar.cpp b/c_glib/arrow-glib/scalar.cpp index f8699f34eea..24f9b2caad5 100644 --- a/c_glib/arrow-glib/scalar.cpp +++ b/c_glib/arrow-glib/scalar.cpp @@ -57,6 +57,8 @@ G_BEGIN_DECLS * * #GArrowUInt64Scalar is a class for a 64-bit unsigned integer scalar. 
* + * #GArrowHalfFloatScalar is a class for a 16-bit floating point scalar. + * * #GArrowFloatScalar is a class for a 32-bit floating point scalar. * * #GArrowDoubleScalar is a class for a 64-bit floating point scalar. @@ -250,9 +252,8 @@ garrow_scalar_parse(GArrowDataType *data_type, GError **error) { const auto arrow_data_type = garrow_data_type_get_raw(data_type); - auto arrow_data = - arrow::util::string_view(reinterpret_cast(data), - size); + auto arrow_data = std::string_view(reinterpret_cast(data), + size); auto arrow_scalar_result = arrow::Scalar::Parse(arrow_data_type, arrow_data); if (garrow::check(error, arrow_scalar_result, "[scalar][parse]")) { auto arrow_scalar = *arrow_scalar_result; @@ -868,6 +869,55 @@ garrow_uint64_scalar_get_value(GArrowUInt64Scalar *scalar) } +G_DEFINE_TYPE(GArrowHalfFloatScalar, + garrow_half_float_scalar, + GARROW_TYPE_SCALAR) + +static void +garrow_half_float_scalar_init(GArrowHalfFloatScalar *object) +{ +} + +static void +garrow_half_float_scalar_class_init(GArrowHalfFloatScalarClass *klass) +{ +} + +/** + * garrow_half_float_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowHalfFloatScalar. + * + * Since: 11.0.0 + */ +GArrowHalfFloatScalar * +garrow_half_float_scalar_new(guint16 value) +{ + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(value)); + return GARROW_HALF_FLOAT_SCALAR(garrow_scalar_new_raw(&arrow_scalar)); +} + +/** + * garrow_half_float_scalar_get_value: + * @scalar: A #GArrowHalfFloatScalar. + * + * Returns: The value of this scalar. + * + * Since: 11.0.0 + */ +guint16 +garrow_half_float_scalar_get_value(GArrowHalfFloatScalar *scalar) +{ + const auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + return arrow_scalar->value; +} + + G_DEFINE_TYPE(GArrowFloatScalar, garrow_float_scalar, GARROW_TYPE_SCALAR) @@ -2552,6 +2602,9 @@ garrow_scalar_new_raw_valist(std::shared_ptr *arrow_scalar, case arrow::Type::type::UINT64: type = GARROW_TYPE_UINT64_SCALAR; break; + case arrow::Type::type::HALF_FLOAT: + type = GARROW_TYPE_HALF_FLOAT_SCALAR; + break; case arrow::Type::type::FLOAT: type = GARROW_TYPE_FLOAT_SCALAR; break; diff --git a/c_glib/arrow-glib/scalar.h b/c_glib/arrow-glib/scalar.h index 3fa00597ca1..f90160e35e0 100644 --- a/c_glib/arrow-glib/scalar.h +++ b/c_glib/arrow-glib/scalar.h @@ -256,6 +256,25 @@ guint64 garrow_uint64_scalar_get_value(GArrowUInt64Scalar *scalar); +#define GARROW_TYPE_HALF_FLOAT_SCALAR (garrow_half_float_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowHalfFloatScalar, + garrow_half_float_scalar, + GARROW, + HALF_FLOAT_SCALAR, + GArrowScalar) +struct _GArrowHalfFloatScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_11_0 +GArrowHalfFloatScalar * +garrow_half_float_scalar_new(guint16 value); +GARROW_AVAILABLE_IN_11_0 +guint16 +garrow_half_float_scalar_get_value(GArrowHalfFloatScalar *scalar); + + #define GARROW_TYPE_FLOAT_SCALAR (garrow_float_scalar_get_type()) G_DECLARE_DERIVABLE_TYPE(GArrowFloatScalar, garrow_float_scalar, diff --git a/c_glib/arrow-glib/tensor.cpp b/c_glib/arrow-glib/tensor.cpp index 7e6dc80f5dd..ddbf1189b91 100644 --- a/c_glib/arrow-glib/tensor.cpp +++ b/c_glib/arrow-glib/tensor.cpp @@ -162,7 +162,7 @@ garrow_tensor_class_init(GArrowTensorClass *klass) * @n_strides: The number of strides. * @dimension_names: (array length=n_dimension_names) (nullable): A list of * dimension names. 
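A small sketch of the new half-float scalar (illustrative; as with the array API the value is the raw binary16 bit pattern, and 0x3c01 is the ~1.001 value used by the Ruby test). The use of the pre-existing generic garrow_scalar_to_string() helper for printing is an assumption, not part of this patch.

#include <arrow-glib/arrow-glib.h>

GArrowHalfFloatScalar *scalar = garrow_half_float_scalar_new(0x3c01);

guint16 bits = garrow_half_float_scalar_get_value(scalar);
gchar *representation = garrow_scalar_to_string(GARROW_SCALAR(scalar));
g_print("bits=0x%04x value=%s\n", bits, representation);
g_free(representation);
g_object_unref(scalar);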
- * @n_dimension_names: The number of dimension names + * @n_dimension_names: A list of dimension names * * Returns: The newly created #GArrowTensor. * diff --git a/c_glib/arrow-glib/version.h.in b/c_glib/arrow-glib/version.h.in index 74c54b998d6..bd67ed6b8b9 100644 --- a/c_glib/arrow-glib/version.h.in +++ b/c_glib/arrow-glib/version.h.in @@ -110,6 +110,24 @@ # define GARROW_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor) #endif +/** + * GARROW_VERSION_11_0: + * + * You can use this macro value for compile time API version check. + * + * Since: 11.0.0 + */ +#define GARROW_VERSION_11_0 G_ENCODE_VERSION(11, 0) + +/** + * GARROW_VERSION_10_0: + * + * You can use this macro value for compile time API version check. + * + * Since: 10.0.0 + */ +#define GARROW_VERSION_10_0 G_ENCODE_VERSION(10, 0) + /** * GARROW_VERSION_9_0: * @@ -301,6 +319,34 @@ #define GARROW_AVAILABLE_IN_ALL +#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_11_0 +# define GARROW_DEPRECATED_IN_11_0 GARROW_DEPRECATED +# define GARROW_DEPRECATED_IN_11_0_FOR(function) GARROW_DEPRECATED_FOR(function) +#else +# define GARROW_DEPRECATED_IN_11_0 +# define GARROW_DEPRECATED_IN_11_0_FOR(function) +#endif + +#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_11_0 +# define GARROW_AVAILABLE_IN_11_0 GARROW_UNAVAILABLE(11, 0) +#else +# define GARROW_AVAILABLE_IN_11_0 +#endif + +#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_10_0 +# define GARROW_DEPRECATED_IN_10_0 GARROW_DEPRECATED +# define GARROW_DEPRECATED_IN_10_0_FOR(function) GARROW_DEPRECATED_FOR(function) +#else +# define GARROW_DEPRECATED_IN_10_0 +# define GARROW_DEPRECATED_IN_10_0_FOR(function) +#endif + +#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_10_0 +# define GARROW_AVAILABLE_IN_10_0 GARROW_UNAVAILABLE(10, 0) +#else +# define GARROW_AVAILABLE_IN_10_0 +#endif + #if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_9_0 # define GARROW_DEPRECATED_IN_9_0 GARROW_DEPRECATED # define GARROW_DEPRECATED_IN_9_0_FOR(function) GARROW_DEPRECATED_FOR(function) diff --git a/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml b/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml index b13195b0703..e6066379ceb 100644 --- a/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml +++ b/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml @@ -68,6 +68,10 @@ Index of deprecated API + + Index of new symbols in 11.0.0 + + Index of new symbols in 6.0.0 diff --git a/c_glib/doc/arrow-glib/arrow-glib-docs.xml b/c_glib/doc/arrow-glib/arrow-glib-docs.xml index 2ad1135bc69..e6990af5593 100644 --- a/c_glib/doc/arrow-glib/arrow-glib-docs.xml +++ b/c_glib/doc/arrow-glib/arrow-glib-docs.xml @@ -193,6 +193,14 @@ Index of deprecated API + + Index of new symbols in 11.0.0 + + + + Index of new symbols in 10.0.0 + + Index of new symbols in 9.0.0 diff --git a/c_glib/gandiva-glib/node.cpp b/c_glib/gandiva-glib/node.cpp index d42d4801b7e..1ced7754a70 100644 --- a/c_glib/gandiva-glib/node.cpp +++ b/c_glib/gandiva-glib/node.cpp @@ -29,7 +29,7 @@ ggandiva_literal_node_get(GGandivaLiteralNode *node) { auto gandiva_literal_node = std::static_pointer_cast(ggandiva_node_get_raw(GGANDIVA_NODE(node))); - return arrow::util::get(gandiva_literal_node->holder()); + return std::get(gandiva_literal_node->holder()); } G_BEGIN_DECLS diff --git a/c_glib/meson.build b/c_glib/meson.build index 85d3a75d423..57a0e74d95e 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -21,10 +21,10 @@ project('arrow-glib', 'c', 'cpp', license: 'Apache-2.0', default_options: [ 'c_std=c99', - 'cpp_std=c++11', + 
'cpp_std=c++17', ]) -version = '10.0.0-SNAPSHOT' +version = '11.0.0' if version.endswith('-SNAPSHOT') version_numbers = version.split('-')[0].split('.') version_tag = version.split('-')[1] diff --git a/c_glib/parquet-glib/arrow-file-writer.cpp b/c_glib/parquet-glib/arrow-file-writer.cpp index c53bb94cebd..537e8330532 100644 --- a/c_glib/parquet-glib/arrow-file-writer.cpp +++ b/c_glib/parquet-glib/arrow-file-writer.cpp @@ -422,25 +422,24 @@ gparquet_arrow_file_writer_new_arrow(GArrowSchema *schema, auto arrow_output_stream = garrow_output_stream_get_raw(sink); auto arrow_memory_pool = arrow::default_memory_pool(); std::unique_ptr parquet_arrow_file_writer; - arrow::Status status; + arrow::Result> maybe_writer; if (writer_properties) { auto parquet_writer_properties = gparquet_writer_properties_get_raw(writer_properties); - status = parquet::arrow::FileWriter::Open(*arrow_schema, - arrow_memory_pool, - arrow_output_stream, - parquet_writer_properties, - &parquet_arrow_file_writer); + maybe_writer = parquet::arrow::FileWriter::Open(*arrow_schema, + arrow_memory_pool, + arrow_output_stream, + parquet_writer_properties); } else { auto parquet_writer_properties = parquet::default_writer_properties(); - status = parquet::arrow::FileWriter::Open(*arrow_schema, - arrow_memory_pool, - arrow_output_stream, - parquet_writer_properties, - &parquet_arrow_file_writer); + maybe_writer = parquet::arrow::FileWriter::Open(*arrow_schema, + arrow_memory_pool, + arrow_output_stream, + parquet_writer_properties); } - if (garrow_error_check(error, - status, - "[parquet][arrow][file-writer][new-arrow]")) { + if (garrow::check(error, + maybe_writer, + "[parquet][arrow][file-writer][new-arrow]")) { + parquet_arrow_file_writer = std::move(*maybe_writer); return gparquet_arrow_file_writer_new_raw(parquet_arrow_file_writer.release()); } else { return NULL; @@ -477,25 +476,24 @@ gparquet_arrow_file_writer_new_path(GArrowSchema *schema, arrow_file_output_stream.ValueOrDie(); auto arrow_memory_pool = arrow::default_memory_pool(); std::unique_ptr parquet_arrow_file_writer; - arrow::Status status; + arrow::Result> maybe_writer; if (writer_properties) { auto parquet_writer_properties = gparquet_writer_properties_get_raw(writer_properties); - status = parquet::arrow::FileWriter::Open(*arrow_schema, - arrow_memory_pool, - arrow_output_stream, - parquet_writer_properties, - &parquet_arrow_file_writer); + maybe_writer = parquet::arrow::FileWriter::Open(*arrow_schema, + arrow_memory_pool, + arrow_output_stream, + parquet_writer_properties); } else { auto parquet_writer_properties = parquet::default_writer_properties(); - status = parquet::arrow::FileWriter::Open(*arrow_schema, - arrow_memory_pool, - arrow_output_stream, - parquet_writer_properties, - &parquet_arrow_file_writer); + maybe_writer = parquet::arrow::FileWriter::Open(*arrow_schema, + arrow_memory_pool, + arrow_output_stream, + parquet_writer_properties); } if (garrow::check(error, - status, + maybe_writer, "[parquet][arrow][file-writer][new-path]")) { + parquet_arrow_file_writer = std::move(*maybe_writer); return gparquet_arrow_file_writer_new_raw(parquet_arrow_file_writer.release()); } else { return NULL; diff --git a/c_glib/plasma-glib/client.cpp b/c_glib/plasma-glib/client.cpp index 26476f4d6b5..1cc3b6a80a6 100644 --- a/c_glib/plasma-glib/client.cpp +++ b/c_glib/plasma-glib/client.cpp @@ -35,6 +35,9 @@ G_BEGIN_DECLS * @title: Client related classes * @include: plasma-glib/plasma-glib.h * + * Apache Arrow Plasma C GLib is deprecated since 10.0.0. 
This will be + * removed from 12.0.0 or so. + * * #GPlasmaClientOptions is a class for customizing plasma store * connection. * diff --git a/c_glib/plasma-glib/meson.build b/c_glib/plasma-glib/meson.build index cf811d42b72..50b0be0b31d 100644 --- a/c_glib/plasma-glib/meson.build +++ b/c_glib/plasma-glib/meson.build @@ -17,6 +17,9 @@ # specific language governing permissions and limitations # under the License. +warning('Apache Arrow Plasma C GLib is deprecated since 10.0.0. ' + + 'This will be removed from 12.0.0 or so.') + project_name = 'plasma-glib' sources = files( @@ -46,6 +49,7 @@ dependencies = [ ] cpp_args = [ '-DG_LOG_DOMAIN="Plasma"', + '-D_PLASMA_NO_DEPRECATE', ] pkg_config_requires = [ 'plasma', diff --git a/c_glib/plasma-glib/object.cpp b/c_glib/plasma-glib/object.cpp index 121afb1cf84..8bf0d4b0772 100644 --- a/c_glib/plasma-glib/object.cpp +++ b/c_glib/plasma-glib/object.cpp @@ -30,6 +30,9 @@ G_BEGIN_DECLS * @title: Object related classes * @include: plasma-glib/plasma-glib.h * + * Apache Arrow Plasma C GLib is deprecated since 10.0.0. This will be + * removed from 12.0.0 or so. + * * #GPlasmaObjectID is a class for an object ID. * * #GPlasmaObject is a base class for an object stored in plasma store. diff --git a/c_glib/test/dataset/test-file-system-dataset-factory.rb b/c_glib/test/dataset/test-file-system-dataset-factory.rb index bca9e72418c..30944ccd3bb 100644 --- a/c_glib/test/dataset/test-file-system-dataset-factory.rb +++ b/c_glib/test/dataset/test-file-system-dataset-factory.rb @@ -70,4 +70,51 @@ def test_directory assert_equal(@table1.concatenate([@table2]), dataset.to_table) end + + sub_test_case("#finish") do + def setup + super do + @factory = ArrowDataset::FileSystemDatasetFactory.new(@format) + @factory.file_system_uri = build_file_uri(@path1) + yield + end + end + + def test_schema + options = ArrowDataset::FinishOptions.new + options.schema = build_schema(visible: Arrow::BooleanDataType.new, + point: Arrow::Int16DataType.new) + dataset = @factory.finish(options) + assert_equal(build_table(visible: [ + build_boolean_array([true, false, true]), + build_boolean_array([false, true, false, true]), + ], + point: [ + build_int16_array([1, 2, 3]), + build_int16_array([-1, -2, -3, -4]), + ]), + dataset.to_table) + end + + def test_inspect_n_fragments + options = ArrowDataset::FinishOptions.new + options.inspect_n_fragments = -1 + dataset = @factory.finish(options) + assert_equal(@table1, dataset.to_table) + end + + def test_validate_fragments + options = ArrowDataset::FinishOptions.new + options.schema = build_schema(visible: Arrow::BooleanDataType.new, + point: Arrow::Int16DataType.new) + options.validate_fragments = true + message = "[file-system-dataset-factory][finish]: " + + "Invalid: Unable to merge: " + + "Field point has incompatible types: int16 vs int32" + error = assert_raise(Arrow::Error::Invalid) do + @factory.finish(options) + end + assert_equal(message, error.message.lines(chomp: true).first) + end + end end diff --git a/c_glib/test/dataset/test-partitioning-options.rb b/c_glib/test/dataset/test-partitioning-factory-options.rb similarity index 92% rename from c_glib/test/dataset/test-partitioning-options.rb rename to c_glib/test/dataset/test-partitioning-factory-options.rb index 9ff585aa7cf..7e751d77895 100644 --- a/c_glib/test/dataset/test-partitioning-options.rb +++ b/c_glib/test/dataset/test-partitioning-factory-options.rb @@ -15,12 +15,12 @@ # specific language governing permissions and limitations # under the License. 
-class TestDatasetPartitioningOptions < Test::Unit::TestCase +class TestDatasetPartitioningFactoryOptions < Test::Unit::TestCase include Helper::Buildable def setup omit("Arrow Dataset is required") unless defined?(ArrowDataset) - @options = ArrowDataset::PartitioningOptions.new + @options = ArrowDataset::PartitioningFactoryOptions.new end def test_infer_dictionary diff --git a/c_glib/test/dataset/test-partitioning.rb b/c_glib/test/dataset/test-partitioning.rb index 2b33b1eaaac..a74a9bb7273 100644 --- a/c_glib/test/dataset/test-partitioning.rb +++ b/c_glib/test/dataset/test-partitioning.rb @@ -23,7 +23,7 @@ def setup end def test_default - assert_equal("default", ArrowDataset::Partitioning.new.type_name) + assert_equal("default", ArrowDataset::DefaultPartitioning.new.type_name) end def test_directory @@ -31,4 +31,31 @@ def test_directory partitioning = ArrowDataset::DirectoryPartitioning.new(schema) assert_equal("directory", partitioning.type_name) end + + def test_directory_options + schema = build_schema(year: Arrow::UInt16DataType.new) + options = ArrowDataset::KeyValuePartitioningOptions.new + options.segment_encoding = :none + partitioning = ArrowDataset::DirectoryPartitioning.new(schema, + nil, + options) + assert_equal("directory", partitioning.type_name) + end + + def test_hive + schema = build_schema(year: Arrow::UInt16DataType.new) + partitioning = ArrowDataset::HivePartitioning.new(schema) + assert_equal("hive", partitioning.type_name) + end + + def test_hive_options + schema = build_schema(year: Arrow::UInt16DataType.new) + options = ArrowDataset::HivePartitioningOptions.new + options.segment_encoding = :none + options.null_fallback = "NULL" + partitioning = ArrowDataset::HivePartitioning.new(schema, + nil, + options) + assert_equal("NULL", partitioning.null_fallback) + end end diff --git a/c_glib/test/helper/buildable.rb b/c_glib/test/helper/buildable.rb index 3a1240cfa1f..29d7b6ba03b 100644 --- a/c_glib/test/helper/buildable.rb +++ b/c_glib/test/helper/buildable.rb @@ -72,6 +72,10 @@ def build_uint64_array(values) build_array(Arrow::UInt64ArrayBuilder.new, values) end + def build_half_float_array(values) + build_array(Arrow::HalfFloatArrayBuilder.new, values) + end + def build_float_array(values) build_array(Arrow::FloatArrayBuilder.new, values) end diff --git a/c_glib/test/test-chunked-array.rb b/c_glib/test/test-chunked-array.rb index 8f912ac846b..86bd23af6f5 100644 --- a/c_glib/test/test-chunked-array.rb +++ b/c_glib/test/test-chunked-array.rb @@ -18,6 +18,12 @@ class TestChunkedArray < Test::Unit::TestCase include Helper::Buildable + def test_empty + chunked_array = Arrow::ChunkedArray.new(Arrow::BooleanDataType.new) + assert_equal(Arrow::BooleanDataType.new, + chunked_array.value_data_type) + end + def test_equal chunks1 = [ build_boolean_array([true, false]), diff --git a/c_glib/test/test-half-float-array.rb b/c_glib/test/test-half-float-array.rb new file mode 100644 index 00000000000..776efe631a9 --- /dev/null +++ b/c_glib/test/test-half-float-array.rb @@ -0,0 +1,66 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestHalfFloatArray < Test::Unit::TestCase + include Helper::Buildable + include Helper::Omittable + + def setup + @one = 0x3c00 + @zero = 0x0000 + @positive_infinity = 0x8c00 + end + + def test_new + values = [@one, @zero, @positive_infinity, nil] + data = values[0..-2].pack("S*") + null_bitmap = [0b0111].pack("C*") + assert_equal(build_half_float_array(values), + Arrow::HalfFloatArray.new(4, + Arrow::Buffer.new(data), + Arrow::Buffer.new(null_bitmap), + -1)) + end + + def test_buffer + builder = Arrow::HalfFloatArrayBuilder.new + builder.append_value(@one) + builder.append_value(@zero) + builder.append_value(@positive_infinity) + array = builder.finish + assert_equal([@one, @zero, @positive_infinity].pack("S*"), + array.buffer.data.to_s) + end + + def test_value + builder = Arrow::HalfFloatArrayBuilder.new + builder.append_value(@one) + array = builder.finish + assert_in_delta(@one, array.get_value(0)) + end + + def test_values + require_gi_bindings(3, 1, 7) + builder = Arrow::HalfFloatArrayBuilder.new + builder.append_value(@one) + builder.append_value(@zero) + builder.append_value(@positive_infinity) + array = builder.finish + assert_equal([@one, @zero, @positive_infinity], + array.values) + end +end diff --git a/c_glib/test/test-half-float-data-type.rb b/c_glib/test/test-half-float-data-type.rb new file mode 100644 index 00000000000..8656fb9309a --- /dev/null +++ b/c_glib/test/test-half-float-data-type.rb @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestHalfFloatDataType < Test::Unit::TestCase + def test_type + data_type = Arrow::HalfFloatDataType.new + assert_equal(Arrow::Type::HALF_FLOAT, data_type.id) + end + + def test_name + data_type = Arrow::HalfFloatDataType.new + assert_equal("halffloat", data_type.name) + end + + def test_to_s + data_type = Arrow::HalfFloatDataType.new + assert_equal("halffloat", data_type.to_s) + end +end diff --git a/cpp/src/arrow/python/ArrowPythonConfig.cmake.in b/c_glib/test/test-half-float-scalar.rb similarity index 53% rename from cpp/src/arrow/python/ArrowPythonConfig.cmake.in rename to c_glib/test/test-half-float-scalar.rb index 4cae0c2df5c..ac41f91ece6 100644 --- a/cpp/src/arrow/python/ArrowPythonConfig.cmake.in +++ b/c_glib/test/test-half-float-scalar.rb @@ -14,23 +14,37 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. -# -# This config sets the following variables in your project:: -# -# ArrowPython_FOUND - true if Arrow Python found on the system -# -# This config sets the following targets in your project:: -# -# arrow_python_shared - for linked as shared library if shared library is built -# arrow_python_static - for linked as static library if static library is built -@PACKAGE_INIT@ +class TestHalfFloatScalar < Test::Unit::TestCase + def setup + @half_float = 0x3c01 # 1.0009765625 + @scalar = Arrow::HalfFloatScalar.new(@half_float) + end + + def test_data_type + assert_equal(Arrow::HalfFloatDataType.new, + @scalar.data_type) + end + + def test_valid? + assert do + @scalar.valid? + end + end + + def test_equal + options = Arrow::EqualOptions.new + options.approx = true + assert do + @scalar.equal_options(Arrow::HalfFloatScalar.new(@half_float), options) + end + end -include(CMakeFindDependencyMacro) -find_dependency(Arrow) + def test_to_s + assert_equal("[\n #{@half_float}\n]", @scalar.to_s) + end -# Load targets only once. If we load targets multiple times, CMake reports -# already existent target error. -if(NOT (TARGET arrow_python_shared OR TARGET arrow_python_static)) - include("${CMAKE_CURRENT_LIST_DIR}/ArrowPythonTargets.cmake") -endif() + def test_value + assert_in_delta(@half_float, @scalar.value) + end +end diff --git a/c_glib/test/test-orc-file-reader.rb b/c_glib/test/test-orc-file-reader.rb index 38900cf12f3..6626c67c3ab 100644 --- a/c_glib/test/test-orc-file-reader.rb +++ b/c_glib/test/test-orc-file-reader.rb @@ -185,8 +185,8 @@ def all_columns test("select fields") do require_gi_bindings(3, 2, 6) @reader.field_indices = [1, 3] - assert_equal(build_table("boolean1" => build_boolean_array([false, true]), - "short1" => build_int16_array([1024, 2048])), + assert_equal(build_table("byte1" => build_int8_array([1, 100]), + "int1" => build_int32_array([65536, 65536])), @reader.read_stripes) end end @@ -200,10 +200,8 @@ def all_columns test("select fields") do require_gi_bindings(3, 2, 6) @reader.field_indices = [1, 3] - boolean1 = build_boolean_array([false, true]) - short1 = build_int16_array([1024, 2048]) - assert_equal(build_record_batch("boolean1" => boolean1, - "short1" => short1), + assert_equal(build_record_batch("byte1" => build_int8_array([1, 100]), + "int1" => build_int32_array([65536, 65536])), @reader.read_stripe(0)) end end diff --git a/c_glib/test/test-project-node.rb b/c_glib/test/test-project-node.rb new file mode 100644 index 00000000000..758f2254530 --- /dev/null +++ b/c_glib/test/test-project-node.rb @@ -0,0 +1,83 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
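The half-float tests added above drive the GLib builders with raw IEEE 754 binary16 bit patterns (0x3c00, 0x3c01, 0x0000) rather than Ruby floats, and the assertions compare those integers directly, so the GArrow half-float API appears to exchange the 16-bit patterns as-is. As a side note that is not part of the patch, a minimal Python sketch of the decoding those constants correspond to:

# Not part of the patch: minimal IEEE 754 binary16 decoding, to show what the
# raw constants used by the half-float tests above represent.
import struct

def decode_binary16(bits):
    # "e" is the half-float struct format code; "H" packs the 16-bit pattern.
    return struct.unpack("<e", struct.pack("<H", bits))[0]

assert decode_binary16(0x3C00) == 1.0           # sign 0, exponent 15 (bias 15), mantissa 0
assert decode_binary16(0x3C01) == 1.0009765625  # 1 + 2**-10, as used in the scalar test
assert decode_binary16(0x0000) == 0.0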
+ +class TestProjectNode < Test::Unit::TestCase + include Helper::Buildable + + def execute_plan(options) + plan = Arrow::ExecutePlan.new + numbers = build_int8_array([1, 2, 3, 4, 5]) + strings = build_string_array(["a", "b", "a", "b", "a"]) + table = build_table(number: numbers, + string: strings) + source_node_options = Arrow::SourceNodeOptions.new(table) + source_node = plan.build_source_node(source_node_options) + project_node = plan.build_project_node(source_node, options) + sink_node_options = Arrow::SinkNodeOptions.new + sink_node = plan.build_sink_node(project_node, + sink_node_options) + plan.validate + plan.start + plan.wait + reader = sink_node_options.get_reader(project_node.output_schema) + table = reader.read_all + plan.stop + table + end + + def test_expressions + three_scalar = Arrow::Int8Scalar.new(3) + three_datum = Arrow::ScalarDatum.new(three_scalar) + expressions = [ + Arrow::FieldExpression.new("number"), + Arrow::CallExpression.new("multiply", + [ + Arrow::FieldExpression.new("number"), + Arrow::LiteralExpression.new(three_datum), + ]), + ] + options = Arrow::ProjectNodeOptions.new(expressions) + assert_equal(build_table("number" => [ + build_int8_array([1, 2, 3, 4, 5]), + ], + "multiply(number, 3)" => [ + build_int8_array([3, 6, 9, 12, 15]), + ]), + execute_plan(options)) + end + + def test_names + three_scalar = Arrow::Int8Scalar.new(3) + three_datum = Arrow::ScalarDatum.new(three_scalar) + expressions = [ + Arrow::CallExpression.new("multiply", + [ + Arrow::FieldExpression.new("number"), + Arrow::LiteralExpression.new(three_datum), + ]), + Arrow::FieldExpression.new("number"), + ] + options = Arrow::ProjectNodeOptions.new(expressions, ["number * 3"]) + assert_equal(build_table("number * 3" => [ + build_int8_array([3, 6, 9, 12, 15]), + ], + "number" => [ + build_int8_array([1, 2, 3, 4, 5]), + ]), + execute_plan(options)) + end +end diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index e2c2d800d0f..0faac67a14a 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -31,51 +31,13 @@ set ARROW_DEBUG_MEMORY_POOL=trap set CMAKE_BUILD_PARALLEL_LEVEL=%NUMBER_OF_PROCESSORS% set CTEST_PARALLEL_LEVEL=%NUMBER_OF_PROCESSORS% -@rem -@rem In the configurations below we disable building the Arrow static library -@rem to save some time. Unfortunately this will still build the Parquet static -@rem library because of PARQUET-1420 (Thrift-generated symbols not exported in DLL). -@rem -if "%JOB%" == "Build_Debug" ( - mkdir cpp\build-debug - pushd cpp\build-debug - - cmake -G "%GENERATOR%" ^ - -DARROW_BOOST_USE_SHARED=OFF ^ - -DARROW_BUILD_EXAMPLES=ON ^ - -DARROW_BUILD_STATIC=OFF ^ - -DARROW_BUILD_TESTS=ON ^ - -DARROW_CXXFLAGS="/MP" ^ - -DARROW_ENABLE_TIMING_TESTS=OFF ^ - -DARROW_USE_PRECOMPILED_HEADERS=OFF ^ - -DARROW_VERBOSE_THIRDPARTY_BUILD=OFF ^ - -DCMAKE_BUILD_TYPE="Debug" ^ - -DCMAKE_UNITY_BUILD=ON ^ - .. || exit /B - - cmake --build . --config Debug || exit /B - ctest --output-on-failure || exit /B - popd - - @rem Finish Debug build successfully - exit /B 0 -) call activate arrow -@rem Use Boost from Anaconda -set BOOST_ROOT=%CONDA_PREFIX%\Library -set BOOST_LIBRARYDIR=%CONDA_PREFIX%\Library\lib - @rem The "main" C++ build script for Windows CI @rem (i.e. 
for usual configurations) -if "%JOB%" == "Toolchain" ( - set CMAKE_ARGS=-DARROW_DEPENDENCY_SOURCE=CONDA -DARROW_WITH_BZ2=ON -) else ( - @rem We're in a conda environment but don't want to use it for the dependencies - set CMAKE_ARGS=-DARROW_DEPENDENCY_SOURCE=AUTO -) +set CMAKE_ARGS=-DARROW_DEPENDENCY_SOURCE=CONDA -DARROW_WITH_BZ2=ON @rem Enable warnings-as-errors set ARROW_CXXFLAGS=/WX /MP @@ -98,16 +60,20 @@ cmake -G "%GENERATOR%" %CMAKE_ARGS% ^ -DARROW_BUILD_EXAMPLES=ON ^ -DARROW_BUILD_STATIC=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_COMPUTE=ON ^ -DARROW_CSV=ON ^ -DARROW_CXXFLAGS="%ARROW_CXXFLAGS%" ^ -DARROW_DATASET=ON ^ -DARROW_ENABLE_TIMING_TESTS=OFF ^ + -DARROW_FILESYSTEM=ON ^ -DARROW_FLIGHT=%ARROW_BUILD_FLIGHT% ^ -DARROW_FLIGHT_SQL=%ARROW_BUILD_FLIGHT_SQL% ^ -DARROW_GANDIVA=%ARROW_BUILD_GANDIVA% ^ + -DARROW_HDFS=ON ^ + -DARROW_JSON=ON ^ -DARROW_MIMALLOC=ON ^ + -DARROW_ORC=ON ^ -DARROW_PARQUET=ON ^ - -DARROW_PYTHON=ON ^ -DARROW_S3=%ARROW_S3% ^ -DARROW_SUBSTRAIT=ON ^ -DARROW_VERBOSE_THIRDPARTY_BUILD=OFF ^ @@ -117,8 +83,8 @@ cmake -G "%GENERATOR%" %CMAKE_ARGS% ^ -DARROW_WITH_ZLIB=ON ^ -DARROW_WITH_ZSTD=ON ^ -DCMAKE_BUILD_TYPE="Release" ^ - -DCMAKE_CXX_COMPILER=clcache ^ -DCMAKE_CXX_FLAGS_RELEASE="/MD /Od /UNDEBUG" ^ + -DCMAKE_CXX_STANDARD=17 ^ -DCMAKE_INSTALL_PREFIX=%CONDA_PREFIX%\Library ^ -DCMAKE_UNITY_BUILD=ON ^ -DCMAKE_VERBOSE_MAKEFILE=OFF ^ @@ -127,13 +93,11 @@ cmake -G "%GENERATOR%" %CMAKE_ARGS% ^ .. || exit /B cmake --build . --target install --config Release || exit /B -@rem Needed so arrow-python-test.exe works -set OLD_PYTHONHOME=%PYTHONHOME% -set PYTHONHOME=%CONDA_PREFIX% +@rem For ORC C++ +set TZDIR=%CONDA_PREFIX%\share\zoneinfo ctest --output-on-failure || exit /B -set PYTHONHOME=%OLD_PYTHONHOME% popd @rem @@ -153,6 +117,7 @@ set PYARROW_WITH_PARQUET=ON set PYARROW_WITH_PARQUET_ENCRYPTION=ON set PYARROW_WITH_S3=%ARROW_S3% set PYARROW_WITH_STATIC_BOOST=ON +set PYARROW_WITH_SUBSTRAIT=ON set ARROW_HOME=%CONDA_PREFIX%\Library @rem ARROW-3075; pkgconfig is broken for Parquet for now diff --git a/ci/appveyor-cpp-setup.bat b/ci/appveyor-cpp-setup.bat index 1fa126cb0d2..64f930a1613 100644 --- a/ci/appveyor-cpp-setup.bat +++ b/ci/appveyor-cpp-setup.bat @@ -17,9 +17,7 @@ @echo on -set "PATH=C:\Miniconda37-x64;C:\Miniconda37-x64\Scripts;C:\Miniconda37-x64\Library\bin;%PATH%" -set BOOST_ROOT=C:\Libraries\boost_1_67_0 -set BOOST_LIBRARYDIR=C:\Libraries\boost_1_67_0\lib64-msvc-14.0 +set "PATH=C:\Miniconda38-x64;C:\Miniconda38-x64\Scripts;C:\Miniconda38-x64\Library\bin;%PATH%" @rem @rem Avoid picking up AppVeyor-installed OpenSSL (linker errors with gRPC) @@ -31,6 +29,8 @@ rd /s /q C:\OpenSSL-v11-Win32 rd /s /q C:\OpenSSL-v11-Win64 rd /s /q C:\OpenSSL-v111-Win32 rd /s /q C:\OpenSSL-v111-Win64 +rd /s /q C:\OpenSSL-v30-Win32 +rd /s /q C:\OpenSSL-v30-Win64 @rem @rem Configure miniconda @@ -46,15 +46,14 @@ conda info -a @rem @rem Install mamba to the base environment @rem -conda install -q -y -c conda-forge mamba python=3.9 || exit /B +conda install -q -y -c conda-forge mamba python=%PYTHON% || exit /B @rem Update for newer CA certificates mamba update -q -y -c conda-forge --all || exit /B @rem -@rem Create conda environment for Build and Toolchain jobs +@rem Create conda environment @rem -@rem Avoid Boost 1.70 because of https://github.com/boostorg/process/issues/85 set CONDA_PACKAGES= @@ -62,54 +61,33 @@ if "%ARROW_BUILD_GANDIVA%" == "ON" ( @rem Install llvmdev in the toolchain if building gandiva.dll set CONDA_PACKAGES=%CONDA_PACKAGES% --file=ci\conda_env_gandiva_win.txt ) -if "%JOB%" == 
"Toolchain" ( - @rem Install pre-built "toolchain" packages for faster builds - set CONDA_PACKAGES=%CONDA_PACKAGES% --file=ci\conda_env_cpp.txt -) -if "%JOB%" NEQ "Build_Debug" ( - @rem Arrow conda environment is only required for the Build and Toolchain jobs - mamba create -n arrow -q -y -c conda-forge ^ - --file=ci\conda_env_python.txt ^ - %CONDA_PACKAGES% ^ - "cmake" ^ - "ninja" ^ - "nomkl" ^ - "pandas" ^ - "fsspec" ^ - "python=%PYTHON%" ^ - || exit /B -) +@rem Install pre-built "toolchain" packages for faster builds +set CONDA_PACKAGES=%CONDA_PACKAGES% --file=ci\conda_env_cpp.txt +@rem Arrow conda environment +mamba create -n arrow -q -y -c conda-forge ^ + --file=ci\conda_env_python.txt ^ + %CONDA_PACKAGES% ^ + "ccache" ^ + "cmake" ^ + "ninja" ^ + "nomkl" ^ + "pandas" ^ + "fsspec" ^ + "python=%PYTHON%" ^ + || exit /B @rem @rem Configure compiler @rem -if "%GENERATOR%"=="Ninja" set need_vcvarsall=1 -if defined need_vcvarsall ( - if "%APPVEYOR_BUILD_WORKER_IMAGE%" NEQ "Visual Studio 2017" ( - @rem ARROW-14070 Visual Studio 2015 no longer supported - exit /B - ) - call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" amd64 - set CC=cl.exe - set CXX=cl.exe -) - -@rem -@rem Use clcache for faster builds -@rem - -pip install -q git+https://github.com/Nuitka/clcache.git || exit /B -@rem Limit cache size to 500 MB -clcache -M 500000000 -clcache -c -clcache -s -powershell.exe -Command "Start-Process clcache-server" || exit /B +call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" amd64 +set CC=cl.exe +set CXX=cl.exe @rem @rem Download Minio somewhere on PATH, for unit tests @rem if "%ARROW_S3%" == "ON" ( - appveyor DownloadFile https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z -FileName C:\Windows\Minio.exe || exit /B + appveyor DownloadFile https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z -FileName C:\Windows\Minio.exe || exit /B ) diff --git a/ci/conan/all/conandata.yml b/ci/conan/all/conandata.yml index 942a3eba7a6..a446cf7e3b9 100644 --- a/ci/conan/all/conandata.yml +++ b/ci/conan/all/conandata.yml @@ -21,6 +21,9 @@ # SOFTWARE. 
sources: + "10.0.0": + url: "https://github.com/apache/arrow/archive/apache-arrow-10.0.0.tar.gz" + sha256: "2852b21f93ee84185a9d838809c9a9c41bf6deca741bed1744e0fdba6cc19e3f" "8.0.1": url: "https://github.com/apache/arrow/archive/apache-arrow-8.0.1.tar.gz" sha256: "e4c86329be769f2c8778aacc8d6220a9a13c90d59d4988f9349d51299dacbd11" @@ -37,60 +40,66 @@ sources: url: "https://github.com/apache/arrow/archive/apache-arrow-1.0.0.tar.gz" sha256: "08fbd4c633c08939850d619ca0224c75d7a0526467c721c0838b8aa7efccb270" patches: + "10.0.0": + - patch_file: "patches/10.0.0-0001-mallctl-takes-size_t.patch" + patch_description: "use size_t instead of ssize_t" + patch_type: "backport" + - patch_file: "patches/10.0.0-0002-fix-cmake.patch" + patch_description: "use cci package" + patch_type: "conan" "8.0.1": - - base_path: "source_subfolder" - patch_file: "patches/8.0.0-0001-cmake.patch" - - base_path: "source_subfolder" - patch_file: "patches/8.0.0-0002-jemalloc.patch" - - base_path: "source_subfolder" - patch_file: "patches/8.0.0-0003-mallctl-takes-size_t.patch" - - base_path: "source_subfolder" - patch_file: "patches/8.0.0-0004-use-find-package.patch" - - base_path: "source_subfolder" - patch_file: "patches/8.0.0-0005-install-utils.patch" + - patch_file: "patches/8.0.0-0003-mallctl-takes-size_t.patch" + patch_description: "use size_t instead of ssize_t" + patch_type: "backport" + - patch_file: "patches/8.0.0-0005-install-utils.patch" + patch_description: "enable utilis installation" + patch_type: "conan" + - patch_file: "patches/8.0.0-0006-fix-cmake.patch" + patch_description: "use cci package" + patch_type: "conan" "8.0.0": - - base_path: "source_subfolder" - patch_file: "patches/8.0.0-0001-cmake.patch" - - base_path: "source_subfolder" - patch_file: "patches/8.0.0-0002-jemalloc.patch" - - base_path: "source_subfolder" - patch_file: "patches/8.0.0-0003-mallctl-takes-size_t.patch" - - base_path: "source_subfolder" - patch_file: "patches/8.0.0-0004-use-find-package.patch" - - base_path: "source_subfolder" - patch_file: "patches/8.0.0-0005-install-utils.patch" + - patch_file: "patches/8.0.0-0003-mallctl-takes-size_t.patch" + patch_description: "use size_t instead of ssize_t" + patch_type: "backport" + - patch_file: "patches/8.0.0-0005-install-utils.patch" + patch_description: "enable utilis installation" + patch_type: "conan" + - patch_file: "patches/8.0.0-0006-fix-cmake.patch" + patch_description: "use cci package" + patch_type: "conan" "7.0.0": - - base_path: "source_subfolder" - patch_file: "patches/7.0.0-0001-cmake.patch" - - base_path: "source_subfolder" - patch_file: "patches/7.0.0-0002-jemalloc.patch" - - base_path: "source_subfolder" - patch_file: "patches/7.0.0-0003-mallctl-takes-size_t.patch" - - base_path: "source_subfolder" - patch_file: "patches/7.0.0-0005-use-find-package.patch" - - base_path: "source_subfolder" - patch_file: "patches/7.0.0-0006-install-utils.patch" + - patch_file: "patches/7.0.0-0003-mallctl-takes-size_t.patch" + patch_description: "use size_t instead of ssize_t" + patch_type: "backport" + - patch_file: "patches/7.0.0-0006-install-utils.patch" + patch_description: "enable utilis installation" + patch_type: "conan" + - patch_file: "patches/7.0.0-0007-fix-cmake.patch" + patch_description: "use cci package" + patch_type: "conan" "2.0.0": - - base_path: "source_subfolder" - patch_file: "patches/2.0.0-0001-cmake.patch" - - base_path: "source_subfolder" - patch_file: "patches/2.0.0-0002-jemalloc.patch" - - base_path: "source_subfolder" - patch_file: 
"patches/2.0.0-0003-fix-shared-msvc.patch" - - base_path: "source_subfolder" - patch_file: "patches/1.0.0-0004-mallctl-takes-size_t.patch" - - base_path: "source_subfolder" - patch_file: "patches/2.0.0-0005-gandiva-engine.patch" - - base_path: "source_subfolder" - patch_file: "patches/2.0.0-0006-gandiva-llvm-re2.patch" - - base_path: "source_subfolder" - patch_file: "patches/2.0.0-0007-fix-protoc-cmake.patch" + - patch_file: "patches/2.0.0-0003-fix-shared-msvc.patch" + patch_description: "make shared enabled in msvc" + patch_type: "backport" + - patch_file: "patches/1.0.0-0004-mallctl-takes-size_t.patch" + patch_description: "use size_t instead of ssize_t" + patch_type: "backport" + - patch_file: "patches/2.0.0-0005-gandiva-engine.patch" + patch_description: "fix grandiva compilation error" + patch_type: "backport" + - patch_file: "patches/2.0.0-0008-fix-cmake.patch" + patch_description: "use cci package" + patch_type: "conan" "1.0.0": - - base_path: "source_subfolder" - patch_file: "patches/1.0.0-0001-cmake.patch" - - base_path: "source_subfolder" - patch_file: "patches/1.0.0-0002-jemalloc.patch" - - base_path: "source_subfolder" - patch_file: "patches/1.0.0-0003-fix-shared-msvc.patch" - - base_path: "source_subfolder" - patch_file: "patches/1.0.0-0004-mallctl-takes-size_t.patch" + - patch_file: "patches/1.0.0-0003-fix-shared-msvc.patch" + patch_description: "make shared enabled in msvc" + patch_type: "backport" + - patch_file: "patches/1.0.0-0004-mallctl-takes-size_t.patch" + patch_description: "use size_t instead of ssize_t" + patch_type: "backport" + - patch_file: "patches/1.0.0-0005-fix-make12-namespace.patch" + patch_description: "fix ambiguous `make12` function between std and date" + patch_type: "backport" + - patch_file: "patches/1.0.0-0006-fix-cmake.patch" + patch_description: "use cci package" + patch_type: "conan" diff --git a/ci/conan/all/conanfile.py b/ci/conan/all/conanfile.py index a87478d6e40..1489040ff76 100644 --- a/ci/conan/all/conanfile.py +++ b/ci/conan/all/conanfile.py @@ -20,13 +20,18 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-from conans import ConanFile, tools, CMake -from conans.errors import ConanInvalidConfiguration +from conan import ConanFile +from conan.errors import ConanInvalidConfiguration +from conan.tools.microsoft import is_msvc_static_runtime, is_msvc, check_min_vs +from conan.tools.files import export_conandata_patches, apply_conandata_patches, get, copy, rmdir +from conan.tools.build import check_min_cppstd, cross_building +from conan.tools.scm import Version +from conan.tools.cmake import CMake, CMakeDeps, CMakeToolchain, cmake_layout + import os import glob -required_conan_version = ">=1.33.0" - +required_conan_version = ">=1.53.0" class ArrowConan(ConanFile): name = "arrow" @@ -64,6 +69,7 @@ class ArrowConan(ConanFile): "with_glog": ["auto", True, False], "with_grpc": ["auto", True, False], "with_jemalloc": ["auto", True, False], + "with_mimalloc": ["auto", True, False], "with_json": [True, False], "with_llvm": ["auto", True, False], "with_openssl": ["auto", True, False], @@ -108,6 +114,7 @@ class ArrowConan(ConanFile): "with_gcs": False, "with_gflags": "auto", "with_jemalloc": "auto", + "with_mimalloc": False, "with_glog": "auto", "with_grpc": "auto", "with_json": False, @@ -124,40 +131,61 @@ class ArrowConan(ConanFile): "with_zlib": False, "with_zstd": False, } - generators = "cmake", "cmake_find_package_multi" short_paths = True - _cmake = None + @property + def _minimum_cpp_standard(self): + # arrow >= 10.0.0 requires C++17. + # https://github.com/apache/arrow/pull/13991 + return 11 if Version(self.version) < "10.0.0" else 17 @property - def _source_subfolder(self): - return "source_subfolder" + def _compilers_minimum_version(self): + return { + "gcc": "8", + "clang": "7", + "apple-clang": "10", + } def export_sources(self): - self.copy("CMakeLists.txt") - for patch in self.conan_data.get("patches", {}).get(self.version, []): - self.copy(patch["patch_file"]) + export_conandata_patches(self) def config_options(self): if self.settings.os == "Windows": del self.options.fPIC - if tools.Version(self.version) < "2.0.0": + if Version(self.version) < "2.0.0": del self.options.simd_level del self.options.runtime_simd_level - elif tools.Version(self.version) < "6.0.0": + elif Version(self.version) < "6.0.0": self.options.simd_level = "sse4_2" - if tools.Version(self.version) < "6.0.0": + if Version(self.version) < "6.0.0": del self.options.with_gcs - if tools.Version(self.version) < "7.0.0": + if Version(self.version) < "7.0.0": del self.options.skyhook del self.options.with_flight_sql del self.options.with_opentelemetry - if tools.Version(self.version) < "8.0.0": + if Version(self.version) < "8.0.0": del self.options.substrait + def configure(self): + if self.options.shared: + self.options.rm_safe("fPIC") + def validate(self): - if self.settings.compiler == "clang" and self.settings.compiler.version <= tools.Version("3.9"): - raise ConanInvalidConfiguration("This recipe does not support this compiler version") + if self.info.settings.compiler.cppstd: + check_min_cppstd(self, self._minimum_cpp_standard) + + if self._minimum_cpp_standard == 11: + if self.info.settings.compiler == "clang" and self.info.settings.compiler.version <= Version("3.9"): + raise ConanInvalidConfiguration("This recipe does not support this compiler version") + else: + check_min_vs(self, 191) + if not is_msvc(self): + minimum_version = self._compilers_minimum_version.get(str(self.info.settings.compiler), False) + if minimum_version and Version(self.info.settings.compiler.version) < minimum_version: + raise 
ConanInvalidConfiguration( + f"{self.ref} requires C++{self._minimum_cpp_standard}, which your compiler does not support." + ) if self.options.shared: del self.options.fPIC @@ -186,7 +214,7 @@ def validate(self): if self.options.with_openssl == False and self._with_openssl(True): raise ConanInvalidConfiguration("with_openssl options is required (or choose auto)") if self.options.with_llvm == False and self._with_llvm(True): - raise ConanInvalidConfiguration("with_openssl options is required (or choose auto)") + raise ConanInvalidConfiguration("with_llvm options is required (or choose auto)") if self.options.with_cuda: raise ConanInvalidConfiguration("CCI has no cuda recipe (yet)") if self.options.with_orc: @@ -198,8 +226,11 @@ def validate(self): if self.options["jemalloc"].enable_cxx: raise ConanInvalidConfiguration("jemmalloc.enable_cxx of a static jemalloc must be disabled") - if tools.Version(self.version) < "6.0.0" and self.options.get_safe("simd_level") == "default": - raise ConanInvalidConfiguration("In {}/{}, simd_level options is not supported `default` value.".format(self.name, self.version)) + if Version(self.version) < "6.0.0" and self.options.get_safe("simd_level") == "default": + raise ConanInvalidConfiguration(f"In {self.ref}, simd_level options is not supported `default` value.") + + def layout(self): + cmake_layout(self, src_folder="src") def _compute(self, required=False): if required or self.options.compute == "auto": @@ -227,7 +258,11 @@ def _with_jemalloc(self, required=False): def _with_re2(self, required=False): if required or self.options.with_re2 == "auto": - return bool(self.options.gandiva) or bool(self._compute()) + if self.options.gandiva or self.options.parquet: + return True + if Version(self) >= "7.0.0" and (self._compute() or self._dataset_modules()): + return True + return False else: return bool(self.options.with_re2) @@ -265,12 +300,12 @@ def _with_boost(self, required=False): if required or self.options.with_boost == "auto": if self.options.gandiva: return True - version = tools.Version(self.version) + version = Version(self.version) if version.major == "1": - if self._parquet() and self.settings.compiler == "gcc" and self.settings.compiler.version < tools.Version("4.9"): + if self._parquet() and self.settings.compiler == "gcc" and self.settings.compiler.version < Version("4.9"): return True elif version.major >= "2": - if self.settings.compiler == "Visual Studio": + if is_msvc(self): return True return False else: @@ -298,15 +333,24 @@ def _with_openssl(self, required=False): else: return bool(self.options.with_openssl) + def _with_rapidjson(self): + if self.options.with_json: + return True + if Version(self.version) >= "7.0.0" and self.options.encryption: + return True + return False + def requirements(self): if self._with_thrift(): - self.requires("thrift/0.16.0") + self.requires("thrift/0.17.0") if self._with_protobuf(): - self.requires("protobuf/3.21.1") + self.requires("protobuf/3.21.4") if self._with_jemalloc(): - self.requires("jemalloc/5.2.1") + self.requires("jemalloc/5.3.0") + if self.options.with_mimalloc: + self.requires("mimalloc/1.7.6") if self._with_boost(): - self.requires("boost/1.79.0") + self.requires("boost/1.80.0") if self._with_gflags(): self.requires("gflags/2.2.2") if self._with_glog(): @@ -314,19 +358,19 @@ def requirements(self): if self.options.get_safe("with_gcs"): self.requires("google-cloud-cpp/1.40.1") if self._with_grpc(): - self.requires("grpc/1.47.0") - if self.options.with_json: + self.requires("grpc/1.50.0") + if 
self._with_rapidjson(): self.requires("rapidjson/1.1.0") if self._with_llvm(): self.requires("llvm-core/13.0.0") if self._with_openssl(): - # aws-sdk-cpp/grpc requires openssl/1.1.1. it uses deprecated functions in openssl/3.0.0 - if self.options.with_s3 or self._with_flight_rpc(): - self.requires("openssl/1.1.1q") + # aws-sdk-cpp requires openssl/1.1.1. it uses deprecated functions in openssl/3.0.0 + if self.options.with_s3: + self.requires("openssl/1.1.1s") else: - self.requires("openssl/3.0.5") + self.requires("openssl/1.1.1s") if self.options.get_safe("with_opentelemetry"): - self.requires("opentelemetry-cpp/1.4.1") + self.requires("opentelemetry-cpp/1.7.0") if self.options.with_s3: self.requires("aws-sdk-cpp/1.9.234") if self.options.with_brotli: @@ -334,21 +378,21 @@ def requirements(self): if self.options.with_bz2: self.requires("bzip2/1.0.8") if self.options.with_lz4: - self.requires("lz4/1.9.3") + self.requires("lz4/1.9.4") if self.options.with_snappy: self.requires("snappy/1.1.9") - if tools.Version(self.version) >= "6.0.0" and \ + if Version(self.version) >= "6.0.0" and \ self.options.get_safe("simd_level") != None or \ self.options.get_safe("runtime_simd_level") != None: - self.requires("xsimd/8.1.0") + self.requires("xsimd/9.0.1") if self.options.with_zlib: - self.requires("zlib/1.2.12") + self.requires("zlib/1.2.13") if self.options.with_zstd: self.requires("zstd/1.5.2") if self._with_re2(): - self.requires("re2/20220201") + self.requires("re2/20220601") if self._with_utf8proc(): - self.requires("utf8proc/2.7.0") + self.requires("utf8proc/2.8.0") if self.options.with_backtrace: self.requires("libbacktrace/cci.20210118") @@ -360,9 +404,9 @@ def source(self): import shutil top_level = os.environ.get("ARROW_HOME") shutil.copytree(os.path.join(top_level, "cpp"), - os.path.join(self._source_subfolder, "cpp")) + os.path.join(self.source_folder, "cpp")) shutil.copytree(os.path.join(top_level, "format"), - os.path.join(self._source_subfolder, "format")) + os.path.join(self.source_folder, "format")) top_level_files = [ ".env", "LICENSE.txt", @@ -370,175 +414,176 @@ def source(self): ] for top_level_file in top_level_files: shutil.copy(os.path.join(top_level, top_level_file), - self._source_subfolder) + self.source_folder) return # END - tools.get(**self.conan_data["sources"][self.version], - destination=self._source_subfolder, strip_root=True) - - def _configure_cmake(self): - if self._cmake: - return self._cmake - self._cmake = CMake(self) - self._cmake.definitions["CMAKE_FIND_PACKAGE_PREFER_CONFIG"] = True - if tools.cross_building(self): + get(self, **self.conan_data["sources"][self.version], + destination=self.source_folder, strip_root=True) + + def generate(self): + # BUILD_SHARED_LIBS and POSITION_INDEPENDENT_CODE are automatically parsed when self.options.shared or self.options.fPIC exist + tc = CMakeToolchain(self) + if cross_building(self): cmake_system_processor = { "armv8": "aarch64", "armv8.3": "aarch64", }.get(str(self.settings.arch), str(self.settings.arch)) - self._cmake.definitions["CMAKE_SYSTEM_PROCESSOR"] = cmake_system_processor - if self.settings.compiler == "Visual Studio": - self._cmake.definitions["ARROW_USE_STATIC_CRT"] = "MT" in str(self.settings.compiler.runtime) - self._cmake.definitions["ARROW_DEFINE_OPTIONS"] = True - self._cmake.definitions["ARROW_DEPENDENCY_SOURCE"] = "SYSTEM" - self._cmake.definitions["ARROW_GANDIVA"] = self.options.gandiva - self._cmake.definitions["ARROW_PARQUET"] = self._parquet() - self._cmake.definitions["ARROW_SUBSTRAIT"] = 
self.options.get_safe("substrait", False) - self._cmake.definitions["ARROW_PLASMA"] = self.options.plasma - self._cmake.definitions["ARROW_DATASET"] = self._dataset_modules() - self._cmake.definitions["ARROW_FILESYSTEM"] = self.options.filesystem_layer - self._cmake.definitions["PARQUET_REQUIRE_ENCRYPTION"] = self.options.encryption - self._cmake.definitions["ARROW_HDFS"] = self.options.hdfs_bridgs - self._cmake.definitions["ARROW_VERBOSE_THIRDPARTY_BUILD"] = True - self._cmake.definitions["ARROW_BUILD_SHARED"] = self.options.shared - self._cmake.definitions["ARROW_BUILD_STATIC"] = not self.options.shared - self._cmake.definitions["ARROW_NO_DEPRECATED_API"] = not self.options.deprecated - self._cmake.definitions["ARROW_FLIGHT"] = self._with_flight_rpc() - self._cmake.definitions["ARROW_FLIGHT_SQL"] = self.options.get_safe("with_flight_sql", False) - self._cmake.definitions["ARROW_COMPUTE"] = self._compute() - self._cmake.definitions["ARROW_CSV"] = self.options.with_csv - self._cmake.definitions["ARROW_CUDA"] = self.options.with_cuda - self._cmake.definitions["ARROW_JEMALLOC"] = self._with_jemalloc() - self._cmake.definitions["jemalloc_SOURCE"] = "SYSTEM" - self._cmake.definitions["ARROW_JSON"] = self.options.with_json - - self._cmake.definitions["BOOST_SOURCE"] = "SYSTEM" - self._cmake.definitions["Protobuf_SOURCE"] = "SYSTEM" + tc.variables["CMAKE_SYSTEM_PROCESSOR"] = cmake_system_processor + if cmake_system_processor == "aarch64": + tc.variables["ARROW_CPU_FLAG"] = "armv8" + if is_msvc(self): + tc.variables["ARROW_USE_STATIC_CRT"] = is_msvc_static_runtime(self) + tc.variables["ARROW_DEPENDENCY_SOURCE"] = "SYSTEM" + tc.variables["ARROW_PACKAGE_KIND"] = "conan" + tc.variables["ARROW_GANDIVA"] = bool(self.options.gandiva) + tc.variables["ARROW_PARQUET"] = self._parquet() + tc.variables["ARROW_SUBSTRAIT"] = bool(self.options.get_safe("substrait", False)) + tc.variables["ARROW_PLASMA"] = bool(self.options.plasma) + tc.variables["ARROW_DATASET"] = self._dataset_modules() + tc.variables["ARROW_FILESYSTEM"] = bool(self.options.filesystem_layer) + tc.variables["PARQUET_REQUIRE_ENCRYPTION"] = bool(self.options.encryption) + tc.variables["ARROW_HDFS"] = bool(self.options.hdfs_bridgs) + tc.variables["ARROW_VERBOSE_THIRDPARTY_BUILD"] = True + tc.variables["ARROW_BUILD_SHARED"] = bool(self.options.shared) + tc.variables["ARROW_BUILD_STATIC"] = not bool(self.options.shared) + tc.variables["ARROW_NO_DEPRECATED_API"] = not bool(self.options.deprecated) + tc.variables["ARROW_FLIGHT"] = self._with_flight_rpc() + tc.variables["ARROW_FLIGHT_SQL"] = bool(self.options.get_safe("with_flight_sql", False)) + tc.variables["ARROW_COMPUTE"] = self._compute() + tc.variables["ARROW_CSV"] = bool(self.options.with_csv) + tc.variables["ARROW_CUDA"] = bool(self.options.with_cuda) + tc.variables["ARROW_JEMALLOC"] = self._with_jemalloc() + tc.variables["ARROW_MIMALLOC"] = bool(self.options.with_mimalloc) + tc.variables["jemalloc_SOURCE"] = "SYSTEM" + tc.variables["ARROW_JSON"] = bool(self.options.with_json) + tc.variables["google_cloud_cpp_SOURCE"] = "SYSTEM" + tc.variables["ARROW_GCS"] = bool(self.options.get_safe("with_gcs", False)) + tc.variables["BOOST_SOURCE"] = "SYSTEM" + tc.variables["Protobuf_SOURCE"] = "SYSTEM" if self._with_protobuf(): - self._cmake.definitions["ARROW_PROTOBUF_USE_SHARED"] = self.options["protobuf"].shared - self._cmake.definitions["gRPC_SOURCE"] = "SYSTEM" + tc.variables["ARROW_PROTOBUF_USE_SHARED"] = bool(self.options["protobuf"].shared) + tc.variables["gRPC_SOURCE"] = "SYSTEM" if 
self._with_grpc(): - self._cmake.definitions["ARROW_GRPC_USE_SHARED"] = self.options["grpc"].shared - self._cmake.definitions["ARROW_HDFS"] = self.options.hdfs_bridgs - self._cmake.definitions["ARROW_USE_GLOG"] = self._with_glog() - self._cmake.definitions["GLOG_SOURCE"] = "SYSTEM" - self._cmake.definitions["ARROW_WITH_BACKTRACE"] = self.options.with_backtrace - self._cmake.definitions["ARROW_WITH_BROTLI"] = self.options.with_brotli - self._cmake.definitions["Brotli_SOURCE"] = "SYSTEM" + tc.variables["ARROW_GRPC_USE_SHARED"] = bool(self.options["grpc"].shared) + + tc.variables["ARROW_USE_GLOG"] = self._with_glog() + tc.variables["GLOG_SOURCE"] = "SYSTEM" + tc.variables["ARROW_WITH_BACKTRACE"] = bool(self.options.with_backtrace) + tc.variables["ARROW_WITH_BROTLI"] = bool(self.options.with_brotli) + tc.variables["brotli_SOURCE"] = "SYSTEM" if self.options.with_brotli: - self._cmake.definitions["ARROW_BROTLI_USE_SHARED"] = self.options["brotli"].shared - self._cmake.definitions["gflags_SOURCE"] = "SYSTEM" + tc.variables["ARROW_BROTLI_USE_SHARED"] = bool(self.options["brotli"].shared) + tc.variables["gflags_SOURCE"] = "SYSTEM" if self._with_gflags(): - self._cmake.definitions["ARROW_GFLAGS_USE_SHARED"] = self.options["gflags"].shared - self._cmake.definitions["ARROW_WITH_BZ2"] = self.options.with_bz2 - self._cmake.definitions["BZip2_SOURCE"] = "SYSTEM" + tc.variables["ARROW_GFLAGS_USE_SHARED"] = bool(self.options["gflags"].shared) + tc.variables["ARROW_WITH_BZ2"] = bool(self.options.with_bz2) + tc.variables["BZip2_SOURCE"] = "SYSTEM" if self.options.with_bz2: - self._cmake.definitions["ARROW_BZ2_USE_SHARED"] = self.options["bzip2"].shared - self._cmake.definitions["ARROW_WITH_LZ4"] = self.options.with_lz4 - if tools.Version(self.version) >= "9.0.0": - self._cmake.definitions["lz4_SOURCE"] = "SYSTEM" - else: - self._cmake.definitions["Lz4_SOURCE"] = "SYSTEM" + tc.variables["ARROW_BZ2_USE_SHARED"] = bool(self.options["bzip2"].shared) + tc.variables["ARROW_WITH_LZ4"] = bool(self.options.with_lz4) + tc.variables["lz4_SOURCE"] = "SYSTEM" if self.options.with_lz4: - self._cmake.definitions["ARROW_LZ4_USE_SHARED"] = self.options["lz4"].shared - self._cmake.definitions["ARROW_WITH_SNAPPY"] = self.options.with_snappy - self._cmake.definitions["Snappy_SOURCE"] = "SYSTEM" + tc.variables["ARROW_LZ4_USE_SHARED"] = bool(self.options["lz4"].shared) + tc.variables["ARROW_WITH_SNAPPY"] = bool(self.options.with_snappy) + tc.variables["RapidJSON_SOURCE"] = "SYSTEM" + tc.variables["Snappy_SOURCE"] = "SYSTEM" if self.options.with_snappy: - self._cmake.definitions["ARROW_SNAPPY_USE_SHARED"] = self.options["snappy"].shared - self._cmake.definitions["ARROW_WITH_ZLIB"] = self.options.with_zlib - self._cmake.definitions["RE2_SOURCE"] = "SYSTEM" - self._cmake.definitions["ZLIB_SOURCE"] = "SYSTEM" - - self._cmake.definitions["ARROW_WITH_ZSTD"] = self.options.with_zstd - if tools.Version(self.version) >= "2.0": - self._cmake.definitions["zstd_SOURCE"] = "SYSTEM" - self._cmake.definitions["ARROW_SIMD_LEVEL"] = str(self.options.simd_level).upper() - self._cmake.definitions["ARROW_RUNTIME_SIMD_LEVEL"] = str(self.options.runtime_simd_level).upper() + tc.variables["ARROW_SNAPPY_USE_SHARED"] = bool(self.options["snappy"].shared) + tc.variables["ARROW_WITH_ZLIB"] = bool(self.options.with_zlib) + tc.variables["re2_SOURCE"] = "SYSTEM" + tc.variables["ZLIB_SOURCE"] = "SYSTEM" + tc.variables["xsimd_SOURCE"] = "SYSTEM" + tc.variables["ARROW_WITH_ZSTD"] = bool(self.options.with_zstd) + if Version(self.version) >= "2.0": + 
tc.variables["zstd_SOURCE"] = "SYSTEM" + tc.variables["ARROW_SIMD_LEVEL"] = str(self.options.simd_level).upper() + tc.variables["ARROW_RUNTIME_SIMD_LEVEL"] = str(self.options.runtime_simd_level).upper() else: - self._cmake.definitions["ZSTD_SOURCE"] = "SYSTEM" + tc.variables["ZSTD_SOURCE"] = "SYSTEM" if self.options.with_zstd: - self._cmake.definitions["ARROW_ZSTD_USE_SHARED"] = self.options["zstd"].shared - self._cmake.definitions["ORC_SOURCE"] = "SYSTEM" - self._cmake.definitions["ARROW_WITH_THRIFT"] = self._with_thrift() - self._cmake.definitions["Thrift_SOURCE"] = "SYSTEM" + tc.variables["ARROW_ZSTD_USE_SHARED"] = bool(self.options["zstd"].shared) + tc.variables["ORC_SOURCE"] = "SYSTEM" + tc.variables["ARROW_WITH_THRIFT"] = self._with_thrift() + tc.variables["Thrift_SOURCE"] = "SYSTEM" if self._with_thrift(): - self._cmake.definitions["THRIFT_VERSION"] = self.deps_cpp_info["thrift"].version # a recent thrift does not require boost - self._cmake.definitions["ARROW_THRIFT_USE_SHARED"] = self.options["thrift"].shared - self._cmake.definitions["ARROW_USE_OPENSSL"] = self._with_openssl() + tc.variables["THRIFT_VERSION"] = bool(self.deps_cpp_info["thrift"].version) # a recent thrift does not require boost + tc.variables["ARROW_THRIFT_USE_SHARED"] = bool(self.options["thrift"].shared) + tc.variables["ARROW_USE_OPENSSL"] = self._with_openssl() if self._with_openssl(): - self._cmake.definitions["OPENSSL_ROOT_DIR"] = self.deps_cpp_info["openssl"].rootpath.replace("\\", "/") - self._cmake.definitions["ARROW_OPENSSL_USE_SHARED"] = self.options["openssl"].shared + tc.variables["OPENSSL_ROOT_DIR"] = self.deps_cpp_info["openssl"].rootpath.replace("\\", "/") + tc.variables["ARROW_OPENSSL_USE_SHARED"] = bool(self.options["openssl"].shared) if self._with_boost(): - self._cmake.definitions["ARROW_BOOST_USE_SHARED"] = self.options["boost"].shared - self._cmake.definitions["ARROW_S3"] = self.options.with_s3 - self._cmake.definitions["AWSSDK_SOURCE"] = "SYSTEM" - - self._cmake.definitions["ARROW_BUILD_UTILITIES"] = self.options.cli - self._cmake.definitions["ARROW_BUILD_INTEGRATION"] = False - self._cmake.definitions["ARROW_INSTALL_NAME_RPATH"] = False - self._cmake.definitions["ARROW_BUILD_EXAMPLES"] = False - self._cmake.definitions["ARROW_BUILD_TESTS"] = False - self._cmake.definitions["ARROW_ENABLE_TIMING_TESTS"] = False - self._cmake.definitions["ARROW_BUILD_BENCHMARKS"] = False - self._cmake.definitions["LLVM_SOURCE"] = "SYSTEM" - self._cmake.definitions["ARROW_WITH_UTF8PROC"] = self._with_utf8proc() - self._cmake.definitions["utf8proc_SOURCE"] = "SYSTEM" + tc.variables["ARROW_USE_BOOST"] = True + tc.variables["ARROW_BOOST_USE_SHARED"] = bool(self.options["boost"].shared) + tc.variables["ARROW_S3"] = bool(self.options.with_s3) + tc.variables["AWSSDK_SOURCE"] = "SYSTEM" + tc.variables["ARROW_BUILD_UTILITIES"] = bool(self.options.cli) + tc.variables["ARROW_BUILD_INTEGRATION"] = False + tc.variables["ARROW_INSTALL_NAME_RPATH"] = False + tc.variables["ARROW_BUILD_EXAMPLES"] = False + tc.variables["ARROW_BUILD_TESTS"] = False + tc.variables["ARROW_ENABLE_TIMING_TESTS"] = False + tc.variables["ARROW_BUILD_BENCHMARKS"] = False + tc.variables["LLVM_SOURCE"] = "SYSTEM" + tc.variables["ARROW_WITH_UTF8PROC"] = self._with_utf8proc() + tc.variables["ARROW_BOOST_REQUIRED"] = self._with_boost() + tc.variables["utf8proc_SOURCE"] = "SYSTEM" if self._with_utf8proc(): - self._cmake.definitions["ARROW_UTF8PROC_USE_SHARED"] = self.options["utf8proc"].shared - self._cmake.definitions["BUILD_WARNING_LEVEL"] = "PRODUCTION" - 
if self.settings.compiler == "Visual Studio": - self._cmake.definitions["ARROW_USE_STATIC_CRT"] = "MT" in str(self.settings.compiler.runtime) - + tc.variables["ARROW_UTF8PROC_USE_SHARED"] = bool(self.options["utf8proc"].shared) + tc.variables["BUILD_WARNING_LEVEL"] = "PRODUCTION" + if is_msvc(self): + tc.variables["ARROW_USE_STATIC_CRT"] = "MT" in str(self.settings.compiler.runtime) if self._with_llvm(): - self._cmake.definitions["LLVM_DIR"] = self.deps_cpp_info["llvm-core"].rootpath.replace("\\", "/") - self._cmake.configure() - return self._cmake + tc.variables["LLVM_DIR"] = self.deps_cpp_info["llvm-core"].rootpath.replace("\\", "/") + tc.generate() + + deps = CMakeDeps(self) + deps.generate() def _patch_sources(self): - for patch in self.conan_data.get("patches", {}).get(self.version, []): - tools.patch(**patch) - # if tools.Version(self.version) >= "7.0.0": - # for filename in glob.glob(os.path.join(self._source_subfolder, "cpp", "cmake_modules", "Find*.cmake")): - # if os.path.basename(filename) not in [ - # "FindArrow.cmake", - # "FindArrowCUDA.cmake", - # "FindArrowDataset.cmake", - # "FindArrowFlight.cmake", - # "FindArrowFlightSql.cmake", - # "FindArrowFlightTesting.cmake", - # "FindArrowPython.cmake", - # "FindArrowPythonFlight.cmake", - # "FindArrowSubstrait.cmake", - # "FindArrowTesting.cmake", - # "FindGandiva.cmake", - # "FindParquet.cmake", - # "FindPlasma.cmake", - # ]: - # os.remove(filename) + apply_conandata_patches(self) + if Version(self.version) >= "7.0.0" and Version(self.version) < "11.0.0": + for filename in glob.glob(os.path.join(self.source_folder, "cpp", "cmake_modules", "Find*.cmake")): + if os.path.basename(filename) not in [ + "FindArrow.cmake", + "FindArrowCUDA.cmake", + "FindArrowDataset.cmake", + "FindArrowFlight.cmake", + "FindArrowFlightSql.cmake", + "FindArrowFlightTesting.cmake", + "FindArrowPython.cmake", + "FindArrowPythonFlight.cmake", + "FindArrowSubstrait.cmake", + "FindArrowTesting.cmake", + "FindGandiva.cmake", + "FindParquet.cmake", + "FindPlasma.cmake", + ]: + os.remove(filename) def build(self): self._patch_sources() - cmake = self._configure_cmake() + cmake =CMake(self) + cmake.configure(build_script_folder=os.path.join(self.source_folder, "cpp")) cmake.build() def package(self): - self.copy("LICENSE.txt", src=self._source_subfolder, dst="licenses") - self.copy("NOTICE.txt", src=self._source_subfolder, dst="licenses") - cmake = self._configure_cmake() + copy(self, pattern="LICENSE.txt", dst=os.path.join(self.package_folder, "licenses"), src=self.source_folder) + copy(self, pattern="NOTICE.txt", dst=os.path.join(self.package_folder, "licenses"), src=self.source_folder) + cmake =CMake(self) cmake.install() - tools.rmdir(os.path.join(self.package_folder, "lib", "cmake")) - tools.rmdir(os.path.join(self.package_folder, "lib", "pkgconfig")) - tools.rmdir(os.path.join(self.package_folder, "share")) + rmdir(self, os.path.join(self.package_folder, "lib", "cmake")) + rmdir(self, os.path.join(self.package_folder, "lib", "pkgconfig")) + rmdir(self, os.path.join(self.package_folder, "share")) def _lib_name(self, name): - if self.settings.compiler == "Visual Studio" and not self.options.shared: + if is_msvc(self) and not self.options.shared: return "{}_static".format(name) else: return "{}".format(name) def package_id(self): - self.info.options.with_jemalloc = self._with_jemalloc() self.info.options.with_gflags = self._with_gflags() self.info.options.with_protobuf = self._with_protobuf() self.info.options.with_re2 = self._with_re2() @@ -607,16 +652,16 @@ 
def package_info(self): if (self.options.cli and (self.options.with_cuda or self._with_flight_rpc() or self._parquet())) or self.options.plasma: binpath = os.path.join(self.package_folder, "bin") - self.output.info("Appending PATH env var: {}".format(binpath)) + self.output.info(f"Appending PATH env var: {binpath}") self.env_info.PATH.append(binpath) if self._with_boost(): if self.options.gandiva: # FIXME: only filesystem component is used self.cpp_info.components["libgandiva"].requires.append("boost::boost") - if self._parquet() and self.settings.compiler == "gcc" and self.settings.compiler.version < tools.Version("4.9"): + if self._parquet() and self.settings.compiler == "gcc" and self.settings.compiler.version < Version("4.9"): self.cpp_info.components["libparquet"].requires.append("boost::boost") - if tools.Version(self.version) >= "2.0": + if Version(self.version) >= "2.0": # FIXME: only headers components is used self.cpp_info.components["libarrow"].requires.append("boost::boost") if self._with_openssl(): @@ -627,6 +672,8 @@ def package_info(self): self.cpp_info.components["libarrow"].requires.append("glog::glog") if self._with_jemalloc(): self.cpp_info.components["libarrow"].requires.append("jemalloc::jemalloc") + if self.options.with_mimalloc: + self.cpp_info.components["libarrow"].requires.append("mimalloc::mimalloc") if self._with_re2(): self.cpp_info.components["libgandiva"].requires.append("re2::re2") if self._with_llvm(): @@ -641,7 +688,7 @@ def package_info(self): self.cpp_info.components["libarrow"].requires.append("libbacktrace::libbacktrace") if self.options.with_cuda: self.cpp_info.components["libarrow"].requires.append("cuda::cuda") - if self.options.with_json: + if self._with_rapidjson(): self.cpp_info.components["libarrow"].requires.append("rapidjson::rapidjson") if self.options.with_s3: self.cpp_info.components["libarrow"].requires.append("aws-sdk-cpp::s3") @@ -665,6 +712,9 @@ def package_info(self): self.cpp_info.components["libarrow"].requires.append("zlib::zlib") if self.options.with_zstd: self.cpp_info.components["libarrow"].requires.append("zstd::zstd") + if self._with_boost(): + self.cpp_info.components["libarrow"].requires.append("boost::boost") + if self._with_grpc(): + self.cpp_info.components["libarrow"].requires.append("grpc::grpc") if self._with_flight_rpc(): - self.cpp_info.components["libarrow_flight"].requires.append("grpc::grpc") self.cpp_info.components["libarrow_flight"].requires.append("protobuf::protobuf") diff --git a/ci/conan/all/patches/1.0.0-0001-cmake.patch b/ci/conan/all/patches/1.0.0-0001-cmake.patch deleted file mode 100644 index 9da894a127b..00000000000 --- a/ci/conan/all/patches/1.0.0-0001-cmake.patch +++ /dev/null @@ -1,114 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. 
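For orientation, a minimal downstream consumer of the refreshed recipe might look like the sketch below; the version string and class name are illustrative only and not taken from this diff. The components wired up in package_info() above (libarrow, libparquet, libarrow_flight, and so on) are exposed to such consumers as CMake targets through CMakeDeps.

# Hypothetical consumer recipe; version and names are examples, not from this diff.
from conan import ConanFile
from conan.tools.cmake import CMake, CMakeDeps, CMakeToolchain, cmake_layout

class ArrowConsumerExample(ConanFile):
    settings = "os", "compiler", "build_type", "arch"

    def requirements(self):
        self.requires("arrow/10.0.0")

    def layout(self):
        cmake_layout(self)

    def generate(self):
        CMakeDeps(self).generate()
        CMakeToolchain(self).generate()

    def build(self):
        cmake = CMake(self)
        cmake.configure()
        cmake.build()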
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - ---- cpp/cmake_modules/DefineOptions.cmake -+++ cpp/cmake_modules/DefineOptions.cmake -@@ -76,7 +76,7 @@ macro(define_option_string name description default) - endmacro() - - # Top level cmake dir --if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") -+if(1) - #---------------------------------------------------------------------- - set_option_category("Compile and link") - ---- cpp/cmake_modules/ThirdpartyToolchain.cmake -+++ cpp/cmake_modules/ThirdpartyToolchain.cmake -@@ -854,7 +854,7 @@ if(ARROW_WITH_SNAPPY) - # location. - # https://bugzilla.redhat.com/show_bug.cgi?id=1679727 - # https://src.fedoraproject.org/rpms/snappy/pull-request/1 -- find_package(Snappy QUIET HINTS "${CMAKE_ROOT}/Modules/") -+ find_package(Snappy REQUIRED) - if(NOT Snappy_FOUND) - find_package(SnappyAlt) - endif() -@@ -866,7 +866,7 @@ - elseif(Snappy_SOURCE STREQUAL "SYSTEM") - # SnappyConfig.cmake is not installed on Ubuntu/Debian - # TODO: Make a bug report upstream -- find_package(Snappy HINTS "${CMAKE_ROOT}/Modules/") -+ find_package(Snappy REQUIRED) - if(NOT Snappy_FOUND) - find_package(SnappyAlt REQUIRED) - endif() -@@ -1139,8 +1139,8 @@ - build_gflags() - elseif(gflags_SOURCE STREQUAL "SYSTEM") -- # gflagsConfig.cmake is not installed on Ubuntu/Debian -- # TODO: Make a bug report upstream -- find_package(gflags ${ARROW_GFLAGS_REQUIRED_VERSION}) -+ find_package(gflags REQUIRED) -+ add_library(gflags-shared INTERFACE) -+ target_link_libraries(gflags-shared INTERFACE gflags::gflags) - if(NOT gflags_FOUND) - find_package(gflagsAlt ${ARROW_GFLAGS_REQUIRED_VERSION} REQUIRED) - endif() -@@ -1329,6 +1329,6 @@ macro(build_protobuf) - endmacro() -- - if(ARROW_WITH_PROTOBUF) -+ find_package(Protobuf REQUIRED) - if(ARROW_WITH_GRPC) - # gRPC 1.21.0 or later require Protobuf 3.7.0 or later. - set(ARROW_PROTOBUF_REQUIRED_VERSION "3.7.0") -@@ -1365,9 +1365,9 @@ if(ARROW_WITH_PROTOBUF) - set(ARROW_PROTOBUF_LIBPROTOC arrow::protobuf::libprotoc) - else() - if(NOT TARGET protobuf::libprotoc) -+ set(Protobuf_PROTOC_LIBRARY protoc) - if(PROTOBUF_PROTOC_LIBRARY AND NOT Protobuf_PROTOC_LIBRARY) -- # Old CMake versions have a different casing. -- set(Protobuf_PROTOC_LIBRARY ${PROTOBUF_PROTOC_LIBRARY}) -+ set(Protobuf_PROTOC_LIBRARY protoc) - endif() - if(NOT Protobuf_PROTOC_LIBRARY) - message(FATAL_ERROR "libprotoc was set to ${Protobuf_PROTOC_LIBRARY}") -@@ -1802,7 +1802,7 @@ if(ARROW_WITH_RAPIDJSON) - elseif(RapidJSON_SOURCE STREQUAL "SYSTEM") - # Fedora packages place the package information at the wrong location. 
- # https://bugzilla.redhat.com/show_bug.cgi?id=1680400 -- find_package(RapidJSON ${ARROW_RAPIDJSON_REQUIRED_VERSION} HINTS "${CMAKE_ROOT}") -+ find_package(RapidJSON REQUIRED) - if(RapidJSON_FOUND) - set(RAPIDJSON_INCLUDE_DIR ${RAPIDJSON_INCLUDE_DIRS}) - else() -@@ -2088,7 +2088,7 @@ if(ARROW_WITH_BZ2) - PROPERTIES IMPORTED_LOCATION "${BZIP2_LIBRARIES}" - INTERFACE_INCLUDE_DIRECTORIES "${BZIP2_INCLUDE_DIR}") - endif() -- include_directories(SYSTEM "${BZIP2_INCLUDE_DIR}") -+ include_directories(SYSTEM "${BZip2_INCLUDE_DIR}") - endif() - - macro(build_utf8proc) ---- cpp/cmake_modules/SetupCxxFlags.cmake -+++ cpp/cmake_modules/SetupCxxFlags.cmake -@@ -188,7 +188,7 @@ - message(STATUS "Arrow build warning level: ${BUILD_WARNING_LEVEL}") - - macro(arrow_add_werror_if_debug) -- if("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") -+ if(0) - # Treat all compiler warnings as errors - if(MSVC) - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /WX") diff --git a/ci/conan/all/patches/1.0.0-0002-jemalloc.patch b/ci/conan/all/patches/1.0.0-0002-jemalloc.patch deleted file mode 100644 index 30402fd4b0f..00000000000 --- a/ci/conan/all/patches/1.0.0-0002-jemalloc.patch +++ /dev/null @@ -1,65 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - ---- cpp/cmake_modules/ThirdpartyToolchain.cmake -+++ cpp/cmake_modules/ThirdpartyToolchain.cmake -@@ -1407,6 +1407,6 @@ endif() - # jemalloc - Unix-only high-performance allocator -- - if(ARROW_JEMALLOC) -+if(0) - message(STATUS "Building (vendored) jemalloc from source") - # We only use a vendored jemalloc as we want to control its version. 
- # Also our build of jemalloc is specially prefixed so that it will not -@@ -1465,6 +1465,8 @@ if(ARROW_JEMALLOC) - add_dependencies(jemalloc::jemalloc jemalloc_ep) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS jemalloc::jemalloc) -+else() -+ find_package(jemalloc REQUIRED) -+endif() - endif() -- - # ---------------------------------------------------------------------- - # mimalloc - Cross-platform high-performance allocator, from Microsoft ---- cpp/src/arrow/CMakeLists.txt -+++ cpp/src/arrow/CMakeLists.txt -@@ -292,7 +292,7 @@ - - set(_allocator_dependencies "") # Empty list - if(ARROW_JEMALLOC) -- list(APPEND _allocator_dependencies jemalloc_ep) -+ list(APPEND _allocator_dependencies jemalloc::jemalloc) - endif() - if(ARROW_MIMALLOC) - list(APPEND _allocator_dependencies mimalloc_ep) ---- cpp/src/arrow/memory_pool.cc -+++ cpp/src/arrow/memory_pool.cc -@@ -31,7 +31,7 @@ - // Needed to support jemalloc 3 and 4 - #define JEMALLOC_MANGLE - // Explicitly link to our version of jemalloc --#include "jemalloc_ep/dist/include/jemalloc/jemalloc.h" -+#include "jemalloc/jemalloc.h" - #endif - - #ifdef ARROW_MIMALLOC diff --git a/ci/conan/all/patches/1.0.0-0005-fix-make12-namespace.patch b/ci/conan/all/patches/1.0.0-0005-fix-make12-namespace.patch new file mode 100644 index 00000000000..199804bff00 --- /dev/null +++ b/ci/conan/all/patches/1.0.0-0005-fix-make12-namespace.patch @@ -0,0 +1,44 @@ +MIT License + +Copyright (c) 2019 Conan.io + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +diff --git a/cpp/src/arrow/vendored/datetime/date.h b/cpp/src/arrow/vendored/datetime/date.h +index 02a4909..2b168d2 100644 +--- a/cpp/src/arrow/vendored/datetime/date.h ++++ b/cpp/src/arrow/vendored/datetime/date.h +@@ -5152,7 +5152,7 @@ to_stream(std::basic_ostream& os, const CharT* fmt, + if (modified == CharT{}) + #endif + { +- auto h = *fmt == CharT{'I'} ? make12(hms.hours()) : hms.hours(); ++ auto h = *fmt == CharT{'I'} ? 
arrow_vendored::date::make12(hms.hours()) : hms.hours(); + if (h < hours{10}) + os << CharT{'0'}; + os << h.count(); +@@ -5366,7 +5366,7 @@ to_stream(std::basic_ostream& os, const CharT* fmt, + save_ostream _(os); + os.fill('0'); + os.width(2); +- os << make12(tod.hours()).count() << CharT{':'}; ++ os << arrow_vendored::date::make12(tod.hours()).count() << CharT{':'}; + os.width(2); + os << tod.minutes().count() << CharT{':'}; + os.width(2); diff --git a/ci/conan/all/patches/1.0.0-0006-fix-cmake.patch b/ci/conan/all/patches/1.0.0-0006-fix-cmake.patch new file mode 100644 index 00000000000..3ecd0bf9f39 --- /dev/null +++ b/ci/conan/all/patches/1.0.0-0006-fix-cmake.patch @@ -0,0 +1,355 @@ +MIT License + +Copyright (c) 2019 Conan.io + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt +index 300f043..0127a7a 100644 +--- a/cpp/CMakeLists.txt ++++ b/cpp/CMakeLists.txt +@@ -654,7 +654,7 @@ endif() + + if(ARROW_WITH_BROTLI) + # Order is important for static linking +- set(ARROW_BROTLI_LIBS Brotli::brotlienc Brotli::brotlidec Brotli::brotlicommon) ++ set(ARROW_BROTLI_LIBS brotli::brotlienc brotli::brotlidec brotli::brotlicommon) + list(APPEND ARROW_LINK_LIBS ${ARROW_BROTLI_LIBS}) + list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_BROTLI_LIBS}) + endif() +@@ -664,7 +664,7 @@ if(ARROW_WITH_BZ2) + endif() + + if(ARROW_WITH_LZ4) +- list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4) ++ list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4_static) + endif() + + if(ARROW_WITH_SNAPPY) +@@ -800,8 +800,11 @@ endif() + + if(ARROW_MIMALLOC) + add_definitions(-DARROW_MIMALLOC) +- list(APPEND ARROW_LINK_LIBS mimalloc::mimalloc) +- list(APPEND ARROW_STATIC_LINK_LIBS mimalloc::mimalloc) ++ if (TARGET mimalloc-static) ++ list(APPEND ARROW_LINK_LIBS mimalloc-static) ++ else() ++ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc) ++ endif() + endif() + + # ---------------------------------------------------------------------- +diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake +index eb10ebe..9c81017 100644 +--- a/cpp/cmake_modules/BuildUtils.cmake ++++ b/cpp/cmake_modules/BuildUtils.cmake +@@ -165,10 +165,10 @@ function(create_merged_static_lib output_target) + set(ar_script_path ${CMAKE_BINARY_DIR}/${ARG_NAME}.ar) + + file(WRITE ${ar_script_path}.in "CREATE ${output_lib_path}\n") +- file(APPEND ${ar_script_path}.in "ADDLIB $\n") ++ file(APPEND ${ar_script_path}.in "ADDLIB $\n") + + foreach(lib ${ARG_TO_MERGE}) +- file(APPEND ${ar_script_path}.in "ADDLIB $\n") ++ file(APPEND ${ar_script_path}.in "ADDLIB $\n") + endforeach() + + file(APPEND ${ar_script_path}.in "SAVE\nEND\n") +diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake +index 807e2b9..016c8db 100644 +--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake ++++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake +@@ -154,16 +154,7 @@ macro(build_dependency DEPENDENCY_NAME) + endmacro() + + macro(resolve_dependency DEPENDENCY_NAME) +- if(${DEPENDENCY_NAME}_SOURCE STREQUAL "AUTO") +- find_package(${DEPENDENCY_NAME} MODULE) +- if(NOT ${${DEPENDENCY_NAME}_FOUND}) +- build_dependency(${DEPENDENCY_NAME}) +- endif() +- elseif(${DEPENDENCY_NAME}_SOURCE STREQUAL "BUNDLED") +- build_dependency(${DEPENDENCY_NAME}) +- elseif(${DEPENDENCY_NAME}_SOURCE STREQUAL "SYSTEM") +- find_package(${DEPENDENCY_NAME} REQUIRED) +- endif() ++ find_package(${DEPENDENCY_NAME} REQUIRED) + endmacro() + + macro(resolve_dependency_with_version DEPENDENCY_NAME REQUIRED_VERSION) +@@ -765,6 +756,7 @@ endif() + # - Tests need Boost at runtime. + # - S3FS and Flight benchmarks need Boost at runtime. 
+ if(ARROW_BUILD_INTEGRATION ++ OR ARROW_BOOST_REQUIRED + OR ARROW_BUILD_TESTS + OR ARROW_GANDIVA + OR (ARROW_FLIGHT AND ARROW_BUILD_BENCHMARKS) +@@ -785,7 +777,7 @@ if(ARROW_BOOST_REQUIRED) + elseif(BOOST_SOURCE STREQUAL "BUNDLED") + build_boost() + elseif(BOOST_SOURCE STREQUAL "SYSTEM") +- find_package(BoostAlt ${ARROW_BOOST_REQUIRED_VERSION} REQUIRED) ++ find_package(Boost ${ARROW_BOOST_REQUIRED_VERSION} REQUIRED) + endif() + + if(TARGET Boost::system) +@@ -936,11 +928,11 @@ macro(build_brotli) + endmacro() + + if(ARROW_WITH_BROTLI) +- resolve_dependency(Brotli) ++ resolve_dependency(brotli) + # TODO: Don't use global includes but rather target_include_directories +- get_target_property(BROTLI_INCLUDE_DIR Brotli::brotlicommon ++ get_target_property(BROTLI_INCLUDE_DIR brotli::brotlicommon + INTERFACE_INCLUDE_DIRECTORIES) +- include_directories(SYSTEM ${BROTLI_INCLUDE_DIR}) ++ include_directories(SYSTEM ${brotli_INCLUDE_DIR}) + endif() + + if(PARQUET_REQUIRE_ENCRYPTION AND NOT ARROW_PARQUET) +@@ -1146,9 +1138,10 @@ if(ARROW_NEED_GFLAGS) + endif() + endif() + # TODO: Don't use global includes but rather target_include_directories +- include_directories(SYSTEM ${GFLAGS_INCLUDE_DIR}) ++ include_directories(SYSTEM ${gflags_INCLUDE_DIR}) ++ set(GFLAGS_LIBRARIES ${gflags_LIBRARIES}) + +- if(NOT TARGET ${GFLAGS_LIBRARIES}) ++ if(0) + if(TARGET gflags-shared) + set(GFLAGS_LIBRARIES gflags-shared) + elseif(TARGET gflags_shared) +@@ -1237,12 +1230,13 @@ endmacro() + if(ARROW_WITH_THRIFT) + # We already may have looked for Thrift earlier, when considering whether + # to build Boost, so don't look again if already found. +- if(NOT Thrift_FOUND AND NOT THRIFT_FOUND) ++ if(0) + # Thrift c++ code generated by 0.13 requires 0.11 or greater + resolve_dependency_with_version(Thrift 0.11.0) + endif() ++ find_package(Thrift CONFIG REQUIRED) + # TODO: Don't use global includes but rather target_include_directories +- include_directories(SYSTEM ${THRIFT_INCLUDE_DIR}) ++ include_directories(SYSTEM ${Thrift_INCLUDE_DIR}) + endif() + + # ---------------------------------------------------------------------- +@@ -1407,6 +1401,7 @@ endif() + # jemalloc - Unix-only high-performance allocator + + if(ARROW_JEMALLOC) ++if(0) + message(STATUS "Building (vendored) jemalloc from source") + # We only use a vendored jemalloc as we want to control its version. + # Also our build of jemalloc is specially prefixed so that it will not +@@ -1465,12 +1460,18 @@ if(ARROW_JEMALLOC) + add_dependencies(jemalloc::jemalloc jemalloc_ep) + + list(APPEND ARROW_BUNDLED_STATIC_LIBS jemalloc::jemalloc) ++else() ++ find_package(jemalloc REQUIRED CONFIG) ++ include_directories(SYSTEM "${jemalloc_INCLUDE_DIR}") ++ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${jemalloc_LIBRARIES_TARGETS} ) ++endif() + endif() + + # ---------------------------------------------------------------------- + # mimalloc - Cross-platform high-performance allocator, from Microsoft + + if(ARROW_MIMALLOC) ++if(0) + message(STATUS "Building (vendored) mimalloc from source") + # We only use a vendored mimalloc as we want to control its build options. 
+ +@@ -1518,6 +1519,11 @@ if(ARROW_MIMALLOC) + add_dependencies(toolchain mimalloc_ep) + + list(APPEND ARROW_BUNDLED_STATIC_LIBS mimalloc::mimalloc) ++else() ++ find_package(mimalloc REQUIRED CONFIG) ++ include_directories(SYSTEM "${mimalloc_INCLUDE_DIR}") ++ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${mimalloc_LIBRARIES_TARGETS} ) ++endif() + endif() + + # ---------------------------------------------------------------------- +@@ -1918,11 +1924,16 @@ macro(build_lz4) + endmacro() + + if(ARROW_WITH_LZ4) +- resolve_dependency(Lz4) ++ resolve_dependency(lz4) + + # TODO: Don't use global includes but rather target_include_directories +- get_target_property(LZ4_INCLUDE_DIR LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) +- include_directories(SYSTEM ${LZ4_INCLUDE_DIR}) ++ if(TARGET LZ4::lz4_static) ++ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_static INTERFACE_INCLUDE_DIRECTORIES) ++ else() ++ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_shared INTERFACE_INCLUDE_DIRECTORIES) ++ endif() ++ include_directories(SYSTEM ${lz4_INCLUDE_DIR}) ++ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${lz4_LIBRARIES_TARGETS} ) + endif() + + macro(build_zstd) +@@ -2037,10 +2048,10 @@ macro(build_re2) + endmacro() + + if(ARROW_GANDIVA) +- resolve_dependency(RE2) ++ resolve_dependency(re2) + + # TODO: Don't use global includes but rather target_include_directories +- get_target_property(RE2_INCLUDE_DIR RE2::re2 INTERFACE_INCLUDE_DIRECTORIES) ++ get_target_property(RE2_INCLUDE_DIR re2::re2 INTERFACE_INCLUDE_DIRECTORIES) + include_directories(SYSTEM ${RE2_INCLUDE_DIR}) + endif() + +@@ -2480,17 +2491,24 @@ if(ARROW_WITH_GRPC) + endif() + + # TODO: Don't use global includes but rather target_include_directories +- get_target_property(GRPC_INCLUDE_DIR gRPC::grpc INTERFACE_INCLUDE_DIRECTORIES) ++ # get_target_property(GRPC_INCLUDE_DIR gRPC::grpc INTERFACE_INCLUDE_DIRECTORIES) ++ if(grpc_INCLUDE_DIRS_RELEASE) ++ set(GRPC_INCLUDE_DIR ${grpc_INCLUDE_DIRS_RELEASE}) ++ elseif(grpc_INCLUDE_DIRS_DEBUG) ++ set(GRPC_INCLUDE_DIR ${grpc_INCLUDE_DIRS_DEBUG}) ++ endif() + include_directories(SYSTEM ${GRPC_INCLUDE_DIR}) ++ include_directories(SYSTEM ${absl_INCLUDE_DIR}) ++ include_directories(SYSTEM ${protobuf_INCLUDE_DIR}) + + if(GRPC_VENDORED) + set(GRPCPP_PP_INCLUDE TRUE) + else() + # grpc++ headers may reside in ${GRPC_INCLUDE_DIR}/grpc++ or ${GRPC_INCLUDE_DIR}/grpcpp + # depending on the gRPC version. 
+- if(EXISTS "${GRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h") ++ if(EXISTS ${GRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h) + set(GRPCPP_PP_INCLUDE TRUE) +- elseif(EXISTS "${GRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h") ++ elseif(EXISTS ${GRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h) + set(GRPCPP_PP_INCLUDE FALSE) + else() + message(FATAL_ERROR "Cannot find grpc++ headers in ${GRPC_INCLUDE_DIR}") +diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt +index 5797a78..da6bd4d 100644 +--- a/cpp/src/arrow/CMakeLists.txt ++++ b/cpp/src/arrow/CMakeLists.txt +@@ -292,10 +292,15 @@ set(ARROW_TESTING_SRCS + + set(_allocator_dependencies "") # Empty list + if(ARROW_JEMALLOC) +- list(APPEND _allocator_dependencies jemalloc_ep) ++ list(APPEND _allocator_dependencies jemalloc::jemalloc) + endif() ++ + if(ARROW_MIMALLOC) +- list(APPEND _allocator_dependencies mimalloc_ep) ++ if (TARGET mimalloc-static) ++ list(APPEND _allocator_dependencies mimalloc-static) ++ else() ++ list(APPEND _allocator_dependencies mimalloc) ++ endif() + endif() + + if(_allocator_dependencies) +diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc +index 784bf7b..8f005a5 100644 +--- a/cpp/src/arrow/memory_pool.cc ++++ b/cpp/src/arrow/memory_pool.cc +@@ -31,7 +31,7 @@ + // Needed to support jemalloc 3 and 4 + #define JEMALLOC_MANGLE + // Explicitly link to our version of jemalloc +-#include "jemalloc_ep/dist/include/jemalloc/jemalloc.h" ++#include "jemalloc/jemalloc.h" + #endif + + #ifdef ARROW_MIMALLOC +diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt +index 85e8db6..cd70c63 100644 +--- a/cpp/src/gandiva/CMakeLists.txt ++++ b/cpp/src/gandiva/CMakeLists.txt +@@ -25,7 +25,7 @@ add_custom_target(gandiva-benchmarks) + + add_dependencies(gandiva-all gandiva gandiva-tests gandiva-benchmarks) + +-find_package(LLVMAlt REQUIRED) ++find_package(LLVM REQUIRED) + + if(LLVM_VERSION_MAJOR LESS "10") + set(GANDIVA_CXX_STANDARD ${CMAKE_CXX_STANDARD}) +@@ -88,9 +88,16 @@ set(SRC_FILES + random_generator_holder.cc + ${GANDIVA_PRECOMPILED_CC_PATH}) + +-set(GANDIVA_SHARED_PRIVATE_LINK_LIBS arrow_shared LLVM::LLVM_INTERFACE RE2::re2) + +-set(GANDIVA_STATIC_LINK_LIBS arrow_static LLVM::LLVM_INTERFACE RE2::re2) ++ function(get_all_targets var) ++ set(targets) ++ get_all_targets_recursive(targets ${CMAKE_CURRENT_SOURCE_DIR}) ++ set(${var} ${targets} PARENT_SCOPE) ++endfunction() ++ ++set(GANDIVA_SHARED_PRIVATE_LINK_LIBS arrow_shared llvm-core::llvm-core re2::re2) ++ ++set(GANDIVA_STATIC_LINK_LIBS arrow_static llvm-core::llvm-core re2::re2) + + if(ARROW_GANDIVA_STATIC_LIBSTDCPP + AND (CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX)) +@@ -131,7 +138,7 @@ add_arrow_lib(gandiva + arrow_dependencies + precompiled + EXTRA_INCLUDES +- $ ++ $ + SHARED_LINK_FLAGS + ${GANDIVA_SHARED_LINK_FLAGS} + SHARED_LINK_LIBS +@@ -203,7 +210,7 @@ endfunction() + + set(GANDIVA_INTERNALS_TEST_ARGUMENTS) + if(WIN32) +- list(APPEND GANDIVA_INTERNALS_TEST_ARGUMENTS EXTRA_LINK_LIBS LLVM::LLVM_INTERFACE) ++ list(APPEND GANDIVA_INTERNALS_TEST_ARGUMENTS EXTRA_LINK_LIBS llvm-core::llvm-core) + endif() + add_gandiva_test(internals-test + SOURCES +@@ -225,9 +232,9 @@ add_gandiva_test(internals-test + decimal_type_util_test.cc + random_generator_holder_test.cc + EXTRA_DEPENDENCIES +- LLVM::LLVM_INTERFACE ++ llvm-core::llvm-core + EXTRA_INCLUDES +- $ ++ $ + ${GANDIVA_INTERNALS_TEST_ARGUMENTS}) + + if(ARROW_GANDIVA_JAVA) diff --git a/ci/conan/all/patches/8.0.0-0001-cmake.patch 
b/ci/conan/all/patches/10.0.0-0001-mallctl-takes-size_t.patch similarity index 65% rename from ci/conan/all/patches/8.0.0-0001-cmake.patch rename to ci/conan/all/patches/10.0.0-0001-mallctl-takes-size_t.patch index 9e67f4a1912..3428797472c 100644 --- a/ci/conan/all/patches/8.0.0-0001-cmake.patch +++ b/ci/conan/all/patches/10.0.0-0001-mallctl-takes-size_t.patch @@ -20,16 +20,16 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake -index ab7d2ed..6f1e411 100644 ---- a/cpp/cmake_modules/DefineOptions.cmake -+++ b/cpp/cmake_modules/DefineOptions.cmake -@@ -82,7 +82,7 @@ macro(define_option_string name description default) - endmacro() +diff --git a/cpp/src/arrow/memory_pool_jemalloc.cc b/cpp/src/arrow/memory_pool_jemalloc.cc +index c7d73c8..34c7c63 100644 +--- a/cpp/src/arrow/memory_pool_jemalloc.cc ++++ b/cpp/src/arrow/memory_pool_jemalloc.cc +@@ -140,7 +140,7 @@ void JemallocAllocator::ReleaseUnused() { + } while (0) - # Top level cmake dir --if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") -+if(1) - #---------------------------------------------------------------------- - set_option_category("Compile and link") + Status jemalloc_set_decay_ms(int ms) { +- ssize_t decay_time_ms = static_cast<ssize_t>(ms); ++ size_t decay_time_ms = static_cast<size_t>(ms); + int err = mallctl("arenas.dirty_decay_ms", nullptr, nullptr, &decay_time_ms, + sizeof(decay_time_ms)); diff --git a/ci/conan/all/patches/10.0.0-0002-fix-cmake.patch b/ci/conan/all/patches/10.0.0-0002-fix-cmake.patch new file mode 100644 index 00000000000..15d197836ec --- /dev/null +++ b/ci/conan/all/patches/10.0.0-0002-fix-cmake.patch @@ -0,0 +1,333 @@ +MIT License + +Copyright (c) 2019 Conan.io + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE.
+ +diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt +index 029f13f..3518a23 100644 +--- a/cpp/CMakeLists.txt ++++ b/cpp/CMakeLists.txt +@@ -659,7 +659,7 @@ endif() + + if(ARROW_WITH_BROTLI) + # Order is important for static linking +- set(ARROW_BROTLI_LIBS Brotli::brotlienc Brotli::brotlidec Brotli::brotlicommon) ++ set(ARROW_BROTLI_LIBS brotli::brotlienc brotli::brotlidec brotli::brotlicommon) + list(APPEND ARROW_SHARED_LINK_LIBS ${ARROW_BROTLI_LIBS}) + list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_BROTLI_LIBS}) + if(Brotli_SOURCE STREQUAL "SYSTEM") +@@ -675,14 +675,21 @@ if(ARROW_WITH_BZ2) + endif() + + if(ARROW_WITH_LZ4) +- list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4) ++if (TARGET LZ4::lz4_static) ++ list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4_static) + if(lz4_SOURCE STREQUAL "SYSTEM") +- list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4) ++ list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4_static) + endif() ++else() ++ list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4_shared) ++ if(lz4_SOURCE STREQUAL "SYSTEM") ++ list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4_shared) ++ endif() ++endif() + endif() + + if(ARROW_WITH_SNAPPY) +- list(APPEND ARROW_STATIC_LINK_LIBS ${Snappy_TARGET}) ++ list(APPEND ARROW_STATIC_LINK_LIBS Snappy::snappy) + if(Snappy_SOURCE STREQUAL "SYSTEM") + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS ${Snappy_TARGET}) + endif() +diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake +index b7cd31f..78f3df3 100644 +--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake ++++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake +@@ -1162,10 +1162,12 @@ endmacro() + + if(ARROW_WITH_SNAPPY) + resolve_dependency(Snappy +- HAVE_ALT ++ USE_CONFIG + TRUE + PC_PACKAGE_NAMES + snappy) ++ ++ if(0) + if(${Snappy_SOURCE} STREQUAL "SYSTEM" AND NOT snappy_PC_FOUND) + get_target_property(SNAPPY_TYPE ${Snappy_TARGET} TYPE) + if(NOT SNAPPY_TYPE STREQUAL "INTERFACE_LIBRARY") +@@ -1180,6 +1182,9 @@ if(ARROW_WITH_SNAPPY) + string(APPEND ARROW_PC_LIBS_PRIVATE " ${SNAPPY_LIB}") + endif() + endif() ++ else() ++ string(APPEND ARROW_PC_LIBS_PRIVATE " ${Snappy_LIBRARIES}") ++ endif() + endif() + + # ---------------------------------------------------------------------- +@@ -1242,7 +1247,7 @@ macro(build_brotli) + endmacro() + + if(ARROW_WITH_BROTLI) +- resolve_dependency(Brotli PC_PACKAGE_NAMES libbrotlidec libbrotlienc) ++ resolve_dependency(brotli PC_PACKAGE_NAMES libbrotlidec libbrotlienc) + endif() + + if(PARQUET_REQUIRE_ENCRYPTION AND NOT ARROW_PARQUET) +@@ -1256,7 +1261,7 @@ if(PARQUET_REQUIRE_ENCRYPTION + OR ARROW_GANDIVA) + set(OpenSSL_SOURCE "SYSTEM") + resolve_dependency(OpenSSL +- HAVE_ALT ++ USE_CONFIG + TRUE + REQUIRED_VERSION + ${ARROW_OPENSSL_REQUIRED_VERSION}) +@@ -1399,22 +1404,14 @@ endmacro() + if(ARROW_NEED_GFLAGS) + set(ARROW_GFLAGS_REQUIRED_VERSION "2.1.0") + resolve_dependency(gflags +- HAVE_ALT ++ USE_CONFIG + TRUE + REQUIRED_VERSION + ${ARROW_GFLAGS_REQUIRED_VERSION} + IS_RUNTIME_DEPENDENCY + FALSE) + +- if(NOT TARGET ${GFLAGS_LIBRARIES}) +- if(TARGET gflags::gflags_shared) +- set(GFLAGS_LIBRARIES gflags::gflags_shared) +- elseif(TARGET gflags-shared) +- set(GFLAGS_LIBRARIES gflags-shared) +- elseif(TARGET gflags_shared) +- set(GFLAGS_LIBRARIES gflags_shared) +- endif() +- endif() ++ set(GFLAGS_LIBRARIES gflags::gflags) + endif() + + # ---------------------------------------------------------------------- +@@ -1638,7 +1635,7 @@ if(ARROW_WITH_PROTOBUF) + set(ARROW_PROTOBUF_REQUIRED_VERSION "2.6.1") + endif() + 
resolve_dependency(Protobuf +- HAVE_ALT ++ USE_CONFIG + TRUE + REQUIRED_VERSION + ${ARROW_PROTOBUF_REQUIRED_VERSION} +@@ -1770,7 +1767,7 @@ macro(build_substrait) + + add_custom_target(substrait_gen ALL DEPENDS ${SUBSTRAIT_PROTO_GEN_ALL}) + +- set(SUBSTRAIT_INCLUDES ${SUBSTRAIT_CPP_DIR} ${PROTOBUF_INCLUDE_DIR}) ++ set(SUBSTRAIT_INCLUDES ${SUBSTRAIT_CPP_DIR} ${protobuf_INCLUDE_DIR}) + + add_library(substrait STATIC ${SUBSTRAIT_SOURCES}) + set_target_properties(substrait PROPERTIES POSITION_INDEPENDENT_CODE ON) +@@ -1781,6 +1778,8 @@ macro(build_substrait) + list(APPEND ARROW_BUNDLED_STATIC_LIBS substrait) + endmacro() + ++set(CMAKE_VERBOSE_MAKEFILE ON) ++ + if(ARROW_SUBSTRAIT) + # Currently, we can only build Substrait from source. + set(Substrait_SOURCE "BUNDLED") +@@ -1866,7 +1865,10 @@ macro(build_jemalloc) + endmacro() + + if(ARROW_JEMALLOC) +- resolve_dependency(jemalloc) ++ #resolve_dependency(jemalloc) ++ find_package(jemalloc REQUIRED CONFIG) ++ include_directories(SYSTEM "${jemalloc_INCLUDE_DIR}") ++ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${jemalloc_LIBRARIES_TARGETS}) + endif() + + # ---------------------------------------------------------------------- +@@ -2186,7 +2188,7 @@ endmacro() + if(ARROW_WITH_RAPIDJSON) + set(ARROW_RAPIDJSON_REQUIRED_VERSION "1.1.0") + resolve_dependency(RapidJSON +- HAVE_ALT ++ USE_CONFIG + TRUE + REQUIRED_VERSION + ${ARROW_RAPIDJSON_REQUIRED_VERSION} +@@ -2334,19 +2336,29 @@ macro(build_lz4) + BUILD_BYPRODUCTS ${LZ4_STATIC_LIB}) + + file(MAKE_DIRECTORY "${LZ4_PREFIX}/include") +- add_library(LZ4::lz4 STATIC IMPORTED) +- set_target_properties(LZ4::lz4 +- PROPERTIES IMPORTED_LOCATION "${LZ4_STATIC_LIB}" +- INTERFACE_INCLUDE_DIRECTORIES "${LZ4_PREFIX}/include") +- add_dependencies(toolchain lz4_ep) +- add_dependencies(LZ4::lz4 lz4_ep) +- +- list(APPEND ARROW_BUNDLED_STATIC_LIBS LZ4::lz4) ++ if (TARGET LZ4::lz4_static) ++ add_library(LZ4::lz4_static STATIC IMPORTED) ++ set_target_properties(LZ4::lz4_static ++ PROPERTIES IMPORTED_LOCATION "${LZ4_STATIC_LIB}" ++ INTERFACE_INCLUDE_DIRECTORIES "${LZ4_PREFIX}/include") ++ add_dependencies(toolchain lz4_ep) ++ add_dependencies(LZ4::lz4_static lz4_ep) ++ list(APPEND ARROW_BUNDLED_STATIC_LIBS LZ4::lz4_static) ++ else() ++ add_library(LZ4::lz4_shared STATIC IMPORTED) ++ set_target_properties(LZ4::lz4_shared ++ PROPERTIES IMPORTED_LOCATION "${LZ4_SHARED_LIB}" ++ INTERFACE_INCLUDE_DIRECTORIES "${LZ4_PREFIX}/include") ++ add_dependencies(toolchain lz4_ep) ++ add_dependencies(LZ4::lz4_shared lz4_ep) ++ list(APPEND ARROW_BUNDLED_STATIC_LIBS LZ4::lz4_shared) ++ endif() ++ + endmacro() + + if(ARROW_WITH_LZ4) + resolve_dependency(lz4 +- HAVE_ALT ++ USE_CONFIG + TRUE + PC_PACKAGE_NAMES + liblz4) +@@ -2415,7 +2427,7 @@ endmacro() + if(ARROW_WITH_ZSTD) + # ARROW-13384: ZSTD_minCLevel was added in v1.4.0, required by ARROW-13091 + resolve_dependency(zstd +- HAVE_ALT ++ USE_CONFIG + TRUE + PC_PACKAGE_NAMES + libzstd +@@ -2477,7 +2489,7 @@ if(ARROW_WITH_RE2) + # Don't specify "PC_PACKAGE_NAMES re2" here because re2.pc may + # include -std=c++11. It's not compatible with C source and C++ + # source not uses C++ 11. 
+- resolve_dependency(re2 HAVE_ALT TRUE) ++ resolve_dependency(re2 USE_CONFIG TRUE) + if(${re2_SOURCE} STREQUAL "SYSTEM") + get_target_property(RE2_TYPE re2::re2 TYPE) + if(NOT RE2_TYPE STREQUAL "INTERFACE_LIBRARY") +@@ -3922,7 +3934,7 @@ if(ARROW_WITH_GRPC) + set(gRPC_SOURCE "${Protobuf_SOURCE}") + endif() + resolve_dependency(gRPC +- HAVE_ALT ++ USE_CONFIG + TRUE + REQUIRED_VERSION + ${ARROW_GRPC_REQUIRED_VERSION} +@@ -3939,9 +3951,9 @@ if(ARROW_WITH_GRPC) + get_target_property(GRPC_INCLUDE_DIR gRPC::grpc++ INTERFACE_INCLUDE_DIRECTORIES) + if(GRPC_INCLUDE_DIR MATCHES "^\\$<" + OR # generator expression +- EXISTS "${GRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h") ++ EXISTS ${GRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h) + set(GRPCPP_PP_INCLUDE TRUE) +- elseif(EXISTS "${GRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h") ++ elseif(EXISTS ${GRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h) + set(GRPCPP_PP_INCLUDE FALSE) + else() + message(FATAL_ERROR "Cannot find grpc++ headers in ${GRPC_INCLUDE_DIR}") +@@ -4282,8 +4294,11 @@ macro(build_orc) + get_target_property(ORC_SNAPPY_INCLUDE_DIR ${Snappy_TARGET} + INTERFACE_INCLUDE_DIRECTORIES) + get_filename_component(ORC_SNAPPY_ROOT "${ORC_SNAPPY_INCLUDE_DIR}" DIRECTORY) +- +- get_target_property(ORC_LZ4_ROOT LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) ++ if (TARGET LZ4::lz4_static) ++ get_target_property(ORC_LZ4_ROOT LZ4::lz4_static INTERFACE_INCLUDE_DIRECTORIES) ++ else() ++ get_target_property(ORC_LZ4_ROOT LZ4::lz4_shared INTERFACE_INCLUDE_DIRECTORIES) ++ endif() + get_filename_component(ORC_LZ4_ROOT "${ORC_LZ4_ROOT}" DIRECTORY) + + get_target_property(ORC_ZSTD_ROOT ${ARROW_ZSTD_LIBZSTD} INTERFACE_INCLUDE_DIRECTORIES) +@@ -4321,16 +4336,29 @@ macro(build_orc) + # Work around CMake bug + file(MAKE_DIRECTORY ${ORC_INCLUDE_DIR}) + +- externalproject_add(orc_ep +- URL ${ORC_SOURCE_URL} +- URL_HASH "SHA256=${ARROW_ORC_BUILD_SHA256_CHECKSUM}" +- BUILD_BYPRODUCTS ${ORC_STATIC_LIB} +- CMAKE_ARGS ${ORC_CMAKE_ARGS} ${EP_LOG_OPTIONS} +- DEPENDS ${ARROW_PROTOBUF_LIBPROTOBUF} +- ${ARROW_ZSTD_LIBZSTD} +- ${Snappy_TARGET} +- LZ4::lz4 +- ZLIB::ZLIB) ++ if (TARGET LZ4::lz4_static) ++ externalproject_add(orc_ep ++ URL ${ORC_SOURCE_URL} ++ URL_HASH "SHA256=${ARROW_ORC_BUILD_SHA256_CHECKSUM}" ++ BUILD_BYPRODUCTS ${ORC_STATIC_LIB} ++ CMAKE_ARGS ${ORC_CMAKE_ARGS} ${EP_LOG_OPTIONS} ++ DEPENDS ${ARROW_PROTOBUF_LIBPROTOBUF} ++ ${ARROW_ZSTD_LIBZSTD} ++ ${Snappy_TARGET} ++ LZ4::lz4_static ++ ZLIB::ZLIB) ++ else() ++ externalproject_add(orc_ep ++ URL ${ORC_SOURCE_URL} ++ URL_HASH "SHA256=${ARROW_ORC_BUILD_SHA256_CHECKSUM}" ++ BUILD_BYPRODUCTS ${ORC_STATIC_LIB} ++ CMAKE_ARGS ${ORC_CMAKE_ARGS} ${EP_LOG_OPTIONS} ++ DEPENDS ${ARROW_PROTOBUF_LIBPROTOBUF} ++ ${ARROW_ZSTD_LIBZSTD} ++ ${Snappy_TARGET} ++ LZ4::lz4_shared ++ ZLIB::ZLIB) ++ endif() + + set(ORC_VENDORED 1) + +@@ -4338,7 +4366,11 @@ macro(build_orc) + set_target_properties(orc::liborc + PROPERTIES IMPORTED_LOCATION "${ORC_STATIC_LIB}" + INTERFACE_INCLUDE_DIRECTORIES "${ORC_INCLUDE_DIR}") +- set(ORC_LINK_LIBRARIES LZ4::lz4 ZLIB::ZLIB ${ARROW_ZSTD_LIBZSTD} ${Snappy_TARGET}) ++ if (TARGET LZ4::lz4_static) ++ set(ORC_LINK_LIBRARIES LZ4::lz4_static ZLIB::ZLIB ${ARROW_ZSTD_LIBZSTD} ${Snappy_TARGET}) ++ else() ++ set(ORC_LINK_LIBRARIES LZ4::lz4_shared ZLIB::ZLIB ${ARROW_ZSTD_LIBZSTD} ${Snappy_TARGET}) ++ endif() + if(NOT MSVC) + if(NOT APPLE) + list(APPEND ORC_LINK_LIBRARIES Threads::Threads) +@@ -4765,7 +4797,7 @@ macro(build_awssdk) + endmacro() + + if(ARROW_S3) +- resolve_dependency(AWSSDK 
HAVE_ALT TRUE) ++ resolve_dependency(AWSSDK USE_CONFIG TRUE) + + message(STATUS "Found AWS SDK headers: ${AWSSDK_INCLUDE_DIR}") + message(STATUS "Found AWS SDK libraries: ${AWSSDK_LINK_LIBRARIES}") diff --git a/ci/conan/all/patches/2.0.0-0001-cmake.patch b/ci/conan/all/patches/2.0.0-0001-cmake.patch deleted file mode 100644 index a41b8e5d20c..00000000000 --- a/ci/conan/all/patches/2.0.0-0001-cmake.patch +++ /dev/null @@ -1,41 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - ---- cpp/cmake_modules/DefineOptions.cmake -+++ cpp/cmake_modules/DefineOptions.cmake -@@ -76,7 +76,7 @@ macro(define_option_string name description default) - endmacro() - - # Top level cmake dir --if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") -+if(1) - #---------------------------------------------------------------------- - set_option_category("Compile and link") - ---- cpp/cmake_modules/ThirdpartyToolchain.cmake -+++ cpp/cmake_modules/ThirdpartyToolchain.cmake -@@ -1856,3 +1856,3 @@ -- find_package(RapidJSON ${ARROW_RAPIDJSON_REQUIRED_VERSION} HINTS "${CMAKE_ROOT}") -+ find_package(RapidJSON ${ARROW_RAPIDJSON_REQUIRED_VERSION} HINTS "${CMAKE_ROOT}" REQUIRED) - if(RapidJSON_FOUND) -- set(RAPIDJSON_INCLUDE_DIR ${RAPIDJSON_INCLUDE_DIRS}) -+ set(RAPIDJSON_INCLUDE_DIR ${RapidJSON_INCLUDE_DIRS}) diff --git a/ci/conan/all/patches/2.0.0-0002-jemalloc.patch b/ci/conan/all/patches/2.0.0-0002-jemalloc.patch deleted file mode 100644 index f1ff9eee78b..00000000000 --- a/ci/conan/all/patches/2.0.0-0002-jemalloc.patch +++ /dev/null @@ -1,65 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - ---- cpp/cmake_modules/ThirdpartyToolchain.cmake -+++ cpp/cmake_modules/ThirdpartyToolchain.cmake -@@ -1461,6 +1461,6 @@ - # jemalloc - Unix-only high-performance allocator -- - if(ARROW_JEMALLOC) -+if(0) - message(STATUS "Building (vendored) jemalloc from source") - # We only use a vendored jemalloc as we want to control its version. - # Also our build of jemalloc is specially prefixed so that it will not -@@ -1519,6 +1519,8 @@ - add_dependencies(jemalloc::jemalloc jemalloc_ep) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS jemalloc::jemalloc) -+else() -+ find_package(jemalloc REQUIRED) -+endif() - endif() -- - # ---------------------------------------------------------------------- - # mimalloc - Cross-platform high-performance allocator, from Microsoft ---- cpp/src/arrow/CMakeLists.txt -+++ cpp/src/arrow/CMakeLists.txt -@@ -307,7 +307,7 @@ - - set(_allocator_dependencies "") # Empty list - if(ARROW_JEMALLOC) -- list(APPEND _allocator_dependencies jemalloc_ep) -+ list(APPEND _allocator_dependencies jemalloc::jemalloc) - endif() - if(ARROW_MIMALLOC) - list(APPEND _allocator_dependencies mimalloc_ep) ---- cpp/src/arrow/memory_pool.cc -+++ cpp/src/arrow/memory_pool.cc -@@ -31,7 +31,7 @@ - // Needed to support jemalloc 3 and 4 - #define JEMALLOC_MANGLE - // Explicitly link to our version of jemalloc --#include "jemalloc_ep/dist/include/jemalloc/jemalloc.h" -+#include "jemalloc/jemalloc.h" - #endif - - #ifdef ARROW_MIMALLOC diff --git a/ci/conan/all/patches/2.0.0-0006-gandiva-llvm-re2.patch b/ci/conan/all/patches/2.0.0-0006-gandiva-llvm-re2.patch deleted file mode 100644 index 5e87d541a9d..00000000000 --- a/ci/conan/all/patches/2.0.0-0006-gandiva-llvm-re2.patch +++ /dev/null @@ -1,100 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - ---- cpp/CMakeLists.txt -+++ cpp/CMakeLists.txt -@@ -109,7 +109,7 @@ set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support") - set(ARROW_CMAKE_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}") - set(ARROW_DOC_DIR "share/doc/${PROJECT_NAME}") - --set(ARROW_LLVM_VERSIONS "10" "9" "8" "7") -+set(ARROW_LLVM_VERSIONS "12" "11" "10" "9" "8" "7") - list(GET ARROW_LLVM_VERSIONS 0 ARROW_LLVM_VERSION_PRIMARY) - string(REGEX - REPLACE "^([0-9]+)(\\..+)?" 
"\\1" ARROW_LLVM_VERSION_PRIMARY_MAJOR - ---- cpp/cmake_modules/ThirdpartyToolchain.cmake -+++ cpp/cmake_modules/ThirdpartyToolchain.cmake -@@ -2092,10 +2092,11 @@ macro(build_re2) - endmacro() - - if(ARROW_GANDIVA) -- resolve_dependency(RE2) -+ find_package(re2 REQUIRED) -+ resolve_dependency(re2) - - # TODO: Don't use global includes but rather target_include_directories -- get_target_property(RE2_INCLUDE_DIR RE2::re2 INTERFACE_INCLUDE_DIRECTORIES) -+ get_target_property(RE2_INCLUDE_DIR re2::re2 INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${RE2_INCLUDE_DIR}) - endif() - ---- cpp/src/gandiva/CMakeLists.txt -+++ cpp/src/gandiva/CMakeLists.txt -@@ -25,8 +25,14 @@ add_custom_target(gandiva-benchmarks) - - add_dependencies(gandiva-all gandiva gandiva-tests gandiva-benchmarks) - -+# Now LLVMAlt is only for finding clang/llvm-link - find_package(LLVMAlt REQUIRED) - -+find_package(llvm-core REQUIRED) -+ -+string(REPLACE "." ";" VERSION_LIST ${llvm-core_VERSION}) -+list(GET VERSION_LIST 0 LLVM_VERSION_MAJOR) -+ - if(LLVM_VERSION_MAJOR LESS "10") - set(GANDIVA_CXX_STANDARD ${CMAKE_CXX_STANDARD}) - else() -@@ -88,9 +94,9 @@ set(SRC_FILES - random_generator_holder.cc - ${GANDIVA_PRECOMPILED_CC_PATH}) - --set(GANDIVA_SHARED_PRIVATE_LINK_LIBS arrow_shared LLVM::LLVM_INTERFACE RE2::re2) -+set(GANDIVA_SHARED_PRIVATE_LINK_LIBS arrow_shared llvm-core::llvm-core re2::re2) - --set(GANDIVA_STATIC_LINK_LIBS arrow_static LLVM::LLVM_INTERFACE RE2::re2) -+set(GANDIVA_STATIC_LINK_LIBS arrow_static llvm-core::llvm-core re2::re2) - - if(ARROW_GANDIVA_STATIC_LIBSTDCPP - AND (CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX)) -@@ -131,7 +137,7 @@ add_arrow_lib(gandiva - arrow_dependencies - precompiled - EXTRA_INCLUDES -- $ -+ $ - SHARED_LINK_FLAGS - ${GANDIVA_SHARED_LINK_FLAGS} - SHARED_LINK_LIBS -@@ -225,9 +231,9 @@ add_gandiva_test(internals-test - decimal_type_util_test.cc - random_generator_holder_test.cc - EXTRA_DEPENDENCIES -- LLVM::LLVM_INTERFACE -+ llvm-core::llvm-core - EXTRA_INCLUDES -- $ -+ $ - ${GANDIVA_INTERNALS_TEST_ARGUMENTS}) - - if(ARROW_GANDIVA_JAVA) diff --git a/ci/conan/all/patches/2.0.0-0007-fix-protoc-cmake.patch b/ci/conan/all/patches/2.0.0-0007-fix-protoc-cmake.patch deleted file mode 100644 index c7157f06e0c..00000000000 --- a/ci/conan/all/patches/2.0.0-0007-fix-protoc-cmake.patch +++ /dev/null @@ -1,33 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- ---- cpp/cmake_modules/ThirdpartyToolchain.cmake -+++ cpp/cmake_modules/ThirdpartyToolchain.cmake -@@ -1452,7 +1452,7 @@ if(ARROW_WITH_PROTOBUF) - message(STATUS "Found protoc: ${PROTOBUF_PROTOC_EXECUTABLE}") - # Protobuf_PROTOC_LIBRARY is set by all versions of FindProtobuf.cmake - message(STATUS "Found libprotoc: ${Protobuf_PROTOC_LIBRARY}") -- get_target_property(PROTOBUF_LIBRARY ${ARROW_PROTOBUF_LIBPROTOBUF} IMPORTED_LOCATION) -+ # get_target_property(PROTOBUF_LIBRARY ${ARROW_PROTOBUF_LIBPROTOBUF} IMPORTED_LOCATION) - message(STATUS "Found libprotobuf: ${PROTOBUF_LIBRARY}") - message(STATUS "Found protobuf headers: ${PROTOBUF_INCLUDE_DIR}") - endif() diff --git a/ci/conan/all/patches/2.0.0-0008-fix-cmake.patch b/ci/conan/all/patches/2.0.0-0008-fix-cmake.patch new file mode 100644 index 00000000000..abdcf7a0fa3 --- /dev/null +++ b/ci/conan/all/patches/2.0.0-0008-fix-cmake.patch @@ -0,0 +1,295 @@ +MIT License + +Copyright (c) 2019 Conan.io + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt +index 515e6af..7488161 100644 +--- a/cpp/CMakeLists.txt ++++ b/cpp/CMakeLists.txt +@@ -109,7 +109,7 @@ set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support") + set(ARROW_CMAKE_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}") + set(ARROW_DOC_DIR "share/doc/${PROJECT_NAME}") + +-set(ARROW_LLVM_VERSIONS "10" "9" "8" "7") ++set(ARROW_LLVM_VERSIONS "13" "12" "11" "10" "9" "8" "7") + list(GET ARROW_LLVM_VERSIONS 0 ARROW_LLVM_VERSION_PRIMARY) + string(REGEX + REPLACE "^([0-9]+)(\\..+)?" 
"\\1" ARROW_LLVM_VERSION_PRIMARY_MAJOR +@@ -667,7 +667,7 @@ endif() + + if(ARROW_WITH_BROTLI) + # Order is important for static linking +- set(ARROW_BROTLI_LIBS Brotli::brotlienc Brotli::brotlidec Brotli::brotlicommon) ++ set(ARROW_BROTLI_LIBS brotli::brotlienc brotli::brotlidec brotli::brotlicommon) + list(APPEND ARROW_LINK_LIBS ${ARROW_BROTLI_LIBS}) + list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_BROTLI_LIBS}) + if(Brotli_SOURCE STREQUAL "SYSTEM") +@@ -683,9 +683,9 @@ if(ARROW_WITH_BZ2) + endif() + + if(ARROW_WITH_LZ4) +- list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4) ++ list(APPEND ARROW_STATIC_LINK_LIBS lz4::lz4) + if(Lz4_SOURCE STREQUAL "SYSTEM") +- list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4) ++ list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS lz4::lz4) + endif() + endif() + +@@ -842,8 +842,14 @@ endif() + + if(ARROW_MIMALLOC) + add_definitions(-DARROW_MIMALLOC) +- list(APPEND ARROW_LINK_LIBS mimalloc::mimalloc) +- list(APPEND ARROW_STATIC_LINK_LIBS mimalloc::mimalloc) ++ if (TARGET mimalloc-static) ++ list(APPEND ARROW_LINK_LIBS mimalloc-static) ++ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc-static) ++ else() ++ list(APPEND ARROW_LINK_LIBS mimalloc) ++ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc) ++ endif() ++ + endif() + + # ---------------------------------------------------------------------- +diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake +index cc37a3c..8fe6db9 100644 +--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake ++++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake +@@ -171,6 +171,7 @@ macro(provide_find_module DEPENDENCY_NAME) + endmacro() + + macro(resolve_dependency DEPENDENCY_NAME) ++if(0) + set(options) + set(one_value_args REQUIRED_VERSION) + cmake_parse_arguments(ARG +@@ -207,6 +208,14 @@ macro(resolve_dependency DEPENDENCY_NAME) + provide_find_module(${DEPENDENCY_NAME}) + list(APPEND ARROW_SYSTEM_DEPENDENCIES ${DEPENDENCY_NAME}) + endif() ++else() ++ if(ARG_REQUIRED_VERSION) ++ find_package(${DEPENDENCY_NAME} ${ARG_REQUIRED_VERSION} REQUIRED) ++ else() ++ find_package(${DEPENDENCY_NAME} REQUIRED) ++ endif() ++ list(APPEND ARROW_SYSTEM_DEPENDENCIES ${DEPENDENCY_NAME}) ++endif() + endmacro() + + # ---------------------------------------------------------------------- +@@ -826,6 +835,7 @@ endif() + # - Tests need Boost at runtime. + # - S3FS and Flight benchmarks need Boost at runtime. 
+ if(ARROW_BUILD_INTEGRATION ++ OR ARROW_BOOST_REQUIRED + OR ARROW_BUILD_TESTS + OR ARROW_GANDIVA + OR (ARROW_FLIGHT AND ARROW_BUILD_BENCHMARKS) +@@ -846,7 +856,7 @@ if(ARROW_BOOST_REQUIRED) + elseif(BOOST_SOURCE STREQUAL "BUNDLED") + build_boost() + elseif(BOOST_SOURCE STREQUAL "SYSTEM") +- find_package(BoostAlt ${ARROW_BOOST_REQUIRED_VERSION} REQUIRED) ++ find_package(Boost ${ARROW_BOOST_REQUIRED_VERSION} REQUIRED) + endif() + + if(TARGET Boost::system) +@@ -973,11 +983,11 @@ macro(build_brotli) + endmacro() + + if(ARROW_WITH_BROTLI) +- resolve_dependency(Brotli) ++ resolve_dependency(brotli) + # TODO: Don't use global includes but rather target_include_directories +- get_target_property(BROTLI_INCLUDE_DIR Brotli::brotlicommon ++ get_target_property(BROTLI_INCLUDE_DIR brotli::brotlicommon + INTERFACE_INCLUDE_DIRECTORIES) +- include_directories(SYSTEM ${BROTLI_INCLUDE_DIR}) ++ include_directories(SYSTEM ${brotli_INCLUDE_DIR}) + endif() + + if(PARQUET_REQUIRE_ENCRYPTION AND NOT ARROW_PARQUET) +@@ -1200,9 +1210,10 @@ if(ARROW_NEED_GFLAGS) + endif() + endif() + # TODO: Don't use global includes but rather target_include_directories +- include_directories(SYSTEM ${GFLAGS_INCLUDE_DIR}) ++ include_directories(SYSTEM ${gflags_INCLUDE_DIR}) ++ set(GFLAGS_LIBRARIES ${gflags_LIBRARIES}) + +- if(NOT TARGET ${GFLAGS_LIBRARIES}) ++ if(0) + if(TARGET gflags-shared) + set(GFLAGS_LIBRARIES gflags-shared) + elseif(TARGET gflags_shared) +@@ -1291,12 +1302,13 @@ endmacro() + if(ARROW_WITH_THRIFT) + # We already may have looked for Thrift earlier, when considering whether + # to build Boost, so don't look again if already found. +- if(NOT Thrift_FOUND AND NOT THRIFT_FOUND) ++ if(0) + # Thrift c++ code generated by 0.13 requires 0.11 or greater + resolve_dependency(Thrift REQUIRED_VERSION 0.11.0) + endif() ++ find_package(Thrift CONFIG REQUIRED) + # TODO: Don't use global includes but rather target_include_directories +- include_directories(SYSTEM ${THRIFT_INCLUDE_DIR}) ++ include_directories(SYSTEM ${Thrift_INCLUDE_DIR}) + endif() + + # ---------------------------------------------------------------------- +@@ -1461,6 +1473,7 @@ endif() + # jemalloc - Unix-only high-performance allocator + + if(ARROW_JEMALLOC) ++if(0) + message(STATUS "Building (vendored) jemalloc from source") + # We only use a vendored jemalloc as we want to control its version. + # Also our build of jemalloc is specially prefixed so that it will not +@@ -1519,12 +1532,18 @@ if(ARROW_JEMALLOC) + add_dependencies(jemalloc::jemalloc jemalloc_ep) + + list(APPEND ARROW_BUNDLED_STATIC_LIBS jemalloc::jemalloc) ++else() ++ find_package(jemalloc REQUIRED CONFIG) ++ include_directories(SYSTEM "${jemalloc_INCLUDE_DIR}") ++ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${jemalloc_LIBRARIES_TARGETS} ) ++endif() + endif() + + # ---------------------------------------------------------------------- + # mimalloc - Cross-platform high-performance allocator, from Microsoft + + if(ARROW_MIMALLOC) ++if(0) + message(STATUS "Building (vendored) mimalloc from source") + # We only use a vendored mimalloc as we want to control its build options. 
+ +@@ -1572,6 +1591,11 @@ if(ARROW_MIMALLOC) + add_dependencies(toolchain mimalloc_ep) + + list(APPEND ARROW_BUNDLED_STATIC_LIBS mimalloc::mimalloc) ++else() ++ find_package(mimalloc REQUIRED CONFIG) ++ include_directories(SYSTEM "${mimalloc_INCLUDE_DIR}") ++ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${mimalloc_LIBRARIES_TARGETS} ) ++endif() + endif() + + # ---------------------------------------------------------------------- +@@ -1971,11 +1995,16 @@ macro(build_lz4) + endmacro() + + if(ARROW_WITH_LZ4) +- resolve_dependency(Lz4) ++ resolve_dependency(lz4) + + # TODO: Don't use global includes but rather target_include_directories +- get_target_property(LZ4_INCLUDE_DIR LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) +- include_directories(SYSTEM ${LZ4_INCLUDE_DIR}) ++ if(TARGET LZ4::lz4_static) ++ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_static INTERFACE_INCLUDE_DIRECTORIES) ++ else() ++ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_shared INTERFACE_INCLUDE_DIRECTORIES) ++ endif() ++ include_directories(SYSTEM ${lz4_INCLUDE_DIR}) ++ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${lz4_LIBRARIES_TARGETS} ) + endif() + + macro(build_zstd) +@@ -2090,10 +2119,10 @@ macro(build_re2) + endmacro() + + if(ARROW_GANDIVA) +- resolve_dependency(RE2) ++ resolve_dependency(re2) + + # TODO: Don't use global includes but rather target_include_directories +- get_target_property(RE2_INCLUDE_DIR RE2::re2 INTERFACE_INCLUDE_DIRECTORIES) ++ get_target_property(RE2_INCLUDE_DIR re2::re2 INTERFACE_INCLUDE_DIRECTORIES) + include_directories(SYSTEM ${RE2_INCLUDE_DIR}) + endif() + +@@ -2541,17 +2570,24 @@ if(ARROW_WITH_GRPC) + endif() + + # TODO: Don't use global includes but rather target_include_directories +- get_target_property(GRPC_INCLUDE_DIR gRPC::grpc INTERFACE_INCLUDE_DIRECTORIES) ++ if(grpc_INCLUDE_DIRS_RELEASE) ++ set(GRPC_INCLUDE_DIR ${grpc_INCLUDE_DIRS_RELEASE}) ++ elseif(grpc_INCLUDE_DIRS_DEBUG) ++ set(GRPC_INCLUDE_DIR ${grpc_INCLUDE_DIRS_DEBUG}) ++ endif() ++ + include_directories(SYSTEM ${GRPC_INCLUDE_DIR}) ++ include_directories(SYSTEM ${absl_INCLUDE_DIR}) ++ include_directories(SYSTEM ${protobuf_INCLUDE_DIR}) + + if(GRPC_VENDORED) + set(GRPCPP_PP_INCLUDE TRUE) + else() + # grpc++ headers may reside in ${GRPC_INCLUDE_DIR}/grpc++ or ${GRPC_INCLUDE_DIR}/grpcpp + # depending on the gRPC version. 
+- if(EXISTS "${GRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h") ++ if(EXISTS ${gRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h) + set(GRPCPP_PP_INCLUDE TRUE) +- elseif(EXISTS "${GRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h") ++ elseif(EXISTS ${gRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h) + set(GRPCPP_PP_INCLUDE FALSE) + else() + message(FATAL_ERROR "Cannot find grpc++ headers in ${GRPC_INCLUDE_DIR}") +diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt +index 2751254..842fc9e 100644 +--- a/cpp/src/arrow/CMakeLists.txt ++++ b/cpp/src/arrow/CMakeLists.txt +@@ -307,10 +307,14 @@ set(ARROW_TESTING_SRCS + + set(_allocator_dependencies "") # Empty list + if(ARROW_JEMALLOC) +- list(APPEND _allocator_dependencies jemalloc_ep) ++ list(APPEND _allocator_dependencies jemalloc::jemalloc) + endif() + if(ARROW_MIMALLOC) +- list(APPEND _allocator_dependencies mimalloc_ep) ++ if (TARGET mimalloc-static) ++ list(APPEND _allocator_dependencies mimalloc-static) ++ else() ++ list(APPEND _allocator_dependencies mimalloc) ++ endif() + endif() + + if(_allocator_dependencies) +diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc +index 784bf7b..8f005a5 100644 +--- a/cpp/src/arrow/memory_pool.cc ++++ b/cpp/src/arrow/memory_pool.cc +@@ -31,7 +31,7 @@ + // Needed to support jemalloc 3 and 4 + #define JEMALLOC_MANGLE + // Explicitly link to our version of jemalloc +-#include "jemalloc_ep/dist/include/jemalloc/jemalloc.h" ++#include "jemalloc/jemalloc.h" + #endif + + #ifdef ARROW_MIMALLOC diff --git a/ci/conan/all/patches/7.0.0-0001-cmake.patch b/ci/conan/all/patches/7.0.0-0001-cmake.patch deleted file mode 100644 index 0c7638d67ab..00000000000 --- a/ci/conan/all/patches/7.0.0-0001-cmake.patch +++ /dev/null @@ -1,35 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake -index 0a43ec1..c468d48 100644 ---- a/cpp/cmake_modules/DefineOptions.cmake -+++ b/cpp/cmake_modules/DefineOptions.cmake -@@ -82,7 +82,7 @@ macro(define_option_string name description default) - endmacro() - - # Top level cmake dir --if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") -+if(1) - #---------------------------------------------------------------------- - set_option_category("Compile and link") - diff --git a/ci/conan/all/patches/7.0.0-0002-jemalloc.patch b/ci/conan/all/patches/7.0.0-0002-jemalloc.patch deleted file mode 100644 index 0deaba80a87..00000000000 --- a/ci/conan/all/patches/7.0.0-0002-jemalloc.patch +++ /dev/null @@ -1,48 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt -index b984bc1..84975e2 100644 ---- a/cpp/src/arrow/CMakeLists.txt -+++ b/cpp/src/arrow/CMakeLists.txt -@@ -323,7 +323,7 @@ set(ARROW_TESTING_SRCS - - set(_allocator_dependencies "") # Empty list - if(ARROW_JEMALLOC) -- list(APPEND _allocator_dependencies jemalloc_ep) -+ list(APPEND _allocator_dependencies jemalloc::jemalloc) - endif() - if(ARROW_MIMALLOC) - list(APPEND _allocator_dependencies mimalloc_ep) -diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc -index cf8bf64..cf8966b 100644 ---- a/cpp/src/arrow/memory_pool.cc -+++ b/cpp/src/arrow/memory_pool.cc -@@ -48,7 +48,7 @@ - // Needed to support jemalloc 3 and 4 - #define JEMALLOC_MANGLE - // Explicitly link to our version of jemalloc --#include "jemalloc_ep/dist/include/jemalloc/jemalloc.h" -+#include "jemalloc/jemalloc.h" - #endif - - #ifdef ARROW_MIMALLOC diff --git a/ci/conan/all/patches/7.0.0-0004-remove-find-modules.patch b/ci/conan/all/patches/7.0.0-0004-remove-find-modules.patch deleted file mode 100644 index f0b299479e2..00000000000 --- a/ci/conan/all/patches/7.0.0-0004-remove-find-modules.patch +++ /dev/null @@ -1,22 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - diff --git a/ci/conan/all/patches/7.0.0-0005-use-find-package.patch b/ci/conan/all/patches/7.0.0-0005-use-find-package.patch deleted file mode 100644 index 0759339c23a..00000000000 --- a/ci/conan/all/patches/7.0.0-0005-use-find-package.patch +++ /dev/null @@ -1,440 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt -index 2d7baf1..c2e86e0 100644 ---- a/cpp/CMakeLists.txt -+++ b/cpp/CMakeLists.txt -@@ -715,7 +715,7 @@ if(ARROW_WITH_BZ2) - endif() - - if(ARROW_WITH_LZ4) -- list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4) -+ list(APPEND ARROW_STATIC_LINK_LIBS lz4::lz4) - if(Lz4_SOURCE STREQUAL "SYSTEM") - list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4) - endif() -@@ -901,8 +901,8 @@ endif() - if(ARROW_JEMALLOC) - add_definitions(-DARROW_JEMALLOC) - add_definitions(-DARROW_JEMALLOC_INCLUDE_DIR=${JEMALLOC_INCLUDE_DIR}) -- list(APPEND ARROW_LINK_LIBS jemalloc::jemalloc) -- list(APPEND ARROW_STATIC_LINK_LIBS jemalloc::jemalloc) -+ list(APPEND ARROW_LINK_LIBS jemalloc) -+ list(APPEND ARROW_STATIC_LINK_LIBS jemalloc) - endif() - - if(ARROW_MIMALLOC) -diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake -index bc38952..84fc279 100644 ---- a/cpp/cmake_modules/ThirdpartyToolchain.cmake -+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake -@@ -953,14 +953,7 @@ else() - endif() - - if(ARROW_BOOST_REQUIRED) -- resolve_dependency(Boost -- HAVE_ALT -- TRUE -- REQUIRED_VERSION -- ${ARROW_BOOST_REQUIRED_VERSION} -- IS_RUNTIME_DEPENDENCY -- # libarrow.so doesn't depend on libboost*. -- FALSE) -+ find_package(Boost CONFIG REQUIRED) - - if(TARGET Boost::system) - set(BOOST_SYSTEM_LIBRARY Boost::system) -@@ -1038,6 +1031,7 @@ macro(build_snappy) - endmacro() - - if(ARROW_WITH_SNAPPY) -+ if(0) - resolve_dependency(Snappy PC_PACKAGE_NAMES snappy) - if(${Snappy_SOURCE} STREQUAL "SYSTEM" AND NOT snappy_PC_FOUND) - get_target_property(SNAPPY_LIB Snappy::snappy IMPORTED_LOCATION) -@@ -1046,6 +1040,8 @@ if(ARROW_WITH_SNAPPY) - # TODO: Don't use global includes but rather target_include_directories - get_target_property(SNAPPY_INCLUDE_DIRS Snappy::snappy INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${SNAPPY_INCLUDE_DIRS}) -+ endif() -+ find_package(Snappy REQUIRED) - endif() - - # ---------------------------------------------------------------------- -@@ -1108,7 +1104,7 @@ macro(build_brotli) - endmacro() - - if(ARROW_WITH_BROTLI) -- resolve_dependency(Brotli PC_PACKAGE_NAMES libbrotlidec libbrotlienc) -+ find_package(Brotli REQUIRED) - # TODO: Don't use global includes but rather target_include_directories - get_target_property(BROTLI_INCLUDE_DIR Brotli::brotlicommon - INTERFACE_INCLUDE_DIRECTORIES) -@@ -1156,6 +1152,15 @@ if(PARQUET_REQUIRE_ENCRYPTION - set(OpenSSL_USE_STATIC_LIBS ON) - set(OPENSSL_USE_STATIC_LIBS ON) - find_package(OpenSSL ${ARROW_OPENSSL_REQUIRED_VERSION} REQUIRED) -+ find_package(OpenSSL REQUIRED CONFIG) -+ message("OPENSSL_FOUND: ${OPENSSL_FOUND}") -+ message("OPENSSL_INCLUDE_DIR: ${OPENSSL_INCLUDE_DIR}") -+ message("OPENSSL_CRYPTO_LIBRARY: ${OPENSSL_CRYPTO_LIBRARY}") -+ message("OPENSSL_CRYPTO_LIBRARIES: ${OPENSSL_CRYPTO_LIBRARIES}") -+ message("OPENSSL_SSL_LIBRARY: ${OPENSSL_SSL_LIBRARY}") -+ message("OPENSSL_SSL_LIBRARIES: ${OPENSSL_SSL_LIBRARIES}") -+ message("OPENSSL_LIBRARIES: ${OPENSSL_LIBRARIES}") -+ message("OPENSSL_VERSION: ${OPENSSL_VERSION}") - endif() - set(ARROW_USE_OPENSSL ON) - endif() -@@ -1228,10 +1233,13 @@ macro(build_glog) - endmacro() - - if(ARROW_USE_GLOG) -+ if(0) - resolve_dependency(GLOG 
PC_PACKAGE_NAMES libglog) - # TODO: Don't use global includes but rather target_include_directories - get_target_property(GLOG_INCLUDE_DIR glog::glog INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${GLOG_INCLUDE_DIR}) -+ endif() -+ find_package(glog REQUIRED) - endif() - - # ---------------------------------------------------------------------- -@@ -1300,17 +1308,11 @@ macro(build_gflags) - endmacro() - - if(ARROW_NEED_GFLAGS) -- set(ARROW_GFLAGS_REQUIRED_VERSION "2.1.0") -- resolve_dependency(gflags -- HAVE_ALT -- TRUE -- REQUIRED_VERSION -- ${ARROW_GFLAGS_REQUIRED_VERSION} -- IS_RUNTIME_DEPENDENCY -- FALSE) -+ find_package(gflags REQUIRED) - # TODO: Don't use global includes but rather target_include_directories - include_directories(SYSTEM ${GFLAGS_INCLUDE_DIR}) - -+if(0) - if(NOT TARGET ${GFLAGS_LIBRARIES}) - if(TARGET gflags-shared) - set(GFLAGS_LIBRARIES gflags-shared) -@@ -1318,6 +1320,10 @@ if(ARROW_NEED_GFLAGS) - set(GFLAGS_LIBRARIES gflags_shared) - endif() - endif() -+else() -+ set(GFLAGS_LIBRARIES gflags::gflags) -+endif() -+ - endif() - - # ---------------------------------------------------------------------- -@@ -1400,6 +1406,7 @@ macro(build_thrift) - endmacro() - - if(ARROW_WITH_THRIFT) -+if (0) - # We already may have looked for Thrift earlier, when considering whether - # to build Boost, so don't look again if already found. - if(NOT Thrift_FOUND) -@@ -1412,6 +1419,9 @@ if(ARROW_WITH_THRIFT) - endif() - # TODO: Don't use global includes but rather target_include_directories - include_directories(SYSTEM ${THRIFT_INCLUDE_DIR}) -+else() -+ find_package(Thrift REQUIRED CONFIG) -+endif() - - string(REPLACE "." ";" VERSION_LIST ${THRIFT_VERSION}) - list(GET VERSION_LIST 0 THRIFT_VERSION_MAJOR) -@@ -1606,7 +1616,7 @@ if(ARROW_JEMALLOC) - # conflict with the default allocator as well as other jemalloc - # installations. - # find_package(jemalloc) -- -+ if (0) - set(ARROW_JEMALLOC_USE_SHARED OFF) - set(JEMALLOC_PREFIX - "${CMAKE_CURRENT_BINARY_DIR}/jemalloc_ep-prefix/src/jemalloc_ep/dist/") -@@ -1664,6 +1674,9 @@ if(ARROW_JEMALLOC) - "${CMAKE_CURRENT_BINARY_DIR}/jemalloc_ep-prefix/src") - add_dependencies(jemalloc::jemalloc jemalloc_ep) - -+ endif() -+ find_package(jemalloc REQUIRED) -+ - list(APPEND ARROW_BUNDLED_STATIC_LIBS jemalloc::jemalloc) - endif() - -@@ -1671,6 +1684,8 @@ endif() - # mimalloc - Cross-platform high-performance allocator, from Microsoft - - if(ARROW_MIMALLOC) -+ if (0) -+ - message(STATUS "Building (vendored) mimalloc from source") - # We only use a vendored mimalloc as we want to control its build options. 
- -@@ -1715,6 +1730,13 @@ if(ARROW_MIMALLOC) - add_dependencies(mimalloc::mimalloc mimalloc_ep) - add_dependencies(toolchain mimalloc_ep) - -+ else() -+ -+ find_package(mimalloc CONFIG REQUIRED) -+ add_dependencies(toolchain mimalloc::mimalloc) -+ -+ endif() -+ - list(APPEND ARROW_BUNDLED_STATIC_LIBS mimalloc::mimalloc) - endif() - -@@ -1999,6 +2021,7 @@ macro(build_rapidjson) - endmacro() - - if(ARROW_WITH_RAPIDJSON) -+if(0) - set(ARROW_RAPIDJSON_REQUIRED_VERSION "1.1.0") - resolve_dependency(RapidJSON - HAVE_ALT -@@ -2011,6 +2034,10 @@ if(ARROW_WITH_RAPIDJSON) - if(RapidJSON_INCLUDE_DIR) - set(RAPIDJSON_INCLUDE_DIR "${RapidJSON_INCLUDE_DIR}") - endif() -+else() -+ find_package(RapidJSON REQUIRED) -+ set(RAPIDJSON_INCLUDE_DIR "${RapidJSON_INCLUDE_DIR}") -+endif() - - # TODO: Don't use global includes but rather target_include_directories - include_directories(SYSTEM ${RAPIDJSON_INCLUDE_DIR}) -@@ -2036,10 +2063,21 @@ macro(build_xsimd) - set(XSIMD_VENDORED TRUE) - endmacro() - --if((NOT ARROW_SIMD_LEVEL STREQUAL "NONE") OR (NOT ARROW_RUNTIME_SIMD_LEVEL STREQUAL "NONE" -- )) -+if((NOT ARROW_SIMD_LEVEL STREQUAL "NONE") OR (NOT ARROW_RUNTIME_SIMD_LEVEL STREQUAL "NONE")) -+ -+ if (0) -+ - set(xsimd_SOURCE "BUNDLED") - resolve_dependency(xsimd) -+ -+ else() -+ -+ find_package(xsimd) -+ set(XSIMD_INCLUDE_DIR "${xsimd_INCLUDE_DIR}") -+ add_dependencies(toolchain xsimd) -+ -+ endif() -+ - # TODO: Don't use global includes but rather target_include_directories - include_directories(SYSTEM ${XSIMD_INCLUDE_DIR}) - endif() -@@ -2082,11 +2120,14 @@ macro(build_zlib) - endmacro() - - if(ARROW_WITH_ZLIB) -+ if(0) - resolve_dependency(ZLIB PC_PACKAGE_NAMES zlib) - - # TODO: Don't use global includes but rather target_include_directories - get_target_property(ZLIB_INCLUDE_DIR ZLIB::ZLIB INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${ZLIB_INCLUDE_DIR}) -+ endif() -+ find_package(ZLIB REQUIRED) - endif() - - macro(build_lz4) -@@ -2140,11 +2181,14 @@ macro(build_lz4) - endmacro() - - if(ARROW_WITH_LZ4) -+ if(0) - resolve_dependency(Lz4 PC_PACKAGE_NAMES liblz4) - - # TODO: Don't use global includes but rather target_include_directories - get_target_property(LZ4_INCLUDE_DIR LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${LZ4_INCLUDE_DIR}) -+ endif() -+ find_package(lz4 REQUIRED) - endif() - - macro(build_zstd) -@@ -2205,6 +2249,7 @@ macro(build_zstd) - endmacro() - - if(ARROW_WITH_ZSTD) -+ if(0) - # ARROW-13384: ZSTD_minCLevel was added in v1.4.0, required by ARROW-13091 - resolve_dependency(zstd - PC_PACKAGE_NAMES -@@ -2232,6 +2277,8 @@ if(ARROW_WITH_ZSTD) - get_target_property(ZSTD_INCLUDE_DIR ${ARROW_ZSTD_LIBZSTD} - INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${ZSTD_INCLUDE_DIR}) -+ endif() -+ find_package(zstd REQUIRED) - endif() - - # ---------------------------------------------------------------------- -@@ -2271,6 +2318,7 @@ macro(build_re2) - endmacro() - - if(ARROW_WITH_RE2) -+ if(0) - # Don't specify "PC_PACKAGE_NAMES re2" here because re2.pc may - # include -std=c++11. It's not compatible with C source and C++ - # source not uses C++ 11. 
-@@ -2284,6 +2332,8 @@ if(ARROW_WITH_RE2) - # TODO: Don't use global includes but rather target_include_directories - get_target_property(RE2_INCLUDE_DIR re2::re2 INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${RE2_INCLUDE_DIR}) -+ endif() -+ find_package(re2 REQUIRED) - endif() - - macro(build_bzip2) -@@ -2335,10 +2385,7 @@ macro(build_bzip2) - endmacro() - - if(ARROW_WITH_BZ2) -- resolve_dependency(BZip2) -- if(${BZip2_SOURCE} STREQUAL "SYSTEM") -- string(APPEND ARROW_PC_LIBS_PRIVATE " ${BZIP2_LIBRARIES}") -- endif() -+ find_package(BZip2 REQUIRED) - - if(NOT TARGET BZip2::BZip2) - add_library(BZip2::BZip2 UNKNOWN IMPORTED) -@@ -2390,11 +2437,7 @@ macro(build_utf8proc) - endmacro() - - if(ARROW_WITH_UTF8PROC) -- resolve_dependency(utf8proc -- REQUIRED_VERSION -- "2.2.0" -- PC_PACKAGE_NAMES -- libutf8proc) -+ find_package(utf8proc REQUIRED CONFIG) - - add_definitions(-DARROW_WITH_UTF8PROC) - -@@ -3554,33 +3597,12 @@ if(ARROW_WITH_GRPC) - message(STATUS "Forcing gRPC_SOURCE to Protobuf_SOURCE (${Protobuf_SOURCE})") - set(gRPC_SOURCE "${Protobuf_SOURCE}") - endif() -- resolve_dependency(gRPC -- HAVE_ALT -- TRUE -- REQUIRED_VERSION -- ${ARROW_GRPC_REQUIRED_VERSION} -- PC_PACKAGE_NAMES -- grpc++) -+ find_package(gRPC CONFIG REQUIRED) - - # TODO: Don't use global includes but rather target_include_directories - get_target_property(GRPC_INCLUDE_DIR gRPC::grpc++ INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${GRPC_INCLUDE_DIR}) - -- if(GRPC_VENDORED) -- set(GRPCPP_PP_INCLUDE TRUE) -- # Examples need to link to static Arrow if we're using static gRPC -- set(ARROW_GRPC_USE_SHARED OFF) -- else() -- # grpc++ headers may reside in ${GRPC_INCLUDE_DIR}/grpc++ or ${GRPC_INCLUDE_DIR}/grpcpp -- # depending on the gRPC version. -- if(EXISTS "${GRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h") -- set(GRPCPP_PP_INCLUDE TRUE) -- elseif(EXISTS "${GRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h") -- set(GRPCPP_PP_INCLUDE FALSE) -- else() -- message(FATAL_ERROR "Cannot find grpc++ headers in ${GRPC_INCLUDE_DIR}") -- endif() -- endif() - endif() - - # ---------------------------------------------------------------------- -@@ -3770,7 +3792,12 @@ macro(build_google_cloud_cpp_storage) - endmacro() - - if(ARROW_WITH_GOOGLE_CLOUD_CPP) -+if(0) - resolve_dependency(google_cloud_cpp_storage) -+else() -+ find_package(google-cloud-cpp REQUIRED) -+endif() -+ - get_target_property(google_cloud_cpp_storage_INCLUDE_DIR google-cloud-cpp::storage - INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${google_cloud_cpp_storage_INCLUDE_DIR}) -@@ -4097,11 +4124,15 @@ macro(build_opentelemetry) - endmacro() - - if(ARROW_WITH_OPENTELEMETRY) -+if(0) - set(opentelemetry-cpp_SOURCE "AUTO") - resolve_dependency(opentelemetry-cpp) - get_target_property(OPENTELEMETRY_INCLUDE_DIR opentelemetry-cpp::api - INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${OPENTELEMETRY_INCLUDE_DIR}) -+else() -+ find_package(opentelemetry-cpp REQUIRED) -+endif() - message(STATUS "Found OpenTelemetry headers: ${OPENTELEMETRY_INCLUDE_DIR}") - endif() - -diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt -index 84975e2..7779c08 100644 ---- a/cpp/src/arrow/CMakeLists.txt -+++ b/cpp/src/arrow/CMakeLists.txt -@@ -575,6 +575,10 @@ foreach(LIB_TARGET ${ARROW_LIBRARIES}) - target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_EXPORTING) - endforeach() - -+if(ARROW_BUILD_SHARED AND WIN32) -+ target_compile_definitions(arrow_shared PRIVATE ARROW_EXPORTING) -+endif() -+ - 
if(ARROW_WITH_BACKTRACE) - find_package(Backtrace) - -@@ -585,6 +589,7 @@ if(ARROW_WITH_BACKTRACE) - endforeach() - endif() - -+if(0) - if(ARROW_BUILD_BUNDLED_DEPENDENCIES) - arrow_car(_FIRST_LIB ${ARROW_BUNDLED_STATIC_LIBS}) - arrow_cdr(_OTHER_LIBS ${ARROW_BUNDLED_STATIC_LIBS}) -@@ -596,6 +601,7 @@ if(ARROW_BUILD_BUNDLED_DEPENDENCIES) - TO_MERGE - ${_OTHER_LIBS}) - endif() -+endif() - - if(ARROW_TESTING) - # that depend on gtest diff --git a/ci/conan/all/patches/7.0.0-0007-fix-cmake.patch b/ci/conan/all/patches/7.0.0-0007-fix-cmake.patch new file mode 100644 index 00000000000..eb2acb1523f --- /dev/null +++ b/ci/conan/all/patches/7.0.0-0007-fix-cmake.patch @@ -0,0 +1,369 @@ +MIT License + +Copyright (c) 2019 Conan.io + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt +index 2d7baf1..dff5b1a 100644 +--- a/cpp/CMakeLists.txt ++++ b/cpp/CMakeLists.txt +@@ -699,7 +699,7 @@ endif() + + if(ARROW_WITH_BROTLI) + # Order is important for static linking +- set(ARROW_BROTLI_LIBS Brotli::brotlienc Brotli::brotlidec Brotli::brotlicommon) ++ set(ARROW_BROTLI_LIBS brotli::brotlienc brotli::brotlidec brotli::brotlicommon) + list(APPEND ARROW_LINK_LIBS ${ARROW_BROTLI_LIBS}) + list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_BROTLI_LIBS}) + if(Brotli_SOURCE STREQUAL "SYSTEM") +@@ -715,10 +715,17 @@ if(ARROW_WITH_BZ2) + endif() + + if(ARROW_WITH_LZ4) +- list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4) +- if(Lz4_SOURCE STREQUAL "SYSTEM") +- list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4) +- endif() ++ if (TARGET LZ4::lz4_static) ++ list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4_static) ++ if(Lz4_SOURCE STREQUAL "SYSTEM") ++ list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4_static) ++ endif() ++ else() ++ list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4_shared) ++ if(Lz4_SOURCE STREQUAL "SYSTEM") ++ list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4_shared) ++ endif() ++endif() + endif() + + if(ARROW_WITH_SNAPPY) +@@ -907,8 +914,13 @@ endif() + + if(ARROW_MIMALLOC) + add_definitions(-DARROW_MIMALLOC) +- list(APPEND ARROW_LINK_LIBS mimalloc::mimalloc) +- list(APPEND ARROW_STATIC_LINK_LIBS mimalloc::mimalloc) ++ if (TARGET mimalloc-static) ++ list(APPEND ARROW_LINK_LIBS mimalloc-static) ++ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc-static) ++ else() ++ list(APPEND ARROW_LINK_LIBS mimalloc) ++ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc) ++ endif() + endif() + + # ---------------------------------------------------------------------- +diff --git 
a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake +index bc38952..62bf314 100644 +--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake ++++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake +@@ -954,7 +954,7 @@ endif() + + if(ARROW_BOOST_REQUIRED) + resolve_dependency(Boost +- HAVE_ALT ++ USE_CONFIG + TRUE + REQUIRED_VERSION + ${ARROW_BOOST_REQUIRED_VERSION} +@@ -965,7 +965,7 @@ if(ARROW_BOOST_REQUIRED) + if(TARGET Boost::system) + set(BOOST_SYSTEM_LIBRARY Boost::system) + set(BOOST_FILESYSTEM_LIBRARY Boost::filesystem) +- elseif(BoostAlt_FOUND) ++ elseif(Boost_FOUND) + set(BOOST_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY}) + set(BOOST_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY}) + else() +@@ -1108,9 +1108,9 @@ macro(build_brotli) + endmacro() + + if(ARROW_WITH_BROTLI) +- resolve_dependency(Brotli PC_PACKAGE_NAMES libbrotlidec libbrotlienc) ++ resolve_dependency(brotli PC_PACKAGE_NAMES libbrotlidec libbrotlienc) + # TODO: Don't use global includes but rather target_include_directories +- get_target_property(BROTLI_INCLUDE_DIR Brotli::brotlicommon ++ get_target_property(BROTLI_INCLUDE_DIR brotli::brotlicommon + INTERFACE_INCLUDE_DIRECTORIES) + include_directories(SYSTEM ${BROTLI_INCLUDE_DIR}) + endif() +@@ -1302,22 +1302,17 @@ endmacro() + if(ARROW_NEED_GFLAGS) + set(ARROW_GFLAGS_REQUIRED_VERSION "2.1.0") + resolve_dependency(gflags +- HAVE_ALT ++ USE_CONFIG + TRUE + REQUIRED_VERSION + ${ARROW_GFLAGS_REQUIRED_VERSION} + IS_RUNTIME_DEPENDENCY + FALSE) + # TODO: Don't use global includes but rather target_include_directories +- include_directories(SYSTEM ${GFLAGS_INCLUDE_DIR}) ++ include_directories(SYSTEM ${gflags_INCLUDE_DIR}) + +- if(NOT TARGET ${GFLAGS_LIBRARIES}) +- if(TARGET gflags-shared) +- set(GFLAGS_LIBRARIES gflags-shared) +- elseif(TARGET gflags_shared) +- set(GFLAGS_LIBRARIES gflags_shared) +- endif() +- endif() ++ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${gflags_LIBRARIES_TARGETS}) ++ set(GFLAGS_LIBRARIES gflags::gflags) + endif() + + # ---------------------------------------------------------------------- +@@ -1411,9 +1406,9 @@ if(ARROW_WITH_THRIFT) + thrift) + endif() + # TODO: Don't use global includes but rather target_include_directories +- include_directories(SYSTEM ${THRIFT_INCLUDE_DIR}) ++ include_directories(SYSTEM ${Thrift_INCLUDE_DIR}) + +- string(REPLACE "." ";" VERSION_LIST ${THRIFT_VERSION}) ++ string(REPLACE "." 
";" VERSION_LIST ${Thrift_VERSION}) + list(GET VERSION_LIST 0 THRIFT_VERSION_MAJOR) + list(GET VERSION_LIST 1 THRIFT_VERSION_MINOR) + list(GET VERSION_LIST 2 THRIFT_VERSION_PATCH) +@@ -1528,6 +1523,7 @@ if(ARROW_WITH_PROTOBUF) + set(ARROW_PROTOBUF_REQUIRED_VERSION "2.6.1") + endif() + resolve_dependency(Protobuf ++ USE_CONFIG + REQUIRED_VERSION + ${ARROW_PROTOBUF_REQUIRED_VERSION} + PC_PACKAGE_NAMES +@@ -1538,7 +1534,7 @@ if(ARROW_WITH_PROTOBUF) + endif() + + # TODO: Don't use global includes but rather target_include_directories +- include_directories(SYSTEM ${PROTOBUF_INCLUDE_DIR}) ++ include_directories(SYSTEM ${protobuf_INCLUDE_DIR}) + + if(TARGET arrow::protobuf::libprotobuf) + set(ARROW_PROTOBUF_LIBPROTOBUF arrow::protobuf::libprotobuf) +@@ -1547,9 +1543,9 @@ if(ARROW_WITH_PROTOBUF) + if(NOT TARGET protobuf::libprotobuf) + add_library(protobuf::libprotobuf UNKNOWN IMPORTED) + set_target_properties(protobuf::libprotobuf +- PROPERTIES IMPORTED_LOCATION "${PROTOBUF_LIBRARY}" ++ PROPERTIES IMPORTED_LOCATION "${Protobuf_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES +- "${PROTOBUF_INCLUDE_DIR}") ++ "${Protobuf_INCLUDE_DIR}") + endif() + set(ARROW_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf) + endif() +@@ -1569,7 +1565,7 @@ if(ARROW_WITH_PROTOBUF) + set_target_properties(protobuf::libprotoc + PROPERTIES IMPORTED_LOCATION "${Protobuf_PROTOC_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES +- "${PROTOBUF_INCLUDE_DIR}") ++ "${Protobuf_INCLUDE_DIR}") + endif() + set(ARROW_PROTOBUF_LIBPROTOC protobuf::libprotoc) + endif() +@@ -1600,6 +1596,7 @@ endif() + # jemalloc - Unix-only high-performance allocator + + if(ARROW_JEMALLOC) ++if(0) + message(STATUS "Building (vendored) jemalloc from source") + # We only use a vendored jemalloc as we want to control its version. + # Also our build of jemalloc is specially prefixed so that it will not +@@ -1665,12 +1662,18 @@ if(ARROW_JEMALLOC) + add_dependencies(jemalloc::jemalloc jemalloc_ep) + + list(APPEND ARROW_BUNDLED_STATIC_LIBS jemalloc::jemalloc) ++else() ++ find_package(jemalloc REQUIRED CONFIG) ++ include_directories(SYSTEM "${jemalloc_INCLUDE_DIR}") ++ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${jemalloc_LIBRARIES_TARGETS}) ++endif() + endif() + + # ---------------------------------------------------------------------- + # mimalloc - Cross-platform high-performance allocator, from Microsoft + + if(ARROW_MIMALLOC) ++if(0) + message(STATUS "Building (vendored) mimalloc from source") + # We only use a vendored mimalloc as we want to control its build options. 
+ +@@ -1716,6 +1719,11 @@ if(ARROW_MIMALLOC) + add_dependencies(toolchain mimalloc_ep) + + list(APPEND ARROW_BUNDLED_STATIC_LIBS mimalloc::mimalloc) ++else() ++ find_package(mimalloc REQUIRED CONFIG) ++ include_directories(SYSTEM "${mimalloc_INCLUDE_DIR}") ++ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${mimalloc_LIBRARIES_TARGETS} ) ++endif() + endif() + + # ---------------------------------------------------------------------- +@@ -2001,7 +2009,7 @@ endmacro() + if(ARROW_WITH_RAPIDJSON) + set(ARROW_RAPIDJSON_REQUIRED_VERSION "1.1.0") + resolve_dependency(RapidJSON +- HAVE_ALT ++ USE_CONFIG + TRUE + REQUIRED_VERSION + ${ARROW_RAPIDJSON_REQUIRED_VERSION} +@@ -2038,10 +2046,9 @@ endmacro() + + if((NOT ARROW_SIMD_LEVEL STREQUAL "NONE") OR (NOT ARROW_RUNTIME_SIMD_LEVEL STREQUAL "NONE" + )) +- set(xsimd_SOURCE "BUNDLED") + resolve_dependency(xsimd) + # TODO: Don't use global includes but rather target_include_directories +- include_directories(SYSTEM ${XSIMD_INCLUDE_DIR}) ++ include_directories(SYSTEM ${xsimd_INCLUDE_DIR}) + endif() + + macro(build_zlib) +@@ -2140,10 +2147,14 @@ macro(build_lz4) + endmacro() + + if(ARROW_WITH_LZ4) +- resolve_dependency(Lz4 PC_PACKAGE_NAMES liblz4) ++ resolve_dependency(lz4) + + # TODO: Don't use global includes but rather target_include_directories +- get_target_property(LZ4_INCLUDE_DIR LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) ++ if (TARGET LZ4::lz4_static) ++ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_static INTERFACE_INCLUDE_DIRECTORIES) ++ else() ++ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_shared INTERFACE_INCLUDE_DIRECTORIES) ++ endif() + include_directories(SYSTEM ${LZ4_INCLUDE_DIR}) + endif() + +@@ -2274,7 +2285,7 @@ if(ARROW_WITH_RE2) + # Don't specify "PC_PACKAGE_NAMES re2" here because re2.pc may + # include -std=c++11. It's not compatible with C source and C++ + # source not uses C++ 11. +- resolve_dependency(re2 HAVE_ALT TRUE) ++ resolve_dependency(re2 USE_CONFIG TRUE) + if(${re2_SOURCE} STREQUAL "SYSTEM") + get_target_property(RE2_LIB re2::re2 IMPORTED_LOCATION) + string(APPEND ARROW_PC_LIBS_PRIVATE " ${RE2_LIB}") +@@ -2337,7 +2348,7 @@ endmacro() + if(ARROW_WITH_BZ2) + resolve_dependency(BZip2) + if(${BZip2_SOURCE} STREQUAL "SYSTEM") +- string(APPEND ARROW_PC_LIBS_PRIVATE " ${BZIP2_LIBRARIES}") ++ string(APPEND ARROW_PC_LIBS_PRIVATE " ${BZip2_LIBRARIES}") + endif() + + if(NOT TARGET BZip2::BZip2) +@@ -2346,7 +2357,7 @@ if(ARROW_WITH_BZ2) + PROPERTIES IMPORTED_LOCATION "${BZIP2_LIBRARIES}" + INTERFACE_INCLUDE_DIRECTORIES "${BZIP2_INCLUDE_DIR}") + endif() +- include_directories(SYSTEM "${BZIP2_INCLUDE_DIR}") ++ include_directories(SYSTEM "${BZip2_INCLUDE_DIR}") + endif() + + macro(build_utf8proc) +@@ -3555,7 +3566,7 @@ if(ARROW_WITH_GRPC) + set(gRPC_SOURCE "${Protobuf_SOURCE}") + endif() + resolve_dependency(gRPC +- HAVE_ALT ++ USE_CONFIG + TRUE + REQUIRED_VERSION + ${ARROW_GRPC_REQUIRED_VERSION} +@@ -3573,9 +3584,9 @@ if(ARROW_WITH_GRPC) + else() + # grpc++ headers may reside in ${GRPC_INCLUDE_DIR}/grpc++ or ${GRPC_INCLUDE_DIR}/grpcpp + # depending on the gRPC version. 
+- if(EXISTS "${GRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h")
++ if(EXISTS ${gRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h)
+ set(GRPCPP_PP_INCLUDE TRUE)
+- elseif(EXISTS "${GRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h")
++ elseif(EXISTS ${gRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h)
+ set(GRPCPP_PP_INCLUDE FALSE)
+ else()
+ message(FATAL_ERROR "Cannot find grpc++ headers in ${GRPC_INCLUDE_DIR}")
+@@ -4097,9 +4108,9 @@ macro(build_opentelemetry)
+ endmacro()
+
+ if(ARROW_WITH_OPENTELEMETRY)
+- set(opentelemetry-cpp_SOURCE "AUTO")
++ set(opentelemetry-cpp_SOURCE "SYSTEM")
+ resolve_dependency(opentelemetry-cpp)
+- get_target_property(OPENTELEMETRY_INCLUDE_DIR opentelemetry-cpp::api
++ get_target_property(OPENTELEMETRY_INCLUDE_DIR opentelemetry-cpp::opentelemetry_common
+ INTERFACE_INCLUDE_DIRECTORIES)
+ include_directories(SYSTEM ${OPENTELEMETRY_INCLUDE_DIR})
+ message(STATUS "Found OpenTelemetry headers: ${OPENTELEMETRY_INCLUDE_DIR}")
+diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
+index b984bc1..2c78cd9 100644
+--- a/cpp/src/arrow/CMakeLists.txt
++++ b/cpp/src/arrow/CMakeLists.txt
+@@ -323,10 +323,14 @@ set(ARROW_TESTING_SRCS
+
+ set(_allocator_dependencies "") # Empty list
+ if(ARROW_JEMALLOC)
+- list(APPEND _allocator_dependencies jemalloc_ep)
++ list(APPEND _allocator_dependencies jemalloc::jemalloc)
+ endif()
+ if(ARROW_MIMALLOC)
+- list(APPEND _allocator_dependencies mimalloc_ep)
++ if (TARGET mimalloc-static)
++ list(APPEND _allocator_dependencies mimalloc-static)
++ else()
++ list(APPEND _allocator_dependencies mimalloc)
++ endif()
+ endif()
+
+ if(_allocator_dependencies)
+diff --git a/cpp/src/arrow/flight/CMakeLists.txt b/cpp/src/arrow/flight/CMakeLists.txt
+index 2cf8c99..90ebb9a 100644
+--- a/cpp/src/arrow/flight/CMakeLists.txt
++++ b/cpp/src/arrow/flight/CMakeLists.txt
+@@ -17,6 +17,9 @@
+
+ add_custom_target(arrow_flight)
+
++# TODO: This is a temporary workaround. absl should be LINKED as TARGET.
++include_directories(SYSTEM ${absl_INCLUDE_DIR})
++
+ arrow_install_all_headers("arrow/flight")
+
+ set(ARROW_FLIGHT_LINK_LIBS gRPC::grpc++ ${ARROW_PROTOBUF_LIBPROTOBUF})
+diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc
+index 2dcfb01..0394c01 100644
+--- a/cpp/src/arrow/memory_pool.cc
++++ b/cpp/src/arrow/memory_pool.cc
+@@ -48,7 +48,7 @@
+ // Needed to support jemalloc 3 and 4
+ #define JEMALLOC_MANGLE
+ // Explicitly link to our version of jemalloc
+-#include "jemalloc_ep/dist/include/jemalloc/jemalloc.h"
++#include "jemalloc/jemalloc.h"
+ #endif
+
+ #ifdef ARROW_MIMALLOC
diff --git a/ci/conan/all/patches/8.0.0-0002-jemalloc.patch b/ci/conan/all/patches/8.0.0-0002-jemalloc.patch
deleted file mode 100644
index 99b92e3308f..00000000000
--- a/ci/conan/all/patches/8.0.0-0002-jemalloc.patch
+++ /dev/null
@@ -1,48 +0,0 @@
-MIT License
-
-Copyright (c) 2019 Conan.io
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt -index 690c51a..c518b7d 100644 ---- a/cpp/src/arrow/CMakeLists.txt -+++ b/cpp/src/arrow/CMakeLists.txt -@@ -326,7 +326,7 @@ set(ARROW_TESTING_SRCS - - set(_allocator_dependencies "") # Empty list - if(ARROW_JEMALLOC) -- list(APPEND _allocator_dependencies jemalloc_ep) -+ list(APPEND _allocator_dependencies jemalloc::jemalloc) - endif() - if(ARROW_MIMALLOC) - list(APPEND _allocator_dependencies mimalloc_ep) -diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc -index 2fab6f3..1f8f896 100644 ---- a/cpp/src/arrow/memory_pool.cc -+++ b/cpp/src/arrow/memory_pool.cc -@@ -52,7 +52,7 @@ - // Needed to support jemalloc 3 and 4 - #define JEMALLOC_MANGLE - // Explicitly link to our version of jemalloc --#include "jemalloc_ep/dist/include/jemalloc/jemalloc.h" -+#include "jemalloc/jemalloc.h" - #endif - - #ifdef ARROW_MIMALLOC diff --git a/ci/conan/all/patches/8.0.0-0004-use-find-package.patch b/ci/conan/all/patches/8.0.0-0004-use-find-package.patch deleted file mode 100644 index e7bc2320c76..00000000000 --- a/ci/conan/all/patches/8.0.0-0004-use-find-package.patch +++ /dev/null @@ -1,401 +0,0 @@ -MIT License - -Copyright (c) 2019 Conan.io - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt -index aba18c8..bb463d0 100644 ---- a/cpp/CMakeLists.txt -+++ b/cpp/CMakeLists.txt -@@ -721,7 +721,7 @@ if(ARROW_WITH_BZ2) - endif() - - if(ARROW_WITH_LZ4) -- list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4) -+ list(APPEND ARROW_STATIC_LINK_LIBS lz4::lz4) - if(Lz4_SOURCE STREQUAL "SYSTEM") - list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4) - endif() -@@ -907,8 +907,8 @@ endif() - if(ARROW_JEMALLOC) - add_definitions(-DARROW_JEMALLOC) - add_definitions(-DARROW_JEMALLOC_INCLUDE_DIR=${JEMALLOC_INCLUDE_DIR}) -- list(APPEND ARROW_LINK_LIBS jemalloc::jemalloc) -- list(APPEND ARROW_STATIC_LINK_LIBS jemalloc::jemalloc) -+ list(APPEND ARROW_LINK_LIBS jemalloc) -+ list(APPEND ARROW_STATIC_LINK_LIBS jemalloc) - endif() - - if(ARROW_MIMALLOC) -diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake -index f070323..2e2a03b 100644 ---- a/cpp/cmake_modules/ThirdpartyToolchain.cmake -+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake -@@ -974,6 +974,7 @@ else() - endif() - - if(ARROW_BOOST_REQUIRED) -+if(0) - resolve_dependency(Boost - HAVE_ALT - TRUE -@@ -982,6 +983,9 @@ if(ARROW_BOOST_REQUIRED) - IS_RUNTIME_DEPENDENCY - # libarrow.so doesn't depend on libboost*. - FALSE) -+else() -+ find_package(Boost REQUIRED CONFIG) -+endif() - - if(TARGET Boost::system) - set(BOOST_SYSTEM_LIBRARY Boost::system) -@@ -1059,6 +1063,7 @@ macro(build_snappy) - endmacro() - - if(ARROW_WITH_SNAPPY) -+if(0) - resolve_dependency(Snappy PC_PACKAGE_NAMES snappy) - if(${Snappy_SOURCE} STREQUAL "SYSTEM" AND NOT snappy_PC_FOUND) - get_target_property(SNAPPY_LIB Snappy::snappy IMPORTED_LOCATION) -@@ -1067,6 +1072,9 @@ if(ARROW_WITH_SNAPPY) - # TODO: Don't use global includes but rather target_include_directories - get_target_property(SNAPPY_INCLUDE_DIRS Snappy::snappy INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${SNAPPY_INCLUDE_DIRS}) -+else() -+ find_package(Snappy REQUIRED) -+endif() - endif() - - # ---------------------------------------------------------------------- -@@ -1129,7 +1137,7 @@ macro(build_brotli) - endmacro() - - if(ARROW_WITH_BROTLI) -- resolve_dependency(Brotli PC_PACKAGE_NAMES libbrotlidec libbrotlienc) -+ find_package(Brotli REQUIRED) - # TODO: Don't use global includes but rather target_include_directories - get_target_property(BROTLI_INCLUDE_DIR Brotli::brotlicommon - INTERFACE_INCLUDE_DIRECTORIES) -@@ -1169,8 +1177,16 @@ if(PARQUET_REQUIRE_ENCRYPTION - set(BUILD_SHARED_LIBS_KEEP ${BUILD_SHARED_LIBS}) - set(BUILD_SHARED_LIBS ON) - -- find_package(OpenSSL ${ARROW_OPENSSL_REQUIRED_VERSION} REQUIRED) -- set(BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS_KEEP}) -+ find_package(OpenSSL REQUIRED CONFIG) -+ message("OPENSSL_FOUND: ${OPENSSL_FOUND}") -+ message("OPENSSL_INCLUDE_DIR: ${OPENSSL_INCLUDE_DIR}") -+ message("OPENSSL_CRYPTO_LIBRARY: ${OPENSSL_CRYPTO_LIBRARY}") -+ message("OPENSSL_CRYPTO_LIBRARIES: ${OPENSSL_CRYPTO_LIBRARIES}") -+ message("OPENSSL_SSL_LIBRARY: ${OPENSSL_SSL_LIBRARY}") -+ message("OPENSSL_SSL_LIBRARIES: ${OPENSSL_SSL_LIBRARIES}") -+ message("OPENSSL_LIBRARIES: ${OPENSSL_LIBRARIES}") -+ message("OPENSSL_VERSION: ${OPENSSL_VERSION}") -+ set(BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS_KEEP}) - unset(BUILD_SHARED_LIBS_KEEP) - else() - # Find static OpenSSL headers and libs -@@ -1249,10 +1265,14 @@ macro(build_glog) - endmacro() - - if(ARROW_USE_GLOG) -+if(0) - resolve_dependency(GLOG PC_PACKAGE_NAMES libglog) - # TODO: Don't use global includes but rather target_include_directories - 
get_target_property(GLOG_INCLUDE_DIR glog::glog INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${GLOG_INCLUDE_DIR}) -+else() -+ find_package(glog REQUIRED) -+endif() - endif() - - # ---------------------------------------------------------------------- -@@ -1321,6 +1341,7 @@ macro(build_gflags) - endmacro() - - if(ARROW_NEED_GFLAGS) -+if(0) - set(ARROW_GFLAGS_REQUIRED_VERSION "2.1.0") - resolve_dependency(gflags - HAVE_ALT -@@ -1339,6 +1360,10 @@ if(ARROW_NEED_GFLAGS) - set(GFLAGS_LIBRARIES gflags_shared) - endif() - endif() -+else() -+ find_package(gflags REQUIRED) -+ set(GFLAGS_LIBRARIES gflags::gflags) -+endif() - endif() - - # ---------------------------------------------------------------------- -@@ -1718,6 +1756,7 @@ if(ARROW_JEMALLOC) - # installations. - # find_package(jemalloc) - -+if(0) - set(ARROW_JEMALLOC_USE_SHARED OFF) - set(JEMALLOC_PREFIX - "${CMAKE_CURRENT_BINARY_DIR}/jemalloc_ep-prefix/src/jemalloc_ep/dist/") -@@ -1778,6 +1817,9 @@ if(ARROW_JEMALLOC) - INTERFACE_INCLUDE_DIRECTORIES - "${CMAKE_CURRENT_BINARY_DIR}/jemalloc_ep-prefix/src") - add_dependencies(jemalloc::jemalloc jemalloc_ep) -+else() -+ find_package(jemalloc REQUIRED) -+endif() - - list(APPEND ARROW_BUNDLED_STATIC_LIBS jemalloc::jemalloc) - endif() -@@ -1786,6 +1828,7 @@ endif() - # mimalloc - Cross-platform high-performance allocator, from Microsoft - - if(ARROW_MIMALLOC) -+if(0) - message(STATUS "Building (vendored) mimalloc from source") - # We only use a vendored mimalloc as we want to control its build options. - -@@ -1834,6 +1877,10 @@ if(ARROW_MIMALLOC) - endif() - add_dependencies(mimalloc::mimalloc mimalloc_ep) - add_dependencies(toolchain mimalloc_ep) -+else() -+ find_package(mimalloc REQUIRED CONFIG) -+ add_dependencies(toolchain mimalloc::mimalloc) -+endif() - - list(APPEND ARROW_BUNDLED_STATIC_LIBS mimalloc::mimalloc) - endif() -@@ -2119,6 +2166,7 @@ macro(build_rapidjson) - endmacro() - - if(ARROW_WITH_RAPIDJSON) -+if(0) - set(ARROW_RAPIDJSON_REQUIRED_VERSION "1.1.0") - resolve_dependency(RapidJSON - HAVE_ALT -@@ -2131,6 +2179,10 @@ if(ARROW_WITH_RAPIDJSON) - if(RapidJSON_INCLUDE_DIR) - set(RAPIDJSON_INCLUDE_DIR "${RapidJSON_INCLUDE_DIR}") - endif() -+else() -+ find_package(RapidJSON REQUIRED) -+ set(RAPIDJSON_INCLUDE_DIR "${RapidJSON_INCLUDE_DIR}") -+endif() - - # TODO: Don't use global includes but rather target_include_directories - include_directories(SYSTEM ${RAPIDJSON_INCLUDE_DIR}) -@@ -2158,8 +2210,14 @@ endmacro() - - if((NOT ARROW_SIMD_LEVEL STREQUAL "NONE") OR (NOT ARROW_RUNTIME_SIMD_LEVEL STREQUAL "NONE" - )) -+if(0) - set(xsimd_SOURCE "BUNDLED") - resolve_dependency(xsimd) -+else() -+ find_package(xsimd) -+ set(XSIMD_INCLUDE_DIR "${xsimd_INCLUDE_DIR}") -+ add_dependencies(toolchain xsimd) -+endif() - # TODO: Don't use global includes but rather target_include_directories - include_directories(SYSTEM ${XSIMD_INCLUDE_DIR}) - endif() -@@ -2202,11 +2260,15 @@ macro(build_zlib) - endmacro() - - if(ARROW_WITH_ZLIB) -+if(0) - resolve_dependency(ZLIB PC_PACKAGE_NAMES zlib) - - # TODO: Don't use global includes but rather target_include_directories - get_target_property(ZLIB_INCLUDE_DIR ZLIB::ZLIB INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${ZLIB_INCLUDE_DIR}) -+else() -+ find_package(ZLIB REQUIRED) -+endif() - endif() - - macro(build_lz4) -@@ -2260,11 +2322,15 @@ macro(build_lz4) - endmacro() - - if(ARROW_WITH_LZ4) -+if(0) - resolve_dependency(Lz4 PC_PACKAGE_NAMES liblz4) - - # TODO: Don't use global includes but rather target_include_directories - 
get_target_property(LZ4_INCLUDE_DIR LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${LZ4_INCLUDE_DIR}) -+else() -+ find_package(lz4 REQUIRED) -+endif() - endif() - - macro(build_zstd) -@@ -2325,6 +2391,7 @@ macro(build_zstd) - endmacro() - - if(ARROW_WITH_ZSTD) -+if(0) - # ARROW-13384: ZSTD_minCLevel was added in v1.4.0, required by ARROW-13091 - resolve_dependency(zstd - PC_PACKAGE_NAMES -@@ -2352,6 +2419,9 @@ if(ARROW_WITH_ZSTD) - get_target_property(ZSTD_INCLUDE_DIR ${ARROW_ZSTD_LIBZSTD} - INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${ZSTD_INCLUDE_DIR}) -+else() -+ find_package(zstd REQUIRED) -+endif() - endif() - - # ---------------------------------------------------------------------- -@@ -2391,6 +2461,7 @@ macro(build_re2) - endmacro() - - if(ARROW_WITH_RE2) -+if(0) - # Don't specify "PC_PACKAGE_NAMES re2" here because re2.pc may - # include -std=c++11. It's not compatible with C source and C++ - # source not uses C++ 11. -@@ -2411,6 +2482,9 @@ if(ARROW_WITH_RE2) - # TODO: Don't use global includes but rather target_include_directories - get_target_property(RE2_INCLUDE_DIR re2::re2 INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${RE2_INCLUDE_DIR}) -+else() -+ find_package(re2 REQUIRED) -+endif() - endif() - - macro(build_bzip2) -@@ -2462,6 +2536,7 @@ macro(build_bzip2) - endmacro() - - if(ARROW_WITH_BZ2) -+if(0) - resolve_dependency(BZip2) - if(${BZip2_SOURCE} STREQUAL "SYSTEM") - string(APPEND ARROW_PC_LIBS_PRIVATE " ${BZIP2_LIBRARIES}") -@@ -2474,6 +2549,9 @@ if(ARROW_WITH_BZ2) - INTERFACE_INCLUDE_DIRECTORIES "${BZIP2_INCLUDE_DIR}") - endif() - include_directories(SYSTEM "${BZIP2_INCLUDE_DIR}") -+else() -+ find_package(BZip2 REQUIRED) -+endif() - endif() - - macro(build_utf8proc) -@@ -2517,6 +2595,7 @@ macro(build_utf8proc) - endmacro() - - if(ARROW_WITH_UTF8PROC) -+if(0) - resolve_dependency(utf8proc - REQUIRED_VERSION - "2.2.0" -@@ -2538,6 +2617,10 @@ if(ARROW_WITH_UTF8PROC) - get_target_property(UTF8PROC_INCLUDE_DIR utf8proc::utf8proc - INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${UTF8PROC_INCLUDE_DIR}) -+else() -+ find_package(utf8proc REQUIRED CONFIG) -+ add_definitions(-DARROW_WITH_UTF8PROC) -+endif() - endif() - - macro(build_cares) -@@ -3702,6 +3785,7 @@ macro(build_grpc) - endmacro() - - if(ARROW_WITH_GRPC) -+if(0) - set(ARROW_GRPC_REQUIRED_VERSION "1.17.0") - if(NOT Protobuf_SOURCE STREQUAL gRPC_SOURCE) - # ARROW-15495: Protobuf/gRPC must come from the same source -@@ -3735,6 +3819,9 @@ if(ARROW_WITH_GRPC) - message(FATAL_ERROR "Cannot find grpc++ headers in ${GRPC_INCLUDE_DIR}") - endif() - endif() -+else() -+ find_package(gRPC REQUIRED CONFIG) -+endif() - endif() - - # ---------------------------------------------------------------------- -@@ -3937,10 +4024,14 @@ macro(build_google_cloud_cpp_storage) - endmacro() - - if(ARROW_WITH_GOOGLE_CLOUD_CPP) -+if(0) - resolve_dependency(google_cloud_cpp_storage) - get_target_property(google_cloud_cpp_storage_INCLUDE_DIR google-cloud-cpp::storage - INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${google_cloud_cpp_storage_INCLUDE_DIR}) -+else() -+ find_package(google-cloud-cpp REQUIRED) -+endif() - get_target_property(absl_base_INCLUDE_DIR absl::base INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${absl_base_INCLUDE_DIR}) - message(STATUS "Found google-cloud-cpp::storage headers: ${google_cloud_cpp_storage_INCLUDE_DIR}" -@@ -4261,6 +4352,7 @@ macro(build_opentelemetry) - endmacro() - - if(ARROW_WITH_OPENTELEMETRY) -+if(0) - # cURL is 
required whether we build from source or use an existing installation - # (OTel's cmake files do not call find_curl for you) - find_curl() -@@ -4269,7 +4361,10 @@ if(ARROW_WITH_OPENTELEMETRY) - get_target_property(OPENTELEMETRY_INCLUDE_DIR opentelemetry-cpp::api - INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${OPENTELEMETRY_INCLUDE_DIR}) -- message(STATUS "Found OpenTelemetry headers: ${OPENTELEMETRY_INCLUDE_DIR}") -+else() -+ find_package(opentelemetry-cpp REQUIRED) -+endif() -+ message(STATUS "Found OpenTelemetry headers: ${OPENTELEMETRY_INCLUDE_DIR}") - endif() - - # ---------------------------------------------------------------------- -diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt -index c518b7d..40b4853 100644 ---- a/cpp/src/arrow/CMakeLists.txt -+++ b/cpp/src/arrow/CMakeLists.txt -@@ -584,6 +584,10 @@ foreach(LIB_TARGET ${ARROW_LIBRARIES}) - target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_EXPORTING) - endforeach() - -+if(ARROW_BUILD_SHARED AND WIN32) -+ target_compile_definitions(arrow_shared PRIVATE ARROW_EXPORTING) -+endif() -+ - if(ARROW_WITH_BACKTRACE) - find_package(Backtrace) - -@@ -594,7 +598,7 @@ if(ARROW_WITH_BACKTRACE) - endforeach() - endif() - --if(ARROW_BUILD_BUNDLED_DEPENDENCIES) -+if(0) - arrow_car(_FIRST_LIB ${ARROW_BUNDLED_STATIC_LIBS}) - arrow_cdr(_OTHER_LIBS ${ARROW_BUNDLED_STATIC_LIBS}) - create_merged_static_lib(arrow_bundled_dependencies diff --git a/ci/conan/all/patches/8.0.0-0006-fix-cmake.patch b/ci/conan/all/patches/8.0.0-0006-fix-cmake.patch new file mode 100644 index 00000000000..7be516e1b48 --- /dev/null +++ b/ci/conan/all/patches/8.0.0-0006-fix-cmake.patch @@ -0,0 +1,447 @@ +MIT License + +Copyright (c) 2019 Conan.io + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt +index bb463d0..ce2d1df 100644 +--- a/cpp/CMakeLists.txt ++++ b/cpp/CMakeLists.txt +@@ -705,7 +705,7 @@ endif() + + if(ARROW_WITH_BROTLI) + # Order is important for static linking +- set(ARROW_BROTLI_LIBS Brotli::brotlienc Brotli::brotlidec Brotli::brotlicommon) ++ set(ARROW_BROTLI_LIBS brotli::brotlienc brotli::brotlidec brotli::brotlicommon) + list(APPEND ARROW_LINK_LIBS ${ARROW_BROTLI_LIBS}) + list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_BROTLI_LIBS}) + if(Brotli_SOURCE STREQUAL "SYSTEM") +@@ -721,11 +721,18 @@ if(ARROW_WITH_BZ2) + endif() + + if(ARROW_WITH_LZ4) +- list(APPEND ARROW_STATIC_LINK_LIBS lz4::lz4) +- if(Lz4_SOURCE STREQUAL "SYSTEM") +- list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4) ++ if (TARGET LZ4::lz4_static) ++ list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4_static) ++ if(Lz4_SOURCE STREQUAL "SYSTEM") ++ list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4_static) ++ endif() ++ else() ++ list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4_shared) ++ if(Lz4_SOURCE STREQUAL "SYSTEM") ++ list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4_shared) + endif() + endif() ++endif() + + if(ARROW_WITH_SNAPPY) + list(APPEND ARROW_STATIC_LINK_LIBS Snappy::snappy) +@@ -913,8 +920,13 @@ endif() + + if(ARROW_MIMALLOC) + add_definitions(-DARROW_MIMALLOC) +- list(APPEND ARROW_LINK_LIBS mimalloc::mimalloc) +- list(APPEND ARROW_STATIC_LINK_LIBS mimalloc::mimalloc) ++ if (TARGET mimalloc-static) ++ list(APPEND ARROW_LINK_LIBS mimalloc-static) ++ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc-static) ++ else() ++ list(APPEND ARROW_LINK_LIBS mimalloc) ++ list(APPEND ARROW_STATIC_LINK_LIBS mimalloc) ++ endif() + endif() + + # ---------------------------------------------------------------------- +diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake +index f070323..16faf73 100644 +--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake ++++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake +@@ -959,6 +959,7 @@ endif() + # - Tests need Boost at runtime. + # - S3FS and Flight benchmarks need Boost at runtime. 
+ if(ARROW_BUILD_INTEGRATION ++ OR ARROW_BOOST_REQUIRED + OR ARROW_BUILD_TESTS + OR (ARROW_FLIGHT AND ARROW_BUILD_BENCHMARKS) + OR (ARROW_S3 AND ARROW_BUILD_BENCHMARKS)) +@@ -975,7 +976,7 @@ endif() + + if(ARROW_BOOST_REQUIRED) + resolve_dependency(Boost +- HAVE_ALT ++ USE_CONFIG + TRUE + REQUIRED_VERSION + ${ARROW_BOOST_REQUIRED_VERSION} +@@ -986,7 +987,7 @@ if(ARROW_BOOST_REQUIRED) + if(TARGET Boost::system) + set(BOOST_SYSTEM_LIBRARY Boost::system) + set(BOOST_FILESYSTEM_LIBRARY Boost::filesystem) +- elseif(BoostAlt_FOUND) ++ elseif(Boost_FOUND) + set(BOOST_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY}) + set(BOOST_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY}) + else() +@@ -1129,9 +1130,9 @@ macro(build_brotli) + endmacro() + + if(ARROW_WITH_BROTLI) +- resolve_dependency(Brotli PC_PACKAGE_NAMES libbrotlidec libbrotlienc) ++ resolve_dependency(brotli PC_PACKAGE_NAMES libbrotlidec libbrotlienc) + # TODO: Don't use global includes but rather target_include_directories +- get_target_property(BROTLI_INCLUDE_DIR Brotli::brotlicommon ++ get_target_property(BROTLI_INCLUDE_DIR brotli::brotlicommon + INTERFACE_INCLUDE_DIRECTORIES) + include_directories(SYSTEM ${BROTLI_INCLUDE_DIR}) + endif() +@@ -1323,22 +1324,16 @@ endmacro() + if(ARROW_NEED_GFLAGS) + set(ARROW_GFLAGS_REQUIRED_VERSION "2.1.0") + resolve_dependency(gflags +- HAVE_ALT ++ USE_CONFIG + TRUE + REQUIRED_VERSION + ${ARROW_GFLAGS_REQUIRED_VERSION} + IS_RUNTIME_DEPENDENCY + FALSE) + # TODO: Don't use global includes but rather target_include_directories +- include_directories(SYSTEM ${GFLAGS_INCLUDE_DIR}) +- +- if(NOT TARGET ${GFLAGS_LIBRARIES}) +- if(TARGET gflags-shared) +- set(GFLAGS_LIBRARIES gflags-shared) +- elseif(TARGET gflags_shared) +- set(GFLAGS_LIBRARIES gflags_shared) +- endif() +- endif() ++ include_directories(SYSTEM ${gflags_INCLUDE_DIR}) ++ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${gflags_LIBRARIES_TARGETS}) ++ set(GFLAGS_LIBRARIES gflags::gflags) + endif() + + # ---------------------------------------------------------------------- +@@ -1432,9 +1427,9 @@ if(ARROW_WITH_THRIFT) + thrift) + endif() + # TODO: Don't use global includes but rather target_include_directories +- include_directories(SYSTEM ${THRIFT_INCLUDE_DIR}) ++ include_directories(SYSTEM ${Thrift_INCLUDE_DIR}) + +- string(REPLACE "." ";" VERSION_LIST ${THRIFT_VERSION}) ++ string(REPLACE "." 
";" VERSION_LIST ${Thrift_VERSION}) + list(GET VERSION_LIST 0 THRIFT_VERSION_MAJOR) + list(GET VERSION_LIST 1 THRIFT_VERSION_MINOR) + list(GET VERSION_LIST 2 THRIFT_VERSION_PATCH) +@@ -1557,6 +1552,7 @@ if(ARROW_WITH_PROTOBUF) + set(ARROW_PROTOBUF_REQUIRED_VERSION "2.6.1") + endif() + resolve_dependency(Protobuf ++ USE_CONFIG + REQUIRED_VERSION + ${ARROW_PROTOBUF_REQUIRED_VERSION} + PC_PACKAGE_NAMES +@@ -1567,7 +1563,7 @@ if(ARROW_WITH_PROTOBUF) + endif() + + # TODO: Don't use global includes but rather target_include_directories +- include_directories(SYSTEM ${PROTOBUF_INCLUDE_DIR}) ++ include_directories(SYSTEM ${protobuf_INCLUDE_DIR}) + + if(TARGET arrow::protobuf::libprotobuf) + set(ARROW_PROTOBUF_LIBPROTOBUF arrow::protobuf::libprotobuf) +@@ -1576,9 +1572,9 @@ if(ARROW_WITH_PROTOBUF) + if(NOT TARGET protobuf::libprotobuf) + add_library(protobuf::libprotobuf UNKNOWN IMPORTED) + set_target_properties(protobuf::libprotobuf +- PROPERTIES IMPORTED_LOCATION "${PROTOBUF_LIBRARY}" ++ PROPERTIES IMPORTED_LOCATION "${Protobuf_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES +- "${PROTOBUF_INCLUDE_DIR}") ++ "${Protobuf_INCLUDE_DIR}") + endif() + set(ARROW_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf) + endif() +@@ -1598,7 +1594,7 @@ if(ARROW_WITH_PROTOBUF) + set_target_properties(protobuf::libprotoc + PROPERTIES IMPORTED_LOCATION "${Protobuf_PROTOC_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES +- "${PROTOBUF_INCLUDE_DIR}") ++ "${Protobuf_INCLUDE_DIR}") + endif() + set(ARROW_PROTOBUF_LIBPROTOC protobuf::libprotoc) + endif() +@@ -1690,11 +1686,12 @@ macro(build_substrait) + + add_custom_target(substrait_gen ALL DEPENDS ${SUBSTRAIT_PROTO_GEN_ALL}) + +- set(SUBSTRAIT_INCLUDES ${SUBSTRAIT_CPP_DIR} ${PROTOBUF_INCLUDE_DIR}) ++ set(SUBSTRAIT_INCLUDES ${SUBSTRAIT_CPP_DIR} ${protobuf_INCLUDE_DIR}) + + add_library(substrait STATIC ${SUBSTRAIT_SOURCES}) + set_target_properties(substrait PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_include_directories(substrait PUBLIC ${SUBSTRAIT_INCLUDES}) ++ target_include_directories(substrait PUBLIC ${PROTOBUF_INCLUDE_DIR}) + target_link_libraries(substrait INTERFACE ${ARROW_PROTOBUF_LIBPROTOBUF}) + add_dependencies(substrait substrait_gen) + +@@ -1711,6 +1708,7 @@ endif() + # jemalloc - Unix-only high-performance allocator + + if(ARROW_JEMALLOC) ++if(0) + message(STATUS "Building (vendored) jemalloc from source") + # We only use a vendored jemalloc as we want to control its version. + # Also our build of jemalloc is specially prefixed so that it will not +@@ -1780,12 +1778,18 @@ if(ARROW_JEMALLOC) + add_dependencies(jemalloc::jemalloc jemalloc_ep) + + list(APPEND ARROW_BUNDLED_STATIC_LIBS jemalloc::jemalloc) ++else() ++ find_package(jemalloc REQUIRED CONFIG) ++ include_directories(SYSTEM "${jemalloc_INCLUDE_DIR}") ++ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${jemalloc_LIBRARIES_TARGETS}) ++endif() + endif() + + # ---------------------------------------------------------------------- + # mimalloc - Cross-platform high-performance allocator, from Microsoft + + if(ARROW_MIMALLOC) ++if(0) + message(STATUS "Building (vendored) mimalloc from source") + # We only use a vendored mimalloc as we want to control its build options. 
+ +@@ -1836,6 +1840,11 @@ if(ARROW_MIMALLOC) + add_dependencies(toolchain mimalloc_ep) + + list(APPEND ARROW_BUNDLED_STATIC_LIBS mimalloc::mimalloc) ++else() ++ find_package(mimalloc REQUIRED CONFIG) ++ include_directories(SYSTEM "${mimalloc_INCLUDE_DIR}") ++ list(APPEND ARROW_BUNDLED_STATIC_LIBS ${mimalloc_LIBRARIES_TARGETS} ) ++endif() + endif() + + # ---------------------------------------------------------------------- +@@ -2121,7 +2130,7 @@ endmacro() + if(ARROW_WITH_RAPIDJSON) + set(ARROW_RAPIDJSON_REQUIRED_VERSION "1.1.0") + resolve_dependency(RapidJSON +- HAVE_ALT ++ USE_CONFIG + TRUE + REQUIRED_VERSION + ${ARROW_RAPIDJSON_REQUIRED_VERSION} +@@ -2158,10 +2167,10 @@ endmacro() + + if((NOT ARROW_SIMD_LEVEL STREQUAL "NONE") OR (NOT ARROW_RUNTIME_SIMD_LEVEL STREQUAL "NONE" + )) +- set(xsimd_SOURCE "BUNDLED") ++ set(xsimd_SOURCE "SYSTEM") + resolve_dependency(xsimd) + # TODO: Don't use global includes but rather target_include_directories +- include_directories(SYSTEM ${XSIMD_INCLUDE_DIR}) ++ include_directories(SYSTEM ${xsimd_INCLUDE_DIR}) + endif() + + macro(build_zlib) +@@ -2260,10 +2269,14 @@ macro(build_lz4) + endmacro() + + if(ARROW_WITH_LZ4) +- resolve_dependency(Lz4 PC_PACKAGE_NAMES liblz4) ++ resolve_dependency(Lz4) + + # TODO: Don't use global includes but rather target_include_directories +- get_target_property(LZ4_INCLUDE_DIR LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) ++ if (TARGET LZ4::lz4_static) ++ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_static INTERFACE_INCLUDE_DIRECTORIES) ++ else() ++ get_target_property(LZ4_INCLUDE_DIR LZ4::lz4_shared INTERFACE_INCLUDE_DIRECTORIES) ++ endif() + include_directories(SYSTEM ${LZ4_INCLUDE_DIR}) + endif() + +@@ -2394,7 +2407,7 @@ if(ARROW_WITH_RE2) + # Don't specify "PC_PACKAGE_NAMES re2" here because re2.pc may + # include -std=c++11. It's not compatible with C source and C++ + # source not uses C++ 11. +- resolve_dependency(re2 HAVE_ALT TRUE) ++ resolve_dependency(re2 USE_CONFIG TRUE) + if(${re2_SOURCE} STREQUAL "SYSTEM") + get_target_property(RE2_LIB re2::re2 IMPORTED_LOCATION_${UPPERCASE_BUILD_TYPE}) + if(NOT RE2_LIB) +@@ -2464,7 +2477,7 @@ endmacro() + if(ARROW_WITH_BZ2) + resolve_dependency(BZip2) + if(${BZip2_SOURCE} STREQUAL "SYSTEM") +- string(APPEND ARROW_PC_LIBS_PRIVATE " ${BZIP2_LIBRARIES}") ++ string(APPEND ARROW_PC_LIBS_PRIVATE " ${BZip2_LIBRARIES}") + endif() + + if(NOT TARGET BZip2::BZip2) +@@ -2473,7 +2486,7 @@ if(ARROW_WITH_BZ2) + PROPERTIES IMPORTED_LOCATION "${BZIP2_LIBRARIES}" + INTERFACE_INCLUDE_DIRECTORIES "${BZIP2_INCLUDE_DIR}") + endif() +- include_directories(SYSTEM "${BZIP2_INCLUDE_DIR}") ++ include_directories(SYSTEM "${BZip2_INCLUDE_DIR}") + endif() + + macro(build_utf8proc) +@@ -3709,7 +3722,7 @@ if(ARROW_WITH_GRPC) + set(gRPC_SOURCE "${Protobuf_SOURCE}") + endif() + resolve_dependency(gRPC +- HAVE_ALT ++ USE_CONFIG + TRUE + REQUIRED_VERSION + ${ARROW_GRPC_REQUIRED_VERSION} +@@ -3727,9 +3740,9 @@ if(ARROW_WITH_GRPC) + else() + # grpc++ headers may reside in ${GRPC_INCLUDE_DIR}/grpc++ or ${GRPC_INCLUDE_DIR}/grpcpp + # depending on the gRPC version. 
+- if(EXISTS "${GRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h") ++ if(EXISTS ${gRPC_INCLUDE_DIR}/grpcpp/impl/codegen/config_protobuf.h) + set(GRPCPP_PP_INCLUDE TRUE) +- elseif(EXISTS "${GRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h") ++ elseif(EXISTS ${gRPC_INCLUDE_DIR}/grpc++/impl/codegen/config_protobuf.h) + set(GRPCPP_PP_INCLUDE FALSE) + else() + message(FATAL_ERROR "Cannot find grpc++ headers in ${GRPC_INCLUDE_DIR}") +@@ -3937,7 +3950,7 @@ macro(build_google_cloud_cpp_storage) + endmacro() + + if(ARROW_WITH_GOOGLE_CLOUD_CPP) +- resolve_dependency(google_cloud_cpp_storage) ++ resolve_dependency(google_cloud_cpp) + get_target_property(google_cloud_cpp_storage_INCLUDE_DIR google-cloud-cpp::storage + INTERFACE_INCLUDE_DIRECTORIES) + include_directories(SYSTEM ${google_cloud_cpp_storage_INCLUDE_DIR}) +@@ -4264,9 +4277,9 @@ if(ARROW_WITH_OPENTELEMETRY) + # cURL is required whether we build from source or use an existing installation + # (OTel's cmake files do not call find_curl for you) + find_curl() +- set(opentelemetry-cpp_SOURCE "AUTO") ++ set(opentelemetry-cpp_SOURCE "SYSTEM") + resolve_dependency(opentelemetry-cpp) +- get_target_property(OPENTELEMETRY_INCLUDE_DIR opentelemetry-cpp::api ++ get_target_property(OPENTELEMETRY_INCLUDE_DIR opentelemetry-cpp::opentelemetry_common + INTERFACE_INCLUDE_DIRECTORIES) + include_directories(SYSTEM ${OPENTELEMETRY_INCLUDE_DIR}) + message(STATUS "Found OpenTelemetry headers: ${OPENTELEMETRY_INCLUDE_DIR}") +diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt +index 690c51a..752f3b9 100644 +--- a/cpp/src/arrow/CMakeLists.txt ++++ b/cpp/src/arrow/CMakeLists.txt +@@ -326,10 +326,14 @@ set(ARROW_TESTING_SRCS + + set(_allocator_dependencies "") # Empty list + if(ARROW_JEMALLOC) +- list(APPEND _allocator_dependencies jemalloc_ep) ++ list(APPEND _allocator_dependencies jemalloc::jemalloc) + endif() + if(ARROW_MIMALLOC) +- list(APPEND _allocator_dependencies mimalloc_ep) ++ if (TARGET mimalloc-static) ++ list(APPEND _allocator_dependencies mimalloc-static) ++ else() ++ list(APPEND _allocator_dependencies mimalloc) ++ endif() + endif() + + if(_allocator_dependencies) +diff --git a/cpp/src/arrow/flight/CMakeLists.txt b/cpp/src/arrow/flight/CMakeLists.txt +index f9d1356..c9bcf79 100644 +--- a/cpp/src/arrow/flight/CMakeLists.txt ++++ b/cpp/src/arrow/flight/CMakeLists.txt +@@ -17,6 +17,9 @@ + + add_custom_target(arrow_flight) + ++# TODO: This is a temporary workaround. absl should be LINKED as TARGET. 
++include_directories(SYSTEM ${absl_INCLUDE_DIR})
++
+ arrow_install_all_headers("arrow/flight")
+
+ set(ARROW_FLIGHT_LINK_LIBS gRPC::grpc++ ${ARROW_PROTOBUF_LIBPROTOBUF})
+diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc
+index ed1c2d8..37a89da 100644
+--- a/cpp/src/arrow/memory_pool.cc
++++ b/cpp/src/arrow/memory_pool.cc
+@@ -52,7 +52,7 @@
+ // Needed to support jemalloc 3 and 4
+ #define JEMALLOC_MANGLE
+ // Explicitly link to our version of jemalloc
+-#include "jemalloc_ep/dist/include/jemalloc/jemalloc.h"
++#include "jemalloc/jemalloc.h"
+ #endif
+
+ #ifdef ARROW_MIMALLOC
+diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt
+index 71faf9a..3aabea1 100644
+--- a/cpp/src/gandiva/CMakeLists.txt
++++ b/cpp/src/gandiva/CMakeLists.txt
+@@ -25,7 +25,7 @@ add_custom_target(gandiva-benchmarks)
+
+ add_dependencies(gandiva-all gandiva gandiva-tests gandiva-benchmarks)
+
+-find_package(LLVMAlt REQUIRED)
++find_package(LLVM REQUIRED)
+
+ if(LLVM_VERSION_MAJOR LESS "10")
+ set(GANDIVA_CXX_STANDARD ${CMAKE_CXX_STANDARD})
+@@ -40,7 +40,7 @@ endif()
+
+ add_definitions(-DGANDIVA_LLVM_VERSION=${LLVM_VERSION_MAJOR})
+
+-find_package(OpenSSLAlt REQUIRED)
++find_package(OpenSSL REQUIRED)
+
+ # Set the path where the bitcode file generated, see precompiled/CMakeLists.txt
+ set(GANDIVA_PRECOMPILED_BC_PATH "${CMAKE_CURRENT_BINARY_DIR}/irhelpers.bc")
+@@ -98,10 +98,11 @@ set(SRC_FILES
+ random_generator_holder.cc
+ ${GANDIVA_PRECOMPILED_CC_PATH})
+
+-set(GANDIVA_SHARED_PRIVATE_LINK_LIBS arrow_shared LLVM::LLVM_INTERFACE
+- ${GANDIVA_OPENSSL_LIBS})
++set(GANDIVA_SHARED_PRIVATE_LINK_LIBS arrow_shared llvm-core::llvm-core
++ ${GANDIVA_OPENSSL_LIBS})
++
++set(GANDIVA_STATIC_LINK_LIBS arrow_static llvm-core::llvm-core ${GANDIVA_OPENSSL_LIBS})
+
+-set(GANDIVA_STATIC_LINK_LIBS arrow_static LLVM::LLVM_INTERFACE ${GANDIVA_OPENSSL_LIBS})
+
+ if(ARROW_GANDIVA_STATIC_LIBSTDCPP AND (CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX
+ ))
+@@ -139,7 +140,7 @@ add_arrow_lib(gandiva
+ arrow_dependencies
+ precompiled
+ EXTRA_INCLUDES
+- $
++ $
+ ${GANDIVA_OPENSSL_INCLUDE_DIR}
+ ${UTF8PROC_INCLUDE_DIR}
+ SHARED_LINK_FLAGS
diff --git a/ci/conan/all/test_package/CMakeLists.txt b/ci/conan/all/test_package/CMakeLists.txt
index 0df89423c14..18761d0f52c 100644
--- a/ci/conan/all/test_package/CMakeLists.txt
+++ b/ci/conan/all/test_package/CMakeLists.txt
@@ -21,14 +21,14 @@ # SOFTWARE.
 cmake_minimum_required(VERSION 3.8)
-project(test_package)
+project(test_package LANGUAGES CXX)
-include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake)
-conan_basic_setup()
-
-find_package(Arrow REQUIRED)
+find_package(Arrow REQUIRED CONFIG)
 add_executable(${PROJECT_NAME} test_package.cpp)
-target_link_libraries(${PROJECT_NAME} arrow::arrow)
-target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_11)
-target_compile_definitions(${PROJECT_NAME} PRIVATE WITH_JEMALLOC)
+target_link_libraries(${PROJECT_NAME} PRIVATE arrow::arrow)
+if (${Arrow_VERSION} VERSION_LESS "10.0.0")
+ target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_11)
+else()
+ target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
+endif()
diff --git a/ci/conan/all/test_package/conanfile.py b/ci/conan/all/test_package/conanfile.py
index 8bfa021563a..ce24052acb4 100644
--- a/ci/conan/all/test_package/conanfile.py
+++ b/ci/conan/all/test_package/conanfile.py
@@ -20,13 +20,23 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
-from conans import ConanFile, CMake, tools +from conan import ConanFile +from conan.tools.build import can_run +from conan.tools.cmake import cmake_layout, CMake import os +# It will become the standard on Conan 2.x class TestPackageConan(ConanFile): - settings = "os", "compiler", "build_type", "arch" - generators = "cmake", "cmake_find_package" + settings = "os", "arch", "compiler", "build_type" + generators = "CMakeDeps", "CMakeToolchain", "VirtualRunEnv" + test_type = "explicit" + + def requirements(self): + self.requires(self.tested_reference_str) + + def layout(self): + cmake_layout(self) def build(self): cmake = CMake(self) @@ -34,6 +44,6 @@ def build(self): cmake.build() def test(self): - if not tools.cross_building(self): - bin_path = os.path.join("bin", "test_package") - self.run(bin_path, run_environment=True) + if can_run(self): + bin_path = os.path.join(self.cpp.build.bindirs[0], "test_package") + self.run(bin_path, env="conanrun") diff --git a/ci/conan/all/CMakeLists.txt b/ci/conan/all/test_v1_package/CMakeLists.txt similarity index 83% rename from ci/conan/all/CMakeLists.txt rename to ci/conan/all/test_v1_package/CMakeLists.txt index cb849a6633c..faf547dec70 100644 --- a/ci/conan/all/CMakeLists.txt +++ b/ci/conan/all/test_v1_package/CMakeLists.txt @@ -21,9 +21,11 @@ # SOFTWARE. cmake_minimum_required(VERSION 3.1) -project(cmake_wrapper) -include(conanbuildinfo.cmake) -conan_basic_setup() +project(test_package) -add_subdirectory(source_subfolder/cpp) +include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake) +conan_basic_setup(TARGETS) + +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../test_package/ + ${CMAKE_CURRENT_BINARY_DIR}/test_package/) diff --git a/ci/conan/all/test_v1_package/conanfile.py b/ci/conan/all/test_v1_package/conanfile.py new file mode 100644 index 00000000000..4f5cc2b6101 --- /dev/null +++ b/ci/conan/all/test_v1_package/conanfile.py @@ -0,0 +1,40 @@ +# MIT License +# +# Copyright (c) 2019 Conan.io +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +from conans import ConanFile, CMake +from conan.tools.build import cross_building +import os + + +class TestPackageV1Conan(ConanFile): + settings = "os", "arch", "compiler", "build_type" + generators = "cmake", "cmake_find_package_multi" + + def build(self): + cmake = CMake(self) + cmake.configure() + cmake.build() + + def test(self): + if not cross_building(self): + bin_path = os.path.join("bin", "test_package") + self.run(bin_path, run_environment=True) diff --git a/ci/conan/config.yml b/ci/conan/config.yml index b38fe36930c..4e0975bc59e 100644 --- a/ci/conan/config.yml +++ b/ci/conan/config.yml @@ -21,6 +21,8 @@ # SOFTWARE. versions: + "10.0.0": + folder: all "8.0.1": folder: all "8.0.0": diff --git a/ci/conan/merge_status.sh b/ci/conan/merge_status.sh index daed3b81eb6..862d27ee3c5 100644 --- a/ci/conan/merge_status.sh +++ b/ci/conan/merge_status.sh @@ -15,4 +15,4 @@ # specific language governing permissions and limitations # under the License. -UPSTREAM_REVISION=232a32d832f9754b81dde348e8fd8ded37ad404b +UPSTREAM_REVISION=5c8f8538e32edd0911fd70710ce2d188bcd409f2 diff --git a/ci/conan/merge_upstream.sh b/ci/conan/merge_upstream.sh index 3d2c3334250..76af58f70df 100755 --- a/ci/conan/merge_upstream.sh +++ b/ci/conan/merge_upstream.sh @@ -37,7 +37,7 @@ git \ diff \ ${UPSTREAM_REVISION}..${UPSTREAM_HEAD} \ recipes/arrow | \ - (cd "${source_dir}" && patch -p3) + (cd "${source_dir}" && patch -p3 || :) sed \ -i.bak \ diff --git a/ci/conda_env_cpp.txt b/ci/conda_env_cpp.txt index dd313f19d70..4ca76a72158 100644 --- a/ci/conda_env_cpp.txt +++ b/ci/conda_env_cpp.txt @@ -15,28 +15,26 @@ # specific language governing permissions and limitations # under the License. -aws-sdk-cpp=1.8.186 +aws-sdk-cpp=1.10.13 benchmark>=1.6.0 boost-cpp>=1.68.0 brotli bzip2 c-ares cmake +flatbuffers gflags glog gmock>=1.10.0 google-cloud-cpp>=1.34.0 -# 1.45.0 appears to segfault on Windows/AppVeyor -grpc-cpp>=1.27.3,<1.45.0 +grpc-cpp gtest>=1.10.0 libprotobuf libutf8proc lz4-c make ninja -# Required by google-cloud-cpp, the Conda package is missing the dependency: -# https://github.com/conda-forge/google-cloud-cpp-feedstock/issues/28 -nlohmann_json +orc pkg-config python rapidjson @@ -46,4 +44,3 @@ thrift-cpp>=0.11.0 xsimd zlib zstd -flatbuffers diff --git a/ci/conda_env_gandiva_win.txt b/ci/conda_env_gandiva_win.txt index 9098b53d1f5..086ad97d947 100644 --- a/ci/conda_env_gandiva_win.txt +++ b/ci/conda_env_gandiva_win.txt @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. -# llvmdev=9 or later require Visual Studio 2017 -clangdev=8 -llvmdev=8 +# ARROW-17830 Temporarily pin LLVM version on Appveyor due to a bug in Conda's packaging of LLVM 15. 
+clangdev<15 +llvmdev<15 diff --git a/ci/docker/almalinux-8-verify-rc.dockerfile b/ci/docker/almalinux-8-verify-rc.dockerfile index 94e8a1133db..e9544e6becc 100644 --- a/ci/docker/almalinux-8-verify-rc.dockerfile +++ b/ci/docker/almalinux-8-verify-rc.dockerfile @@ -18,40 +18,7 @@ ARG arch=amd64 FROM ${arch}/almalinux:8 -# A script to install dependencies required for release -# verification Red Hat Enterprise Linux 8 clones in particular -# on AlmaLinux 8 and Rocky Linux 8 - -RUN dnf -y install 'dnf-command(config-manager)' && \ - dnf config-manager --set-enabled powertools && \ - dnf -y update && \ - dnf -y module disable nodejs && \ - dnf -y module enable nodejs:16 && \ - dnf -y module disable ruby && \ - dnf -y module enable ruby:2.7 && \ - dnf -y groupinstall "Development Tools" && \ - dnf -y install \ - cmake \ - git \ - gobject-introspection-devel \ - java-1.8.0-openjdk-devel \ - libcurl-devel \ - llvm-devel \ - llvm-toolset \ - maven \ - ncurses-devel \ - ninja-build \ - nodejs \ - openssl-devel \ - python38-devel \ - python38-pip \ - ruby-devel \ - sqlite-devel \ - wget \ - which && \ +COPY dev/release/setup-rhel-rebuilds.sh / +RUN /setup-rhel-rebuilds.sh && \ + rm /setup-rhel-rebuilds.sh && \ dnf -y clean all - -RUN python3 -m pip install -U pip && \ - alternatives --set python /usr/bin/python3 - -RUN npm install -g yarn diff --git a/ci/docker/alpine-linux-3.16-cpp.dockerfile b/ci/docker/alpine-linux-3.16-cpp.dockerfile new file mode 100644 index 00000000000..4e25a9cbdbb --- /dev/null +++ b/ci/docker/alpine-linux-3.16-cpp.dockerfile @@ -0,0 +1,101 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +ARG arch=amd64 +FROM ${arch}/alpine:3.16 + +RUN apk add \ + bash \ + benchmark-dev \ + boost-dev \ + brotli-dev \ + bzip2-dev \ + c-ares-dev \ + ccache \ + clang \ + cmake \ + curl-dev \ + g++ \ + gcc \ + gdb \ + gflags-dev \ + git \ + glog-dev \ + gmock \ + grpc-dev \ + gtest-dev \ + libxml2-dev \ + llvm13-dev \ + llvm13-static \ + lz4-dev \ + make \ + musl-locales \ + nlohmann-json \ + openssl-dev \ + perl \ + pkgconfig \ + protobuf-dev \ + py3-pip \ + py3-numpy-dev \ + python3-dev \ + rapidjson-dev \ + re2-dev \ + rsync \ + samurai \ + snappy-dev \ + sqlite-dev \ + thrift-dev \ + tzdata \ + utf8proc-dev \ + zlib-dev \ + zstd-dev && \ + rm -rf /var/cache/apk/* && \ + ln -s /usr/share/zoneinfo/Etc/UTC /etc/localtime && \ + echo "Etc/UTC" > /etc/timezone + +COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_minio.sh latest /usr/local + +COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_gcs_testbench.sh default + +ENV ARROW_BUILD_TESTS=ON \ + ARROW_DATASET=ON \ + ARROW_DEPENDENCY_SOURCE=SYSTEM \ + ARROW_FLIGHT=ON \ + ARROW_FLIGHT_SQL=ON \ + ARROW_GANDIVA=ON \ + ARROW_GCS=ON \ + ARROW_HOME=/usr/local \ + ARROW_ORC=ON \ + ARROW_PARQUET=ON \ + ARROW_PLASMA=ON \ + ARROW_S3=ON \ + ARROW_USE_CCACHE=ON \ + ARROW_WITH_BROTLI=ON \ + ARROW_WITH_BZ2=ON \ + ARROW_WITH_LZ4=ON \ + ARROW_WITH_OPENTELEMETRY=OFF \ + ARROW_WITH_MUSL=ON \ + ARROW_WITH_SNAPPY=ON \ + ARROW_WITH_ZLIB=ON \ + ARROW_WITH_ZSTD=ON \ + AWSSDK_SOURCE=BUNDLED \ + google_cloud_cpp_storage_SOURCE=BUNDLED \ + ORC_SOURCE=BUNDLED \ + PATH=/usr/lib/ccache/:$PATH \ + xsimd_SOURCE=BUNDLED diff --git a/ci/docker/centos-7-cpp.dockerfile b/ci/docker/centos-7-cpp.dockerfile index 09a3234e3f8..f4e0430aad6 100644 --- a/ci/docker/centos-7-cpp.dockerfile +++ b/ci/docker/centos-7-cpp.dockerfile @@ -18,21 +18,27 @@ FROM centos:centos7 RUN yum install -y \ - diffutils \ - gcc-c++ \ - libcurl-devel \ - make \ - openssl-devel \ - wget \ - which + centos-release-scl \ + curl \ + diffutils \ + gcc-c++ \ + libcurl-devel \ + make \ + openssl-devel \ + wget \ + which + +# devtoolset is required for C++17 +RUN yum install -y devtoolset-8 # yum install cmake version is too old ARG cmake=3.23.1 RUN mkdir /opt/cmake-${cmake} RUN wget -nv -O - https://github.com/Kitware/CMake/releases/download/v${cmake}/cmake-${cmake}-Linux-x86_64.tar.gz | \ - tar -xzf - --strip-components=1 -C /opt/cmake-${cmake} -ENV PATH=/opt/cmake-${cmake}/bin:$PATH -ENV CC=/usr/bin/gcc -ENV CXX=/usr/bin/g++ -ENV EXTRA_CMAKE_FLAGS="-DCMAKE_C_COMPILER=$CC -DCMAKE_CXX_COMPILER=$CXX" -ENV ARROW_R_DEV=TRUE + tar -xzf - --strip-components=1 -C /opt/cmake-${cmake} + +COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ +RUN bash /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin + +ENV PATH=/opt/cmake-${cmake}/bin:$PATH \ + ARROW_R_DEV=TRUE diff --git a/ci/docker/conda-cpp.dockerfile b/ci/docker/conda-cpp.dockerfile index 72a839cf57c..c3db8cd2c4d 100644 --- a/ci/docker/conda-cpp.dockerfile +++ b/ci/docker/conda-cpp.dockerfile @@ -22,6 +22,10 @@ FROM ${repo}:${arch}-conda COPY ci/scripts/install_minio.sh /arrow/ci/scripts RUN /arrow/ci/scripts/install_minio.sh latest /opt/conda +# Unless overriden use Python 3.10 +# Google GCS fails building with Python 3.11 at the moment. 
+ARG python=3.10 + # install the required conda packages into the test environment COPY ci/conda_env_cpp.txt \ ci/conda_env_gandiva.txt \ @@ -31,6 +35,7 @@ RUN mamba install -q -y \ --file arrow/ci/conda_env_gandiva.txt \ compilers \ doxygen \ + python=${python} \ valgrind && \ mamba clean --all @@ -38,6 +43,9 @@ RUN mamba install -q -y \ COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts RUN /arrow/ci/scripts/install_gcs_testbench.sh default +COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin + ENV ARROW_BUILD_TESTS=ON \ ARROW_DATASET=ON \ ARROW_DEPENDENCY_SOURCE=CONDA \ @@ -59,7 +67,6 @@ ENV ARROW_BUILD_TESTS=ON \ ARROW_WITH_SNAPPY=ON \ ARROW_WITH_ZLIB=ON \ ARROW_WITH_ZSTD=ON \ - CMAKE_CXX_STANDARD=17 \ GTest_SOURCE=BUNDLED \ PARQUET_BUILD_EXAMPLES=ON \ PARQUET_BUILD_EXECUTABLES=ON \ diff --git a/ci/docker/conda-integration.dockerfile b/ci/docker/conda-integration.dockerfile index 8bcf5954d1d..a455ce381e9 100644 --- a/ci/docker/conda-integration.dockerfile +++ b/ci/docker/conda-integration.dockerfile @@ -27,6 +27,7 @@ ARG go=1.15 # Install Archery and integration dependencies COPY ci/conda_env_archery.txt /arrow/ci/ + RUN mamba install -q -y \ --file arrow/ci/conda_env_archery.txt \ "python>=3.7" \ @@ -62,6 +63,7 @@ ENV ARROW_BUILD_INTEGRATION=ON \ ARROW_DATASET=OFF \ ARROW_FILESYSTEM=OFF \ ARROW_FLIGHT=ON \ + ARROW_FLIGHT_SQL=ON \ ARROW_GANDIVA=OFF \ ARROW_HDFS=OFF \ ARROW_JEMALLOC=OFF \ diff --git a/ci/docker/conda-python-hdfs.dockerfile b/ci/docker/conda-python-hdfs.dockerfile index 30056ea42cf..94da3e2e094 100644 --- a/ci/docker/conda-python-hdfs.dockerfile +++ b/ci/docker/conda-python-hdfs.dockerfile @@ -42,12 +42,16 @@ COPY ci/etc/hdfs-site.xml $HADOOP_HOME/etc/hadoop/ # build cpp with tests ENV CC=gcc \ CXX=g++ \ + ARROW_BUILD_TESTS=ON \ + ARROW_COMPUTE=ON \ + ARROW_CSV=ON \ + ARROW_DATASET=ON \ + ARROW_FILESYSTEM=ON \ ARROW_FLIGHT=OFF \ ARROW_GANDIVA=OFF \ - ARROW_PLASMA=OFF \ - ARROW_PARQUET=ON \ - PARQUET_REQUIRE_ENCRYPTION=ON \ - ARROW_ORC=OFF \ ARROW_HDFS=ON \ - ARROW_PYTHON=ON \ - ARROW_BUILD_TESTS=ON + ARROW_JSON=ON \ + ARROW_ORC=OFF \ + ARROW_PARQUET=ON \ + ARROW_PLASMA=OFF \ + PARQUET_REQUIRE_ENCRYPTION=ON diff --git a/ci/docker/conda-python-kartothek.dockerfile b/ci/docker/conda-python-kartothek.dockerfile deleted file mode 100644 index 72b7628c23c..00000000000 --- a/ci/docker/conda-python-kartothek.dockerfile +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -ARG repo -ARG arch=amd64 -ARG python=3.8 -FROM ${repo}:${arch}-conda-python-${python} - -# install kartothek dependencies from conda-forge -RUN mamba install -c conda-forge -q -y \ - attrs \ - click \ - cloudpickle \ - dask \ - decorator \ - deprecation \ - freezegun \ - msgpack-python \ - prompt-toolkit \ - pytest-mock \ - pytest-xdist \ - pyyaml \ - simplejson \ - simplekv \ - storefact \ - toolz \ - urlquote \ - zstandard && \ - mamba clean --all - -ARG kartothek=latest -COPY ci/scripts/install_kartothek.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_kartothek.sh ${kartothek} /kartothek diff --git a/ci/docker/conda-python-spark.dockerfile b/ci/docker/conda-python-spark.dockerfile index 95f85ef8ee2..861d83fe607 100644 --- a/ci/docker/conda-python-spark.dockerfile +++ b/ci/docker/conda-python-spark.dockerfile @@ -37,7 +37,11 @@ RUN /arrow/ci/scripts/install_spark.sh ${spark} /spark # build cpp with tests ENV CC=gcc \ CXX=g++ \ - ARROW_PYTHON=ON \ - ARROW_HDFS=ON \ ARROW_BUILD_TESTS=OFF \ + ARROW_COMPUTE=ON \ + ARROW_CSV=ON \ + ARROW_DATASET=ON \ + ARROW_FILESYSTEM=ON \ + ARROW_HDFS=ON \ + ARROW_JSON=ON \ SPARK_VERSION=${spark} diff --git a/ci/docker/conda-python-turbodbc.dockerfile b/ci/docker/conda-python-turbodbc.dockerfile deleted file mode 100644 index 15eecfe2fb3..00000000000 --- a/ci/docker/conda-python-turbodbc.dockerfile +++ /dev/null @@ -1,50 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -ARG repo -ARG arch=amd64 -ARG python=3.8 -FROM ${repo}:${arch}-conda-python-${python} - -RUN export DEBIAN_FRONTEND=noninteractive && \ - apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - odbc-postgresql \ - postgresql \ - sudo && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# install turbodbc dependencies from conda-forge -RUN mamba install -c conda-forge -q -y \ - pybind11 \ - pytest-cov \ - mock \ - unixodbc && \ - mamba clean --all - -RUN service postgresql start && \ - sudo -u postgres psql -U postgres -c \ - "CREATE DATABASE test_db;" && \ - sudo -u postgres psql -U postgres -c \ - "ALTER USER postgres WITH PASSWORD 'password';" - -ARG turbodbc=latest -COPY ci/scripts/install_turbodbc.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_turbodbc.sh ${turbodbc} /turbodbc - -ENV TURBODBC_TEST_CONFIGURATION_FILES "query_fixtures_postgresql.json" diff --git a/ci/docker/conda-python.dockerfile b/ci/docker/conda-python.dockerfile index 865a44a9182..85cf5f3a93b 100644 --- a/ci/docker/conda-python.dockerfile +++ b/ci/docker/conda-python.dockerfile @@ -37,10 +37,14 @@ RUN mamba install -q -y \ COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts RUN /arrow/ci/scripts/install_gcs_testbench.sh default -ENV ARROW_PYTHON=ON \ - ARROW_BUILD_STATIC=OFF \ +ENV ARROW_BUILD_STATIC=OFF \ ARROW_BUILD_TESTS=OFF \ ARROW_BUILD_UTILITIES=OFF \ + ARROW_COMPUTE=ON \ + ARROW_CSV=ON \ + ARROW_DATASET=ON \ + ARROW_FILESYSTEM=ON \ + ARROW_HDFS=ON \ + ARROW_JSON=ON \ ARROW_TENSORFLOW=ON \ - ARROW_USE_GLOG=OFF \ - ARROW_HDFS=ON + ARROW_USE_GLOG=OFF diff --git a/ci/docker/conda.dockerfile b/ci/docker/conda.dockerfile index d0545e3bf84..af7a2eceab9 100644 --- a/ci/docker/conda.dockerfile +++ b/ci/docker/conda.dockerfile @@ -21,7 +21,7 @@ FROM ${arch}/ubuntu:18.04 # install build essentials RUN export DEBIAN_FRONTEND=noninteractive && \ apt-get update -y -q && \ - apt-get install -y -q wget tzdata libc6-dbg gdb \ + apt-get install -y -q curl wget tzdata libc6-dbg gdb \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* diff --git a/ci/docker/debian-10-cpp.dockerfile b/ci/docker/debian-10-cpp.dockerfile index a0872928c57..411fd52d3c3 100644 --- a/ci/docker/debian-10-cpp.dockerfile +++ b/ci/docker/debian-10-cpp.dockerfile @@ -25,21 +25,34 @@ RUN \ /etc/apt/sources.list.d/backports.list ARG llvm +# We can't use LLVM 14 or later from apt.llvm.org on i386 because LLVM +# 14 or later dropped support for i386. 
RUN apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends \ + dpkg-dev && \ + latest_available_llvm_i386=13 && \ + if [ $(dpkg-architecture -qDEB_HOST_ARCH) = "i386" -a \ + "${llvm}" -gt "${latest_available_llvm_i386}" ]; then \ + available_llvm="${latest_available_llvm_i386}"; \ + else \ + available_llvm="${llvm}"; \ + fi && \ + apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ apt-transport-https \ ca-certificates \ gnupg \ wget && \ wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ - echo "deb https://apt.llvm.org/buster/ llvm-toolchain-buster-${llvm} main" > \ + echo "deb https://apt.llvm.org/buster/ llvm-toolchain-buster-${available_llvm} main" > \ /etc/apt/sources.list.d/llvm.list && \ apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ autoconf \ ccache \ - clang-${llvm} \ + clang-${available_llvm} \ cmake \ + curl \ g++ \ gcc \ gdb \ @@ -59,7 +72,7 @@ RUN apt-get update -y -q && \ libssl-dev \ libthrift-dev \ libutf8proc-dev \ - llvm-${llvm}-dev \ + llvm-${available_llvm}-dev \ make \ ninja-build \ nlohmann-json3-dev \ @@ -76,6 +89,9 @@ RUN apt-get update -y -q && \ COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_minio.sh latest /usr/local +COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin + ENV absl_SOURCE=BUNDLED \ ARROW_BUILD_TESTS=ON \ ARROW_DATASET=ON \ diff --git a/ci/docker/debian-10-go.dockerfile b/ci/docker/debian-10-go.dockerfile index f0c0522081d..8d964c76a66 100644 --- a/ci/docker/debian-10-go.dockerfile +++ b/ci/docker/debian-10-go.dockerfile @@ -16,12 +16,15 @@ # under the License. ARG arch=amd64 -ARG go=1.15 +ARG go=1.17 +ARG staticcheck=v0.2.2 FROM ${arch}/golang:${go}-buster -RUN GO111MODULE=on go install honnef.co/go/tools/cmd/staticcheck@v0.2.2 +# FROM collects all the args, get back the staticcheck version arg +ARG staticcheck -# TODO(kszucs): -# 1. add the files required to install the dependencies to .dockerignore -# 2. copy these files to their appropriate path -# 3. download and compile the dependencies +RUN GO111MODULE=on go install honnef.co/go/tools/cmd/staticcheck@${staticcheck} + +# Copy the go.mod and go.sum over and pre-download all the dependencies +COPY go/ /arrow/go +RUN cd /arrow/go && go mod download diff --git a/ci/docker/debian-11-cpp.dockerfile b/ci/docker/debian-11-cpp.dockerfile index a403df2368f..5f9fbb2afb6 100644 --- a/ci/docker/debian-11-cpp.dockerfile +++ b/ci/docker/debian-11-cpp.dockerfile @@ -22,21 +22,34 @@ ARG arch ENV DEBIAN_FRONTEND noninteractive ARG llvm +# We can't use LLVM 14 or later from apt.llvm.org on i386 because LLVM +# 14 or later dropped support for i386. 
RUN apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends \ + dpkg-dev && \ + latest_available_llvm_i386=13 && \ + if [ $(dpkg-architecture -qDEB_HOST_ARCH) = "i386" -a \ + "${llvm}" -gt "${latest_available_llvm_i386}" ]; then \ + available_llvm="${latest_available_llvm_i386}"; \ + else \ + available_llvm="${llvm}"; \ + fi && \ + apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ apt-transport-https \ ca-certificates \ gnupg \ wget && \ wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ - echo "deb https://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${llvm} main" > \ + echo "deb https://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${available_llvm} main" > \ /etc/apt/sources.list.d/llvm.list && \ apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ autoconf \ ccache \ - clang-${llvm} \ + clang-${available_llvm} \ cmake \ + curl \ g++ \ gcc \ gdb \ @@ -58,7 +71,7 @@ RUN apt-get update -y -q && \ libthrift-dev \ libutf8proc-dev \ libzstd-dev \ - llvm-${llvm}-dev \ + llvm-${available_llvm}-dev \ make \ ninja-build \ nlohmann-json3-dev \ @@ -78,6 +91,9 @@ RUN /arrow/ci/scripts/install_minio.sh latest /usr/local COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_gcs_testbench.sh default +COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin + ENV absl_SOURCE=BUNDLED \ ARROW_BUILD_TESTS=ON \ ARROW_DATASET=ON \ @@ -102,6 +118,7 @@ ENV absl_SOURCE=BUNDLED \ CC=gcc \ CXX=g++ \ google_cloud_cpp_storage_SOURCE=BUNDLED \ + GTest_SOURCE=BUNDLED \ ORC_SOURCE=BUNDLED \ PATH=/usr/lib/ccache/:$PATH \ Protobuf_SOURCE=BUNDLED \ diff --git a/ci/docker/debian-11-go.dockerfile b/ci/docker/debian-11-go.dockerfile index 33f523e36aa..9f75bf23fdd 100644 --- a/ci/docker/debian-11-go.dockerfile +++ b/ci/docker/debian-11-go.dockerfile @@ -16,12 +16,14 @@ # under the License. ARG arch=amd64 -ARG go=1.16 +ARG go=1.17 +ARG staticcheck=v0.2.2 FROM ${arch}/golang:${go}-bullseye -RUN GO111MODULE=on go install honnef.co/go/tools/cmd/staticcheck@v0.2.2 +# FROM collects all the args, get back the staticcheck version arg +ARG staticcheck +RUN GO111MODULE=on go install honnef.co/go/tools/cmd/staticcheck@${staticcheck} -# TODO(kszucs): -# 1. add the files required to install the dependencies to .dockerignore -# 2. copy these files to their appropriate path -# 3. 
download and compile the dependencies +# Copy the go.mod and go.sum over and pre-download all the dependencies +COPY go/ /arrow/go +RUN cd /arrow/go && go mod download diff --git a/ci/docker/fedora-35-cpp.dockerfile b/ci/docker/fedora-35-cpp.dockerfile index ce9c8857c85..aeb7c5b7951 100644 --- a/ci/docker/fedora-35-cpp.dockerfile +++ b/ci/docker/fedora-35-cpp.dockerfile @@ -30,6 +30,7 @@ RUN dnf update -y && \ ccache \ clang-devel \ cmake \ + curl \ curl-devel \ flatbuffers-devel \ gcc \ @@ -71,12 +72,14 @@ RUN /arrow/ci/scripts/install_minio.sh latest /usr/local COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_gcs_testbench.sh default +COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin + ENV absl_SOURCE=BUNDLED \ ARROW_BUILD_TESTS=ON \ ARROW_DEPENDENCY_SOURCE=SYSTEM \ ARROW_DATASET=ON \ ARROW_FLIGHT=ON \ - ARROW_GANDIVA_JAVA=ON \ ARROW_GANDIVA=ON \ ARROW_GCS=ON \ ARROW_HOME=/usr/local \ diff --git a/ci/docker/java-jni-manylinux-201x.dockerfile b/ci/docker/java-jni-manylinux-201x.dockerfile index 52bdb9b923d..b3ecbf00a92 100644 --- a/ci/docker/java-jni-manylinux-201x.dockerfile +++ b/ci/docker/java-jni-manylinux-201x.dockerfile @@ -24,6 +24,7 @@ RUN vcpkg install \ --clean-after-build \ --x-install-root=${VCPKG_ROOT}/installed \ --x-manifest-root=/arrow/ci/vcpkg \ + --x-feature=dev \ --x-feature=flight \ --x-feature=gcs \ --x-feature=json \ @@ -33,5 +34,12 @@ RUN vcpkg install \ # Install Java ARG java=1.8.0 -RUN yum install -y java-$java-openjdk-devel && yum clean all +RUN yum install -y java-$java-openjdk-devel rh-maven35 && yum clean all ENV JAVA_HOME=/usr/lib/jvm/java-$java-openjdk/ + +# For ci/scripts/{cpp,java}_*.sh +ENV ARROW_HOME=/tmp/local \ + ARROW_JAVA_CDATA=ON \ + ARROW_JAVA_JNI=ON \ + ARROW_PLASMA=ON \ + ARROW_USE_CCACHE=ON diff --git a/ci/docker/java-jni-manylinux-aarch64-201x.dockerfile b/ci/docker/java-jni-manylinux-aarch64-201x.dockerfile new file mode 100644 index 00000000000..52bdb9b923d --- /dev/null +++ b/ci/docker/java-jni-manylinux-aarch64-201x.dockerfile @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +ARG base +FROM ${base} + +# Install the libaries required by the Gandiva to run +# Use enable llvm[enable-rtti] in the vcpkg.json to avoid link problems in Gandiva +RUN vcpkg install \ + --clean-after-build \ + --x-install-root=${VCPKG_ROOT}/installed \ + --x-manifest-root=/arrow/ci/vcpkg \ + --x-feature=flight \ + --x-feature=gcs \ + --x-feature=json \ + --x-feature=parquet \ + --x-feature=gandiva \ + --x-feature=s3 + +# Install Java +ARG java=1.8.0 +RUN yum install -y java-$java-openjdk-devel && yum clean all +ENV JAVA_HOME=/usr/lib/jvm/java-$java-openjdk/ diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index a415f1d5a45..9b27358a69a 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -96,10 +96,15 @@ RUN /arrow/ci/scripts/r_deps.sh /arrow && \ ENV ARROW_BUILD_STATIC=OFF \ ARROW_BUILD_TESTS=OFF \ ARROW_BUILD_UTILITIES=OFF \ + ARROW_COMPUTE=ON \ + ARROW_CSV=ON \ + ARROW_DATASET=ON \ + ARROW_FILESYSTEM=ON \ ARROW_FLIGHT=ON \ ARROW_GCS=ON \ ARROW_GLIB_VAPI=false \ - ARROW_PYTHON=ON \ + ARROW_HDFS=ON \ + ARROW_JSON=ON \ ARROW_S3=ON \ ARROW_USE_GLOG=OFF \ CMAKE_UNITY_BUILD=ON diff --git a/ci/docker/linux-apt-jni.dockerfile b/ci/docker/linux-apt-jni.dockerfile index 92b6cf9a9fc..7b3e1b8416b 100644 --- a/ci/docker/linux-apt-jni.dockerfile +++ b/ci/docker/linux-apt-jni.dockerfile @@ -73,11 +73,10 @@ ENV PATH=/opt/cmake-${cmake}-Linux-x86_64/bin:$PATH ENV ARROW_BUILD_TESTS=ON \ ARROW_DATASET=ON \ ARROW_FLIGHT=OFF \ - ARROW_GANDIVA_JAVA=ON \ ARROW_GANDIVA=ON \ ARROW_HOME=/usr/local \ ARROW_JAVA_CDATA=ON \ - ARROW_JNI=ON \ + ARROW_JAVA_JNI=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_PLASMA_JAVA_CLIENT=ON \ diff --git a/ci/docker/linux-apt-python-3.dockerfile b/ci/docker/linux-apt-python-3.dockerfile index ece7cf09129..19f3666ced4 100644 --- a/ci/docker/linux-apt-python-3.dockerfile +++ b/ci/docker/linux-apt-python-3.dockerfile @@ -39,8 +39,19 @@ RUN pip install \ -r arrow/python/requirements-build.txt \ -r arrow/python/requirements-test.txt -ENV ARROW_PYTHON=ON \ - ARROW_BUILD_STATIC=OFF \ +ARG numba +COPY ci/scripts/install_numba.sh /arrow/ci/scripts/ +RUN if [ "${numba}" != "" ]; then \ + /arrow/ci/scripts/install_numba.sh ${numba} \ + ; fi + +ENV ARROW_BUILD_STATIC=OFF \ ARROW_BUILD_TESTS=OFF \ ARROW_BUILD_UTILITIES=OFF \ - ARROW_USE_GLOG=OFF \ + ARROW_COMPUTE=ON \ + ARROW_CSV=ON \ + ARROW_DATASET=ON \ + ARROW_FILESYSTEM=ON \ + ARROW_HDFS=ON \ + ARROW_JSON=ON \ + ARROW_USE_GLOG=OFF diff --git a/ci/docker/linux-apt-r.dockerfile b/ci/docker/linux-apt-r.dockerfile index 7083bfa3d95..971078b3601 100644 --- a/ci/docker/linux-apt-r.dockerfile +++ b/ci/docker/linux-apt-r.dockerfile @@ -103,13 +103,18 @@ ENV \ ARROW_BUILD_STATIC=OFF \ ARROW_BUILD_TESTS=OFF \ ARROW_BUILD_UTILITIES=OFF \ + ARROW_COMPUTE=ON \ + ARROW_CSV=ON \ + ARROW_DATASET=ON \ + ARROW_FILESYSTEM=ON \ ARROW_FLIGHT=OFF \ ARROW_GANDIVA=OFF \ + ARROW_HDFS=OFF \ + ARROW_JSON=ON \ ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=OFF \ ARROW_PARQUET=ON \ ARROW_PLASMA=OFF \ - ARROW_PYTHON=ON \ ARROW_S3=ON \ ARROW_USE_CCACHE=ON \ ARROW_USE_GLOG=OFF \ diff --git a/ci/docker/linux-dnf-python-3.dockerfile b/ci/docker/linux-dnf-python-3.dockerfile index 62dc72899e7..c37febb4e00 100644 --- a/ci/docker/linux-dnf-python-3.dockerfile +++ b/ci/docker/linux-dnf-python-3.dockerfile @@ -36,8 +36,13 @@ RUN pip install \ -r arrow/python/requirements-build.txt \ -r arrow/python/requirements-test.txt -ENV ARROW_PYTHON=ON \ - ARROW_BUILD_STATIC=OFF \ +ENV ARROW_BUILD_STATIC=OFF \ 
ARROW_BUILD_TESTS=OFF \ ARROW_BUILD_UTILITIES=OFF \ - ARROW_USE_GLOG=OFF \ + ARROW_COMPUTE=ON \ + ARROW_CSV=ON \ + ARROW_DATASET=ON \ + ARROW_FILESYSTEM=ON \ + ARROW_HDFS=ON \ + ARROW_JSON=ON \ + ARROW_USE_GLOG=OFF diff --git a/ci/docker/linux-r.dockerfile b/ci/docker/linux-r.dockerfile index 804fb09f09c..d368a6629c5 100644 --- a/ci/docker/linux-r.dockerfile +++ b/ci/docker/linux-r.dockerfile @@ -27,7 +27,7 @@ ENV R_BIN=${r_bin} ARG r_dev=FALSE ENV ARROW_R_DEV=${r_dev} -ARG devtoolset_version=-1 +ARG devtoolset_version= ENV DEVTOOLSET_VERSION=${devtoolset_version} ARG r_prune_deps=FALSE @@ -45,13 +45,14 @@ ENV PATH "${RPREFIX}/bin:${PATH}" # Patch up some of the docker images COPY ci/scripts/r_docker_configure.sh /arrow/ci/scripts/ COPY ci/etc/rprofile /arrow/ci/etc/ +COPY ci/scripts/r_install_system_dependencies.sh /arrow/ci/scripts/ COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/r_docker_configure.sh -# Set up Python 3 and its dependencies -RUN ln -s /usr/bin/python3 /usr/local/bin/python && \ - ln -s /usr/bin/pip3 /usr/local/bin/pip +# this has to come after r_docker_configure to ensure curl is installed +COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin COPY ci/scripts/r_deps.sh /arrow/ci/scripts/ COPY r/DESCRIPTION /arrow/r/ diff --git a/ci/docker/python-wheel-manylinux-201x.dockerfile b/ci/docker/python-wheel-manylinux-201x.dockerfile index 4f74b8b1c59..adab10da623 100644 --- a/ci/docker/python-wheel-manylinux-201x.dockerfile +++ b/ci/docker/python-wheel-manylinux-201x.dockerfile @@ -75,8 +75,7 @@ RUN vcpkg install \ --x-feature=flight \ --x-feature=gcs \ --x-feature=json \ - --x-feature=parquet \ - --x-feature=s3 + --x-feature=parquet ARG python=3.8 ENV PYTHON_VERSION=${python} diff --git a/ci/docker/python-wheel-windows-test-vs2017.dockerfile b/ci/docker/python-wheel-windows-test-vs2017.dockerfile new file mode 100644 index 00000000000..a4c836ef4f0 --- /dev/null +++ b/ci/docker/python-wheel-windows-test-vs2017.dockerfile @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# NOTE: You must update PYTHON_WHEEL_WINDOWS_IMAGE_REVISION in .env +# when you update this file. 
+ +# based on mcr.microsoft.com/windows/servercore:ltsc2019 +# contains choco and vs2017 preinstalled +FROM abrarov/msvc-2017:2.11.0 + +# Add unix tools to path +RUN setx path "%path%;C:\Program Files\Git\usr\bin" + +# Remove previous installations of python from the base image +# NOTE: a more recent base image (tried with 2.12.1) comes with python 3.9.7 +# and the msi installers are failing to remove pip and tcl/tk "products" making +# the subsequent choco python installation step failing for installing python +# version 3.9.* due to existing python version +RUN wmic product where "name like 'python%%'" call uninstall /nointeractive && \ + rm -rf Python* + +# Define the full version number otherwise choco falls back to patch number 0 (3.7 => 3.7.0) +ARG python=3.8 +RUN (if "%python%"=="3.7" setx PYTHON_VERSION "3.7.9" && setx PATH "%PATH%;C:\Python37;C:\Python37\Scripts") & \ + (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10" && setx PATH "%PATH%;C:\Python38;C:\Python38\Scripts") & \ + (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13" && setx PATH "%PATH%;C:\Python39;C:\Python39\Scripts") & \ + (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.8" && setx PATH "%PATH%;C:\Python310;C:\Python310\Scripts") & \ + (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.0" && setx PATH "%PATH%;C:\Python311;C:\Python311\Scripts") +RUN choco install -r -y --no-progress python --version=%PYTHON_VERSION% +RUN python -m pip install -U pip setuptools diff --git a/ci/docker/python-wheel-windows-vs2017.dockerfile b/ci/docker/python-wheel-windows-vs2017.dockerfile index 247f13a15cf..f82a47a0576 100644 --- a/ci/docker/python-wheel-windows-vs2017.dockerfile +++ b/ci/docker/python-wheel-windows-vs2017.dockerfile @@ -80,8 +80,9 @@ RUN wmic product where "name like 'python%%'" call uninstall /nointeractive && \ ARG python=3.8 RUN (if "%python%"=="3.7" setx PYTHON_VERSION "3.7.9" && setx PATH "%PATH%;C:\Python37;C:\Python37\Scripts") & \ (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10" && setx PATH "%PATH%;C:\Python38;C:\Python38\Scripts") & \ - (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.7" && setx PATH "%PATH%;C:\Python39;C:\Python39\Scripts") & \ - (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.2" && setx PATH "%PATH%;C:\Python310;C:\Python310\Scripts") + (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13" && setx PATH "%PATH%;C:\Python39;C:\Python39\Scripts") & \ + (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.8" && setx PATH "%PATH%;C:\Python310;C:\Python310\Scripts") & \ + (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.0" && setx PATH "%PATH%;C:\Python311;C:\Python311\Scripts") RUN choco install -r -y --no-progress python --version=%PYTHON_VERSION% RUN python -m pip install -U pip setuptools diff --git a/ci/docker/ubuntu-18.04-cpp.dockerfile b/ci/docker/ubuntu-18.04-cpp.dockerfile index 0e20b7c6a83..715cc3424f0 100644 --- a/ci/docker/ubuntu-18.04-cpp.dockerfile +++ b/ci/docker/ubuntu-18.04-cpp.dockerfile @@ -30,26 +30,40 @@ ENV DEBIAN_FRONTEND=noninteractive # while debugging package list with docker build. ARG clang_tools ARG llvm -RUN apt-get update -y -q && \ +# We can't use LLVM 14 from apt.llvm.org because LLVM 14 requires libgcc-s1 +# but libgcc-s1 is available since Ubuntu 20.04. 
+RUN latest_available_llvm=13 && \ + if [ "${llvm}" -gt "${latest_available_llvm}" ]; then \ + available_llvm="${latest_available_llvm}"; \ + else \ + available_llvm="${llvm}"; \ + fi && \ + if [ "${clang_tools}" -gt "${latest_available_llvm}" ]; then \ + available_clang_tools="${latest_available_llvm}"; \ + else \ + available_clang_tools="${clang_tools}"; \ + fi && \ + apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ apt-transport-https \ ca-certificates \ gnupg \ wget && \ wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ - echo "deb https://apt.llvm.org/bionic/ llvm-toolchain-bionic-${llvm} main" > \ + echo "deb https://apt.llvm.org/bionic/ llvm-toolchain-bionic-${available_llvm} main" > \ /etc/apt/sources.list.d/llvm.list && \ - if [ "${clang_tools}" != "${llvm}" -a "${clang_tools}" -ge 10 ]; then \ - echo "deb https://apt.llvm.org/bionic/ llvm-toolchain-bionic-${clang_tools} main" > \ + if [ "${available_clang_tools}" -ne "${available_llvm}" -a \ + "${available_clang_tools}" -ge 10 ]; then \ + echo "deb https://apt.llvm.org/bionic/ llvm-toolchain-bionic-${available_clang_tools} main" > \ /etc/apt/sources.list.d/clang-tools.list; \ fi && \ apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ - clang-${clang_tools} \ - clang-${llvm} \ - clang-format-${clang_tools} \ - clang-tidy-${clang_tools} \ - llvm-${llvm}-dev && \ + clang-${available_clang_tools} \ + clang-${available_llvm} \ + clang-format-${available_clang_tools} \ + clang-tidy-${available_clang_tools} \ + llvm-${available_llvm}-dev && \ apt-get clean && \ rm -rf /var/lib/apt/lists* @@ -60,6 +74,7 @@ RUN apt-get update -y -q && \ ca-certificates \ ccache \ cmake \ + curl \ g++ \ gcc \ gdb \ @@ -73,11 +88,19 @@ RUN apt-get update -y -q && \ libcurl4-openssl-dev \ libgflags-dev \ libgoogle-glog-dev \ + libidn2-dev \ + libkrb5-dev \ + libldap-dev \ liblz4-dev \ + libnghttp2-dev \ libprotobuf-dev \ libprotoc-dev \ + libpsl-dev \ libre2-dev \ + librtmp-dev \ libsnappy-dev \ + libssh-dev \ + libssh2-1-dev \ libssl-dev \ ninja-build \ pkg-config \ @@ -100,6 +123,10 @@ RUN apt-get update -y -q && \ # - s3 tests would require boost-asio that is included since Boost 1.66.0 # ARROW-17051: this build uses static Protobuf, so we must also use # static Arrow to run Flight/Flight SQL tests + +COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin + ENV ARROW_BUILD_STATIC=ON \ ARROW_BUILD_TESTS=ON \ ARROW_DATASET=ON \ diff --git a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile index f77ff40e5fb..ca2be2873d6 100644 --- a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile @@ -28,6 +28,7 @@ RUN apt-get update -y -q && \ build-essential \ ccache \ cmake \ + curl \ git \ libssl-dev \ libcurl4-openssl-dev \ @@ -70,6 +71,9 @@ RUN /arrow/ci/scripts/install_minio.sh latest /usr/local COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_gcs_testbench.sh default +COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin + ENV ARROW_BUILD_TESTS=ON \ ARROW_DATASET=ON \ ARROW_FLIGHT=ON \ diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index dd36aff84c5..6cf48c56aa4 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -68,6 +68,7 @@ 
RUN apt-get update -y -q && \ ca-certificates \ ccache \ cmake \ + curl \ g++ \ gcc \ gdb \ @@ -81,12 +82,20 @@ RUN apt-get update -y -q && \ libcurl4-openssl-dev \ libgflags-dev \ libgoogle-glog-dev \ + libidn2-dev \ + libkrb5-dev \ + libldap-dev \ liblz4-dev \ + libnghttp2-dev \ libprotobuf-dev \ libprotoc-dev \ + libpsl-dev \ libradospp-dev \ libre2-dev \ + librtmp-dev \ libsnappy-dev \ + libssh-dev \ + libssh2-1-dev \ libssl-dev \ libthrift-dev \ libutf8proc-dev \ @@ -116,6 +125,9 @@ RUN /arrow/ci/scripts/install_gcs_testbench.sh default COPY ci/scripts/install_ceph.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_ceph.sh +COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin + # Prioritize system packages and local installation # The following dependencies will be downloaded due to missing/invalid packages # provided by the distribution: diff --git a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile index 8bc5ab3e484..f0dc76c65f9 100644 --- a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile @@ -28,6 +28,7 @@ RUN apt-get update -y -q && \ build-essential \ ccache \ cmake \ + curl \ git \ libssl-dev \ libcurl4-openssl-dev \ @@ -70,6 +71,9 @@ RUN /arrow/ci/scripts/install_minio.sh latest /usr/local COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_gcs_testbench.sh default +COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin + ENV ARROW_BUILD_TESTS=ON \ ARROW_DATASET=ON \ ARROW_FLIGHT=ON \ diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index 05aca53151b..d47614ed2c9 100644 --- a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -68,6 +68,7 @@ RUN apt-get update -y -q && \ ca-certificates \ ccache \ cmake \ + curl \ gdb \ git \ libbenchmark-dev \ @@ -80,13 +81,21 @@ RUN apt-get update -y -q && \ libgflags-dev \ libgoogle-glog-dev \ libgrpc++-dev \ + libidn2-dev \ + libkrb5-dev \ + libldap-dev \ liblz4-dev \ + libnghttp2-dev \ libprotobuf-dev \ libprotoc-dev \ + libpsl-dev \ libre2-dev \ + librtmp-dev \ libsnappy-dev \ - libssl-dev \ libsqlite3-dev \ + libssh-dev \ + libssh2-1-dev \ + libssl-dev \ libthrift-dev \ libutf8proc-dev \ libzstd-dev \ @@ -112,7 +121,7 @@ RUN if [ "${gcc_version}" = "" ]; then \ g++ \ gcc; \ else \ - if [ "${gcc_version}" -gt "11" ]; then \ + if [ "${gcc_version}" -gt "12" ]; then \ apt-get update -y -q && \ apt-get install -y -q --no-install-recommends software-properties-common && \ add-apt-repository ppa:ubuntu-toolchain-r/volatile; \ @@ -143,6 +152,9 @@ RUN /arrow/ci/scripts/install_minio.sh latest /usr/local COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_gcs_testbench.sh default +COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin + # Prioritize system packages and local installation # The following dependencies will be downloaded due to missing/invalid packages # provided by the distribution: diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index f0a09bab7f0..dcb4154cc14 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,7 +18,7 @@ _realname=arrow pkgbase=mingw-w64-${_realname} pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=9.0.0.9000 +pkgver=11.0.0 
pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") @@ -73,26 +73,8 @@ build() { # set the appropriate compiler definition. export CPPFLAGS="-DUTF8PROC_STATIC" - # This is the difference between rtools-packages and rtools-backports - # Remove this when submitting to rtools-packages - if [ "$RTOOLS_VERSION" = "35" ]; then - export CC="/C/Rtools${MINGW_PREFIX/mingw/mingw_}/bin/gcc" - export CXX="/C/Rtools${MINGW_PREFIX/mingw/mingw_}/bin/g++" - export PATH="/C/Rtools${MINGW_PREFIX/mingw/mingw_}/bin:$PATH" - export CPPFLAGS="${CPPFLAGS} -I${MINGW_PREFIX}/include" - export LIBS="-L${MINGW_PREFIX}/libs" - export ARROW_GCS=OFF - export ARROW_S3=OFF - export ARROW_WITH_RE2=OFF - # Without this, some dataset functionality segfaults - export CMAKE_UNITY_BUILD=ON - else - export ARROW_GCS=ON - export ARROW_S3=ON - export ARROW_WITH_RE2=ON - # Without this, some compute functionality segfaults in tests - export CMAKE_UNITY_BUILD=OFF - fi + # CMAKE_UNITY_BUILD is set to OFF as otherwise some compute functionality + # segfaults in tests MSYS2_ARG_CONV_EXCL="-DCMAKE_INSTALL_PREFIX=" \ ${MINGW_PREFIX}/bin/cmake.exe \ @@ -105,7 +87,7 @@ build() { -DARROW_CSV=ON \ -DARROW_DATASET=ON \ -DARROW_FILESYSTEM=ON \ - -DARROW_GCS="${ARROW_GCS}" \ + -DARROW_GCS=ON \ -DARROW_HDFS=OFF \ -DARROW_JEMALLOC=OFF \ -DARROW_JSON=ON \ @@ -113,13 +95,13 @@ build() { -DARROW_MIMALLOC=ON \ -DARROW_PACKAGE_PREFIX="${MINGW_PREFIX}" \ -DARROW_PARQUET=ON \ - -DARROW_S3="${ARROW_S3}" \ + -DARROW_S3=ON \ -DARROW_SNAPPY_USE_SHARED=OFF \ -DARROW_USE_GLOG=OFF \ -DARROW_UTF8PROC_USE_SHARED=OFF \ -DARROW_VERBOSE_THIRDPARTY_BUILD=ON \ -DARROW_WITH_LZ4=ON \ - -DARROW_WITH_RE2="${ARROW_WITH_RE2}" \ + -DARROW_WITH_RE2=ON \ -DARROW_WITH_SNAPPY=ON \ -DARROW_WITH_ZLIB=ON \ -DARROW_WITH_ZSTD=ON \ @@ -129,7 +111,7 @@ build() { -DARROW_CXXFLAGS="${CPPFLAGS}" \ -DCMAKE_BUILD_TYPE="release" \ -DCMAKE_INSTALL_PREFIX=${MINGW_PREFIX} \ - -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ + -DCMAKE_UNITY_BUILD=OFF \ -DCMAKE_VERBOSE_MAKEFILE=ON make -j3 diff --git a/ci/scripts/conan_build.sh b/ci/scripts/conan_build.sh index 3b1d9b6c977..3bdd6a0bd5f 100755 --- a/ci/scripts/conan_build.sh +++ b/ci/scripts/conan_build.sh @@ -46,6 +46,9 @@ fi if [ -n "${ARROW_CONAN_WITH_JEMALLOC:-}" ]; then conan_args+=(--options arrow:with_jemalloc=${ARROW_CONAN_WITH_JEMALLOC}) fi +if [ -n "${ARROW_CONAN_WITH_JSON:-}" ]; then + conan_args+=(--options arrow:with_json=${ARROW_CONAN_WITH_JSON}) +fi if [ -n "${ARROW_CONAN_WITH_LZ4:-}" ]; then conan_args+=(--options arrow:with_lz4=${ARROW_CONAN_WITH_LZ4}) fi diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 738f3d26470..b3d9e0d3ec1 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -91,23 +91,19 @@ cmake \ -DARROW_FLIGHT=${ARROW_FLIGHT:-OFF} \ -DARROW_FLIGHT_SQL=${ARROW_FLIGHT_SQL:-OFF} \ -DARROW_FUZZING=${ARROW_FUZZING:-OFF} \ - -DARROW_GANDIVA_JAVA=${ARROW_GANDIVA_JAVA:-OFF} \ -DARROW_GANDIVA_PC_CXX_FLAGS=${ARROW_GANDIVA_PC_CXX_FLAGS:-} \ -DARROW_GANDIVA=${ARROW_GANDIVA:-OFF} \ -DARROW_GCS=${ARROW_GCS:-OFF} \ -DARROW_HDFS=${ARROW_HDFS:-ON} \ -DARROW_INSTALL_NAME_RPATH=${ARROW_INSTALL_NAME_RPATH:-ON} \ -DARROW_JEMALLOC=${ARROW_JEMALLOC:-ON} \ - -DARROW_JNI=${ARROW_JNI:-OFF} \ -DARROW_JSON=${ARROW_JSON:-ON} \ -DARROW_LARGE_MEMORY_TESTS=${ARROW_LARGE_MEMORY_TESTS:-OFF} \ -DARROW_MIMALLOC=${ARROW_MIMALLOC:-OFF} \ -DARROW_NO_DEPRECATED_API=${ARROW_NO_DEPRECATED_API:-OFF} \ -DARROW_ORC=${ARROW_ORC:-OFF} \ 
-DARROW_PARQUET=${ARROW_PARQUET:-OFF} \ - -DARROW_PLASMA_JAVA_CLIENT=${ARROW_PLASMA_JAVA_CLIENT:-OFF} \ -DARROW_PLASMA=${ARROW_PLASMA:-OFF} \ - -DARROW_PYTHON=${ARROW_PYTHON:-OFF} \ -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \ -DARROW_S3=${ARROW_S3:-OFF} \ -DARROW_SKYHOOK=${ARROW_SKYHOOK:-OFF} \ @@ -127,6 +123,7 @@ cmake \ -DARROW_WITH_BZ2=${ARROW_WITH_BZ2:-OFF} \ -DARROW_WITH_LZ4=${ARROW_WITH_LZ4:-OFF} \ -DARROW_WITH_OPENTELEMETRY=${ARROW_WITH_OPENTELEMETRY:-OFF} \ + -DARROW_WITH_MUSL=${ARROW_WITH_MUSL:-OFF} \ -DARROW_WITH_SNAPPY=${ARROW_WITH_SNAPPY:-OFF} \ -DARROW_WITH_UTF8PROC=${ARROW_WITH_UTF8PROC:-ON} \ -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB:-OFF} \ @@ -141,7 +138,7 @@ cmake \ -DCMAKE_VERBOSE_MAKEFILE=${CMAKE_VERBOSE_MAKEFILE:-OFF} \ -DCMAKE_C_FLAGS="${CFLAGS:-}" \ -DCMAKE_CXX_FLAGS="${CXXFLAGS:-}" \ - -DCMAKE_CXX_STANDARD="${CMAKE_CXX_STANDARD:-11}" \ + -DCMAKE_CXX_STANDARD="${CMAKE_CXX_STANDARD:-17}" \ -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR:-lib} \ -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX:-${ARROW_HOME}} \ -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD:-OFF} \ @@ -172,7 +169,7 @@ time cmake --build . --target install popd if [ -x "$(command -v ldconfig)" ]; then - ldconfig + ldconfig ${ARROW_HOME}/${CMAKE_INSTALL_LIBDIR:-lib} fi if [ "${ARROW_USE_CCACHE}" == "ON" ]; then @@ -180,6 +177,11 @@ if [ "${ARROW_USE_CCACHE}" == "ON" ]; then ccache -s fi +if command -v sccache &> /dev/null; then + echo "=== sccache stats after the build ===" + sccache --show-stats +fi + if [ "${BUILD_DOCS_CPP}" == "ON" ]; then pushd ${source_dir}/apidoc doxygen diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index 2bd7db8b2c4..2d829411b16 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -55,22 +55,20 @@ case "$(uname)" in exclude_tests="gandiva-internals-test" exclude_tests="${exclude_tests}|gandiva-projector-test" exclude_tests="${exclude_tests}|gandiva-utf8-test" - if [ "${MSYSTEM}" = "MINGW32" ]; then - exclude_tests="${exclude_tests}|gandiva-binary-test" - exclude_tests="${exclude_tests}|gandiva-boolean-expr-test" - exclude_tests="${exclude_tests}|gandiva-date-time-test" - exclude_tests="${exclude_tests}|gandiva-decimal-single-test" - exclude_tests="${exclude_tests}|gandiva-decimal-test" - exclude_tests="${exclude_tests}|gandiva-filter-project-test" - exclude_tests="${exclude_tests}|gandiva-filter-test" - exclude_tests="${exclude_tests}|gandiva-hash-test" - exclude_tests="${exclude_tests}|gandiva-if-expr-test" - exclude_tests="${exclude_tests}|gandiva-in-expr-test" - exclude_tests="${exclude_tests}|gandiva-literal-test" - exclude_tests="${exclude_tests}|gandiva-null-validity-test" - exclude_tests="${exclude_tests}|gandiva-precompiled-test" - exclude_tests="${exclude_tests}|gandiva-projector-test" - fi + exclude_tests="${exclude_tests}|gandiva-binary-test" + exclude_tests="${exclude_tests}|gandiva-boolean-expr-test" + exclude_tests="${exclude_tests}|gandiva-date-time-test" + exclude_tests="${exclude_tests}|gandiva-decimal-single-test" + exclude_tests="${exclude_tests}|gandiva-decimal-test" + exclude_tests="${exclude_tests}|gandiva-filter-project-test" + exclude_tests="${exclude_tests}|gandiva-filter-test" + exclude_tests="${exclude_tests}|gandiva-hash-test" + exclude_tests="${exclude_tests}|gandiva-if-expr-test" + exclude_tests="${exclude_tests}|gandiva-in-expr-test" + exclude_tests="${exclude_tests}|gandiva-literal-test" + exclude_tests="${exclude_tests}|gandiva-null-validity-test" + exclude_tests="${exclude_tests}|gandiva-precompiled-test" + 
exclude_tests="${exclude_tests}|gandiva-projector-test" ctest_options+=(--exclude-regex "${exclude_tests}") ;; *) @@ -80,14 +78,14 @@ esac pushd ${build_dir} -if ! which python > /dev/null 2>&1; then - export PYTHON=python3 +if [ -z "${PYTHON}" ] && ! which python > /dev/null 2>&1; then + export PYTHON="${PYTHON:-python3}" fi ctest \ --label-regex unittest \ --output-on-failure \ --parallel ${n_jobs} \ - --timeout 300 \ + --timeout ${ARROW_CTEST_TIMEOUT:-300} \ "${ctest_options[@]}" \ $@ diff --git a/ci/scripts/download_tz_database.sh b/ci/scripts/download_tz_database.sh old mode 100644 new mode 100755 diff --git a/ci/scripts/go_bench.sh b/ci/scripts/go_bench.sh new file mode 100644 index 00000000000..523acdd9764 --- /dev/null +++ b/ci/scripts/go_bench.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# this will output the benchmarks to STDOUT but if `-json` is passed +# as the second argument, it will create a file "bench_stats.json" +# in the directory this is called from containing a json representation + +set -ex + +# simplistic semver comparison +verlte() { + [ "$1" = "`echo -e "$1\n$2" | sort -V | head -n1`" ] +} +verlt() { + [ "$1" = "$2" ] && return 1 || verlte $1 $2 +} + +ver=`go env GOVERSION` + +source_dir=${1}/go + +export PARQUET_TEST_DATA=${1}/cpp/submodules/parquet-testing/data +pushd ${source_dir} + +# lots of benchmarks, they can take a while +# the timeout is for *ALL* benchmarks together, +# not per benchmark +go test -bench=. -benchmem -timeout 20m -run=^$ ./... | tee bench_stat.dat + +popd + +if [[ "$2" = "-json" ]]; then + go install go.bobheadxi.dev/gobenchdata@latest + export PATH=`go env GOPATH`/bin:$PATH + cat ${source_dir}/bench_*.dat | gobenchdata --json bench_stats.json +fi + +rm ${source_dir}/bench_*.dat \ No newline at end of file diff --git a/ci/scripts/go_bench_adapt.py b/ci/scripts/go_bench_adapt.py new file mode 100644 index 00000000000..a0f4a1dc19c --- /dev/null +++ b/ci/scripts/go_bench_adapt.py @@ -0,0 +1,102 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +import json +import os +import uuid +import logging +from pathlib import Path +from typing import List + +from benchadapt import BenchmarkResult +from benchadapt.adapters import BenchmarkAdapter +from benchadapt.log import log + +log.setLevel(logging.DEBUG) + +ARROW_ROOT = Path(__file__).parent.parent.parent.resolve() +SCRIPTS_PATH = ARROW_ROOT / "ci" / "scripts" + +if os.environ.get("CONBENCH_REF") == "master": + github = { + "repository": os.environ["GITHUB_REPOSITORY"], + "commit": os.environ["GITHUB_SHA"], + "pr_number": None, # implying default branch + } + run_reason = "commit" +else: + github = None # scrape github info from the local repo + run_reason = "branch" + +class GoAdapter(BenchmarkAdapter): + result_file = "bench_stats.json" + command = ["bash", SCRIPTS_PATH / "go_bench.sh", ARROW_ROOT, "-json"] + + def __init__(self, *args, **kwargs) -> None: + super().__init__(command=self.command, *args, **kwargs) + + def _transform_results(self) -> List[BenchmarkResult]: + with open(self.result_file, "r") as f: + raw_results = json.load(f) + + run_id = uuid.uuid4().hex + parsed_results = [] + for suite in raw_results[0]["Suites"]: + batch_id = uuid.uuid4().hex + pkg = suite["Pkg"] + + for benchmark in suite["Benchmarks"]: + data = benchmark["Mem"]["MBPerSec"] * 1e6 + time = 1 / benchmark["NsPerOp"] * 1e9 + + name = benchmark["Name"].removeprefix('Benchmark') + ncpu = name[name.rfind('-')+1:] + pieces = name[:-(len(ncpu)+1)].split('/') + + parsed = BenchmarkResult( + run_id=run_id, + batch_id=batch_id, + stats={ + "data": [data], + "unit": "b/s", + "times": [time], + "time_unit": "i/s", + "iterations": benchmark["Runs"], + }, + context={ + "benchmark_language": "Go", + "goos": suite["Goos"], + "goarch": suite["Goarch"], + }, + tags={ + "pkg": pkg, + "num_cpu": ncpu, + "name": pieces[0], + "params": '/'.join(pieces[1:]), + }, + run_reason=run_reason, + github=github, + ) + parsed.run_name = f"{parsed.run_reason}: {parsed.github['commit']}" + parsed_results.append(parsed) + + return parsed_results + + +if __name__ == "__main__": + go_adapter = GoAdapter(result_fields_override={"info":{}}) + go_adapter() diff --git a/ci/scripts/go_build.sh b/ci/scripts/go_build.sh index 20879cc0e70..c113bbd320e 100755 --- a/ci/scripts/go_build.sh +++ b/ci/scripts/go_build.sh @@ -22,7 +22,7 @@ set -ex source_dir=${1}/go ARCH=`uname -m` -# Arm64 CI is triggered by travis and run in arm64v8/golang:1.16-bullseye +# Arm64 CI is triggered by travis and run in arm64v8/golang:1.17-bullseye if [ "aarch64" == "$ARCH" ]; then # Install `staticcheck` GO111MODULE=on go install honnef.co/go/tools/cmd/staticcheck@v0.2.2 @@ -36,17 +36,15 @@ if [[ -n "${ARROW_GO_TESTCGO}" ]]; then go clean -cache go clean -testcache fi - TAGS="-tags assert,test,ccalloc" + TAGS="-tags assert,test,ccalloc" fi -go get -d -t -v ./... go install $TAGS -v ./... popd pushd ${source_dir}/parquet -go get -d -t -v ./... go install -v ./... 
popd diff --git a/ci/scripts/go_test.sh b/ci/scripts/go_test.sh index 760aa149aa9..e0a632dc069 100755 --- a/ci/scripts/go_test.sh +++ b/ci/scripts/go_test.sh @@ -19,10 +19,24 @@ set -ex +# simplistic semver comparison +verlte() { + [ "$1" = "`echo -e "$1\n$2" | sort -V | head -n1`" ] +} +verlt() { + [ "$1" = "$2" ] && return 1 || verlte $1 $2 +} + +ver=`go env GOVERSION` + source_dir=${1}/go -# when we upgrade to at least go1.18, we can add the new -asan option here testargs="-race" +if verlte "1.18" "${ver#go}" && [ "$(go env GOOS)" != "darwin" ]; then + # asan not supported on darwin/amd64 + testargs="-asan" +fi + case "$(uname)" in MINGW*) # -asan and -race don't work on windows currently @@ -45,9 +59,9 @@ fi pushd ${source_dir}/arrow TAGS="assert,test" -if [[ -n "${ARROW_GO_TESTCGO}" ]]; then +if [[ -n "${ARROW_GO_TESTCGO}" ]]; then if [[ "${MSYSTEM}" = "MINGW64" ]]; then - export PATH=${MINGW_PREFIX}/bin:$PATH + export PATH=${MINGW_PREFIX}\\bin:${MINGW_PREFIX}\\lib:$PATH fi TAGS="${TAGS},ccalloc" fi @@ -57,18 +71,19 @@ fi # tag in order to run its tests so that the testing functions implemented # in .c files don't get included in non-test builds. -for d in $(go list ./... | grep -v vendor); do - go test $testargs -tags $TAGS $d -done +go test $testargs -tags $TAGS ./... + +# only test compute when Go is >= 1.18 +if verlte "1.18" "${ver#go}"; then + go test $testargs -tags $TAGS ./compute/... +fi popd export PARQUET_TEST_DATA=${1}/cpp/submodules/parquet-testing/data - +export ARROW_TEST_DATA=${1}/testing/data pushd ${source_dir}/parquet -for d in $(go list ./... | grep -v vendor); do - go test $testargs -tags assert $d -done +go test $testargs -tags assert ./... popd diff --git a/ci/scripts/install_ccache.sh b/ci/scripts/install_ccache.sh index 8c64fe56c41..7d39e18ebe5 100755 --- a/ci/scripts/install_ccache.sh +++ b/ci/scripts/install_ccache.sh @@ -26,20 +26,33 @@ fi version=$1 prefix=$2 -url="https://github.com/ccache/ccache/archive/v${version}.tar.gz" -mkdir /tmp/ccache -wget -q ${url} -O - | tar -xzf - --directory /tmp/ccache --strip-components=1 +mkdir -p /tmp/ccache +case $(uname) in + MINGW64*) + url="https://github.com/ccache/ccache/releases/download/v${version}/ccache-${version}-windows-x86_64.zip" + pushd /tmp/ccache + curl --fail --location --remote-name ${url} + unzip -j ccache-${version}-windows-x86_64.zip + chmod +x ccache.exe + mv ccache.exe ${prefix}/bin/ + popd + ;; + *) + url="https://github.com/ccache/ccache/archive/v${version}.tar.gz" -mkdir /tmp/ccache/build -pushd /tmp/ccache/build -cmake \ - -GNinja \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=${prefix} \ - -DZSTD_FROM_INTERNET=ON \ - .. -ninja install -popd + wget -q ${url} -O - | tar -xzf - --directory /tmp/ccache --strip-components=1 + mkdir /tmp/ccache/build + pushd /tmp/ccache/build + cmake \ + -GNinja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=${prefix} \ + -DZSTD_FROM_INTERNET=ON \ + .. 
+ ninja install + popd + ;; +esac rm -rf /tmp/ccache diff --git a/ci/scripts/install_dask.sh b/ci/scripts/install_dask.sh index eb9c4e3dd42..8d712a88a6a 100755 --- a/ci/scripts/install_dask.sh +++ b/ci/scripts/install_dask.sh @@ -26,7 +26,7 @@ fi dask=$1 -if [ "${dask}" = "master" ]; then +if [ "${dask}" = "upstream_devel" ]; then pip install https://github.com/dask/dask/archive/main.tar.gz#egg=dask[dataframe] elif [ "${dask}" = "latest" ]; then pip install dask[dataframe] diff --git a/ci/scripts/install_gcs_testbench.sh b/ci/scripts/install_gcs_testbench.sh index f7fa6e611d4..0109ea607ff 100755 --- a/ci/scripts/install_gcs_testbench.sh +++ b/ci/scripts/install_gcs_testbench.sh @@ -34,24 +34,9 @@ case "$(uname -m)" in ;; esac -case "$(uname -s)-$(uname -m)" in - Darwin-arm64) - # Workaround for https://github.com/grpc/grpc/issues/28387 . - # Build grpcio instead of using wheel. - # storage-testbench 0.16.0 pins grpcio to 1.44.0. - ${PYTHON:-python3} -m pip install --no-binary :all: "grpcio==1.44.0" - ;; - *_NT-*) - # Mingw-w64: MSYS_NT-10.0-19043, MINGW32_NT-10.0-19043, MINGW64_NT-10.0-19043 - # Don't use the "/MT" option because g++ doesn't recognize it. - # "/MT" is for Visual Studio. - GRPC_PYTHON_CFLAGS=" " ${PYTHON:-python3} -m pip install "grpcio==1.44.0" - ;; -esac - version=$1 if [[ "${version}" -eq "default" ]]; then - version="v0.16.0" + version="v0.32.0" fi ${PYTHON:-python3} -m pip install \ diff --git a/ci/scripts/install_turbodbc.sh b/ci/scripts/install_numba.sh similarity index 64% rename from ci/scripts/install_turbodbc.sh rename to ci/scripts/install_numba.sh index e9ac26c2cc8..470f291ba80 100755 --- a/ci/scripts/install_turbodbc.sh +++ b/ci/scripts/install_numba.sh @@ -19,23 +19,17 @@ set -e -if [ "$#" -ne 2 ]; then - echo "Usage: $0 " +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " exit 1 fi -turbodbc=$1 -target=$2 +numba=$1 -git clone --recurse-submodules https://github.com/blue-yonder/turbodbc "${target}" -if [ "${turbodbc}" = "latest" ]; then - git -C "${target}" checkout $(git describe --tags); +if [ "${numba}" = "master" ]; then + pip install https://github.com/numba/numba/archive/main.tar.gz#egg=numba +elif [ "${numba}" = "latest" ]; then + pip install numba else - git -C "${target}" checkout ${turbodbc}; + pip install numba==${numba} fi - -pushd ${target} -wget -q https://github.com/pybind/pybind11/archive/v2.6.2.tar.gz -tar xvf v2.6.2.tar.gz -mv pybind11-2.6.2 pybind11 -popd diff --git a/ci/scripts/install_osx_sdk.sh b/ci/scripts/install_osx_sdk.sh deleted file mode 100755 index 896d084e0b9..00000000000 --- a/ci/scripts/install_osx_sdk.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
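The new install_numba.sh above follows the same convention as the other dependency installers: a literal `master` builds from the upstream main branch tarball, `latest` takes whatever PyPI serves, and anything else is treated as a pinned version. Hypothetical invocations (not taken from any docker-compose service) would look like:

ci/scripts/install_numba.sh latest    # pip install numba
ci/scripts/install_numba.sh master    # pip install from the numba main-branch tarball
ci/scripts/install_numba.sh 0.56.4    # assumed example pin: pip install numba==0.56.4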
- -set -ex - -if [ ${using_homebrew} != "yes" ]; then - export MACOSX_DEPLOYMENT_TARGET="10.9" - export CONDA_BUILD_SYSROOT="$(xcode-select -p)/Platforms/MacOSX.platform/Developer/SDKs/MacOSX${MACOSX_DEPLOYMENT_TARGET}.sdk" - - if [[ ! -d ${CONDA_BUILD_SYSROOT} || "$OSX_FORCE_SDK_DOWNLOAD" == "1" ]]; then - echo "downloading ${macosx_deployment_target} sdk" - curl -L -O https://github.com/phracker/MacOSX-SDKs/releases/download/10.13/MacOSX${MACOSX_DEPLOYMENT_TARGET}.sdk.tar.xz - tar -xf MacOSX${MACOSX_DEPLOYMENT_TARGET}.sdk.tar.xz -C "$(dirname "$CONDA_BUILD_SYSROOT")" - # set minimum sdk version to our target - plutil -replace MinimumSDKVersion -string ${MACOSX_DEPLOYMENT_TARGET} $(xcode-select -p)/Platforms/MacOSX.platform/Info.plist - plutil -replace DTSDKName -string macosx${MACOSX_DEPLOYMENT_TARGET}internal $(xcode-select -p)/Platforms/MacOSX.platform/Info.plist - fi - - if [ -d "${CONDA_BUILD_SYSROOT}" ]; then - echo "Found CONDA_BUILD_SYSROOT: ${CONDA_BUILD_SYSROOT}" - else - echo "Missing CONDA_BUILD_SYSROOT: ${CONDA_BUILD_SYSROOT}" - exit 1 - fi -fi diff --git a/ci/scripts/install_pandas.sh b/ci/scripts/install_pandas.sh index 5aca65f825a..f0cb76fb663 100755 --- a/ci/scripts/install_pandas.sh +++ b/ci/scripts/install_pandas.sh @@ -35,8 +35,8 @@ else pip install numpy==${numpy} fi -if [ "${pandas}" = "master" ]; then - pip install git+https://github.com/pandas-dev/pandas.git --no-build-isolation +if [ "${pandas}" = "upstream_devel" ]; then + pip install git+https://github.com/pandas-dev/pandas.git elif [ "${pandas}" = "nightly" ]; then pip install --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple --pre pandas elif [ "${pandas}" = "latest" ]; then diff --git a/ci/scripts/install_python.sh b/ci/scripts/install_python.sh index 7a18cd83243..d64318751c9 100755 --- a/ci/scripts/install_python.sh +++ b/ci/scripts/install_python.sh @@ -27,8 +27,9 @@ platforms=([windows]=Windows declare -A versions versions=([3.7]=3.7.9 [3.8]=3.8.10 - [3.9]=3.9.9 - [3.10]=3.10.1) + [3.9]=3.9.13 + [3.10]=3.10.8 + [3.11]=3.11.0) if [ "$#" -ne 2 ]; then echo "Usage: $0 " @@ -45,7 +46,7 @@ full_version=${versions[$2]} if [ $platform = "MacOSX" ]; then echo "Downloading Python installer..." - if [ "$(uname -m)" = "arm64" ] || [ "$version" = "3.10" ]; then + if [ "$(uname -m)" = "arm64" ] || [ "$version" = "3.10" ] || [ "$version" = "3.11" ]; then fname="python-${full_version}-macos11.pkg" else fname="python-${full_version}-macosx10.9.pkg" diff --git a/ci/scripts/install_sccache.sh b/ci/scripts/install_sccache.sh new file mode 100755 index 00000000000..2ee3486699a --- /dev/null +++ b/ci/scripts/install_sccache.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
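For context on the install_python.sh change above: the script keeps an associative array mapping a short Python version to the exact patch release to download, and arm64 plus 3.10/3.11 switch to the macos11 installer name. A compressed sketch of that lookup, under the assumption that the version table keeps this shape:

declare -A versions=([3.8]=3.8.10 [3.9]=3.9.13 [3.10]=3.10.8 [3.11]=3.11.0)
version=3.11
full_version=${versions[$version]}
if [ "$(uname -m)" = "arm64" ] || [ "$version" = "3.10" ] || [ "$version" = "3.11" ]; then
  fname="python-${full_version}-macos11.pkg"      # universal installer for newer targets
else
  fname="python-${full_version}-macosx10.9.pkg"   # legacy Intel-only installer
fi
echo "would download ${fname}"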
+ +set -e + +if [ "$#" -lt 1 -o "$#" -gt 3 ]; then + echo "Usage: $0 " + echo "Will default to version=0.3.0 " + exit 1 +fi + +BUILD=$1 +PREFIX=$2 +VERSION=${3:-0.3.0} +ARCH=$(uname -m) + +if [ "${ARCH}" != x86_64 ] && [ "${ARCH}" != aarch64 ]; then + echo "Skipped sccache installation on unsupported arch: ${ARCH}" + exit 0 +fi + +SCCACHE_URL="https://github.com/mozilla/sccache/releases/download/v$VERSION/sccache-v$VERSION-$ARCH-$BUILD.tar.gz" +SCCACHE_ARCHIVE=sccache.tar.gz + +# Download archive and checksum +curl -L $SCCACHE_URL --output $SCCACHE_ARCHIVE +curl -L $SCCACHE_URL.sha256 --output $SCCACHE_ARCHIVE.sha256 +echo " $SCCACHE_ARCHIVE" >> $SCCACHE_ARCHIVE.sha256 + +SHA_ARGS="--check --status" + +# Busybox sha256sum uses different flags +if sha256sum --version 2>&1 | grep -q BusyBox; then + SHA_ARGS="-sc" +fi + +sha256sum $SHA_ARGS $SCCACHE_ARCHIVE.sha256 + +if [ ! -d $PREFIX ]; then + mkdir -p $PREFIX +fi + +# Extract only the sccache binary into $PREFIX and ignore README and LCIENSE. +# --wildcards doesn't work on busybox. +tar -xzvf $SCCACHE_ARCHIVE --strip-component=1 --directory $PREFIX --exclude="sccache*/*E*E*" +chmod u+x $PREFIX/sccache + +if [ "${GITHUB_ACTIONS}" = "true" ]; then + echo "$PREFIX" >> $GITHUB_PATH + # Add executable for windows as mingw workaround. + echo "SCCACHE_PATH=$PREFIX/sccache.exe" >> $GITHUB_ENV +fi diff --git a/ci/scripts/integration_dask.sh b/ci/scripts/integration_dask.sh index eeaba715b6a..02977585910 100755 --- a/ci/scripts/integration_dask.sh +++ b/ci/scripts/integration_dask.sh @@ -33,6 +33,7 @@ python -c "import dask.dataframe" pytest -v --pyargs dask.dataframe.tests.test_dataframe pytest -v --pyargs dask.dataframe.io.tests.test_orc -pytest -v --pyargs dask.dataframe.io.tests.test_parquet +# skip test until new fsspec release is out (https://github.com/fsspec/filesystem_spec/pull/1139) +pytest -v --pyargs dask.dataframe.io.tests.test_parquet -k "not test_pyarrow_filesystem_option" # this file contains parquet tests that use S3 filesystem pytest -v --pyargs dask.bytes.tests.test_s3 diff --git a/ci/scripts/integration_spark.sh b/ci/scripts/integration_spark.sh index 90ecbce397b..6e20e770329 100755 --- a/ci/scripts/integration_spark.sh +++ b/ci/scripts/integration_spark.sh @@ -30,7 +30,7 @@ spark_version=${SPARK_VERSION:-master} # Use old behavior that always dropped tiemzones. export PYARROW_IGNORE_TIMEZONE=1 -if [ "${SPARK_VERSION:0:2}" == "2." ]; then +if [ "${SPARK_VERSION:1:2}" == "2." 
]; then # https://github.com/apache/spark/blob/master/docs/sql-pyspark-pandas-with-arrow.md#compatibility-setting-for-pyarrow--0150-and-spark-23x-24x export ARROW_PRE_0_15_IPC_FORMAT=1 fi @@ -73,14 +73,35 @@ pushd ${spark_dir} # Run pyarrow related Python tests only spark_python_tests=( - "pyspark.sql.tests.test_arrow" - "pyspark.sql.tests.test_pandas_map" - "pyspark.sql.tests.test_pandas_cogrouped_map" - "pyspark.sql.tests.test_pandas_grouped_map" - "pyspark.sql.tests.test_pandas_udf" - "pyspark.sql.tests.test_pandas_udf_scalar" - "pyspark.sql.tests.test_pandas_udf_grouped_agg" - "pyspark.sql.tests.test_pandas_udf_window") + "pyspark.sql.tests.test_arrow") + + case "${SPARK_VERSION}" in + v1.*|v2.*|v3.0.*|v3.1.*|v3.2.*|v3.3.*) + old_test_modules=true + ;; + *) + old_test_modules=false + ;; + esac + if [ "${old_test_modules}" == "true" ]; then + spark_python_tests+=( + "pyspark.sql.tests.test_pandas_grouped_map" + "pyspark.sql.tests.test_pandas_map" + "pyspark.sql.tests.test_pandas_cogrouped_map" + "pyspark.sql.tests.test_pandas_udf" + "pyspark.sql.tests.test_pandas_udf_scalar" + "pyspark.sql.tests.test_pandas_udf_grouped_agg" + "pyspark.sql.tests.test_pandas_udf_window") + else + spark_python_tests+=( + "pyspark.sql.tests.pandas.test_pandas_grouped_map" + "pyspark.sql.tests.pandas.test_pandas_map" + "pyspark.sql.tests.pandas.test_pandas_cogrouped_map" + "pyspark.sql.tests.pandas.test_pandas_udf" + "pyspark.sql.tests.pandas.test_pandas_udf_scalar" + "pyspark.sql.tests.pandas.test_pandas_udf_grouped_agg" + "pyspark.sql.tests.pandas.test_pandas_udf_window") + fi (echo "Testing PySpark:"; IFS=$'\n'; echo "${spark_python_tests[*]}") python/run-tests --testnames "$(IFS=,; echo "${spark_python_tests[*]}")" --python-executables python diff --git a/ci/scripts/java_build.sh b/ci/scripts/java_build.sh index b0362868b0a..3ea6fe28d63 100755 --- a/ci/scripts/java_build.sh +++ b/ci/scripts/java_build.sh @@ -18,10 +18,13 @@ set -ex +if [[ "${ARROW_JAVA_BUILD:-ON}" != "ON" ]]; then + exit +fi + arrow_dir=${1} source_dir=${1}/java build_dir=${2} -cpp_build_dir=${build_dir}/cpp/${ARROW_BUILD_TYPE:-debug} java_jni_dist_dir=${3} : ${BUILD_DOCS_JAVA:=OFF} @@ -64,6 +67,11 @@ if [[ "$(uname -s)" == "Linux" ]] && [[ "$(uname -m)" == "s390x" ]]; then fi mvn="mvn -B -DskipTests -Drat.skip=true -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn" + +if [ $ARROW_JAVA_SKIP_GIT_PLUGIN ]; then + mvn="${mvn} -Dmaven.gitcommitid.skip=true" +fi + # Use `2 * ncores` threads mvn="${mvn} -T 2C" @@ -79,13 +87,13 @@ if [ "${ARROW_JAVA_CDATA}" = "ON" ]; then ${mvn} -Darrow.c.jni.dist.dir=${java_jni_dist_dir} -Parrow-c-data install fi -if [ "${ARROW_GANDIVA_JAVA}" = "ON" ]; then - ${mvn} -Darrow.cpp.build.dir=${cpp_build_dir} -Parrow-jni install +if [ "${ARROW_JAVA_JNI}" = "ON" ]; then + ${mvn} -Darrow.cpp.build.dir=${java_jni_dist_dir} -Parrow-jni install fi if [ "${ARROW_PLASMA}" = "ON" ]; then pushd ${source_dir}/plasma - ${mvn} clean install + ${mvn} -Darrow.cpp.build.dir=${java_jni_dist_dir} clean install popd fi diff --git a/ci/scripts/java_full_build.sh b/ci/scripts/java_full_build.sh index 1c07971bcc6..2734f3e9dbe 100755 --- a/ci/scripts/java_full_build.sh +++ b/ci/scripts/java_full_build.sh @@ -65,7 +65,13 @@ find . 
\ -exec echo {} ";" \ -exec cp {} $dist_dir ";" find ~/.m2/repository/org/apache/arrow \ - "(" -name "*.jar" -o -name "*.zip" -o -name "*.pom" ")" \ + "(" \ + -name "*.jar" -o \ + -name "*.json" -o \ + -name "*.pom" -o \ + -name "*.xml" -o \ + -name "*.zip" \ + ")" \ -exec echo {} ";" \ -exec cp {} $dist_dir ";" diff --git a/ci/scripts/java_gandiva_build.sh b/ci/scripts/java_gandiva_build.sh new file mode 100644 index 00000000000..9cbcc9e68b0 --- /dev/null +++ b/ci/scripts/java_gandiva_build.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +arrow_dir=${1} +dist_dir=${2} + +export ARROW_TEST_DATA=${arrow_dir}/testing/data + +pushd ${arrow_dir}/java + +# Ensure that there is no old jar +# inside the maven repository +maven_repo=~/.m2/repository/org/apache/arrow +if [ -d $maven_repo ]; then + find $maven_repo \ + "(" -name "*.jar" -o -name "*.zip" -o -name "*.pom" ")" \ + -exec echo {} ";" \ + -exec rm -rf {} ";" +fi + +# generate dummy GPG key for -Papache-release. +# -Papache-release generates signs (*.asc) of artifacts. +# We don't use these signs in our release process. +(echo "Key-Type: RSA"; \ + echo "Key-Length: 4096"; \ + echo "Name-Real: Build"; \ + echo "Name-Email: build@example.com"; \ + echo "%no-protection") | \ + gpg --gen-key --batch + +# build the entire project +mvn clean \ + install \ + assembly:single \ + source:jar \ + javadoc:jar \ + -DskipTests \ + -Papache-release \ + -Parrow-c-data \ + -Parrow-jni \ + -Darrow.cpp.build.dir=$dist_dir \ + -Darrow.c.jni.dist.dir=$dist_dir \ + -DdescriptorId=source-release \ + --projects gandiva \ + --also-make + +# copy all jar, zip and pom files to the distribution folder +find . 
\ + "(" -name "*-javadoc.jar" -o -name "*-sources.jar" ")" \ + -exec echo {} ";" \ + -exec cp {} $dist_dir ";" +find ~/.m2/repository/org/apache/arrow \ + "(" -name "*.jar" -o -name "*.zip" -o -name "*.pom" ")" \ + -exec echo {} ";" \ + -exec cp {} $dist_dir ";" + +popd diff --git a/ci/scripts/java_jni_build.sh b/ci/scripts/java_jni_build.sh index 0f19e614133..53838ba77c6 100755 --- a/ci/scripts/java_jni_build.sh +++ b/ci/scripts/java_jni_build.sh @@ -20,9 +20,12 @@ set -ex arrow_dir=${1} -build_dir=${2}/java_jni +arrow_install_dir=${2} +build_dir=${3}/java_jni # The directory where the final binaries will be stored when scripts finish -dist_dir=${3} +dist_dir=${4} + +prefix_dir="${build_dir}/java-jni" echo "=== Clear output directories and leftovers ===" # Clear output directories and leftovers @@ -32,11 +35,49 @@ echo "=== Building Arrow Java C Data Interface native library ===" mkdir -p "${build_dir}" pushd "${build_dir}" +case "$(uname)" in + Linux) + n_jobs=$(nproc) + ;; + Darwin) + n_jobs=$(sysctl -n hw.ncpu) + ;; + *) + n_jobs=${NPROC:-1} + ;; +esac + +: ${ARROW_JAVA_BUILD_TESTS:=${ARROW_BUILD_TESTS:-OFF}} +: ${CMAKE_BUILD_TYPE:=release} cmake \ - -DCMAKE_BUILD_TYPE=${ARROW_BUILD_TYPE:-release} \ - -DCMAKE_INSTALL_PREFIX=${dist_dir} \ + -DARROW_JAVA_JNI_ENABLE_DATASET=${ARROW_DATASET:-OFF} \ + -DARROW_JAVA_JNI_ENABLE_GANDIVA=${ARROW_GANDIVA:-OFF} \ + -DARROW_JAVA_JNI_ENABLE_ORC=${ARROW_ORC:-OFF} \ + -DARROW_JAVA_JNI_ENABLE_PLASMA=${ARROW_PLASMA:-OFF} \ + -DBUILD_TESTING=${ARROW_JAVA_BUILD_TESTS} \ + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ + -DCMAKE_PREFIX_PATH=${arrow_install_dir} \ + -DCMAKE_INSTALL_LIBDIR=lib \ + -DCMAKE_INSTALL_PREFIX=${prefix_dir} \ -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD:-OFF} \ + -GNinja \ ${JAVA_JNI_CMAKE_ARGS:-} \ ${arrow_dir}/java -cmake --build . --target install --config ${ARROW_BUILD_TYPE:-release} +export CMAKE_BUILD_PARALLEL_LEVEL=${n_jobs} +cmake --build . --config ${CMAKE_BUILD_TYPE} +if [ "${ARROW_JAVA_BUILD_TESTS}" = "ON" ]; then + ctest \ + --output-on-failure \ + --parallel ${n_jobs} \ + --timeout 300 +fi +cmake --build . --config ${CMAKE_BUILD_TYPE} --target install popd + +mkdir -p ${dist_dir} +# For Windows. *.dll are installed into bin/ on Windows. 
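The java_jni_build.sh rework above derives a portable job count before driving CMake; exporting CMAKE_BUILD_PARALLEL_LEVEL lets `cmake --build` fan out without generator-specific -j flags. A minimal standalone sketch of the same pattern, run from an already-configured build directory (the directory and build type are placeholders):

case "$(uname)" in
  Linux)  n_jobs=$(nproc) ;;
  Darwin) n_jobs=$(sysctl -n hw.ncpu) ;;
  *)      n_jobs=${NPROC:-1} ;;
esac
export CMAKE_BUILD_PARALLEL_LEVEL=${n_jobs}
cmake --build . --config release                              # parallelism picked up from the environment
ctest --output-on-failure --parallel ${n_jobs} --timeout 300  # mirror the script's test step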
+if [ -d "${prefix_dir}/bin" ]; then + mv ${prefix_dir}/bin/* ${dist_dir}/ +else + mv ${prefix_dir}/lib/* ${dist_dir}/ +fi diff --git a/ci/scripts/java_jni_macos_build.sh b/ci/scripts/java_jni_macos_build.sh index 590c469e398..187de0c6037 100755 --- a/ci/scripts/java_jni_macos_build.sh +++ b/ci/scripts/java_jni_macos_build.sh @@ -21,28 +21,44 @@ set -ex arrow_dir=${1} build_dir=${2} +normalized_arch=$(arch) +case ${normalized_arch} in + arm64) + normalized_arch=aarch_64 + ;; + i386) + normalized_arch=x86_64 + ;; +esac # The directory where the final binaries will be stored when scripts finish -dist_dir=${3} +dist_dir=${3}/${normalized_arch} echo "=== Clear output directories and leftovers ===" # Clear output directories and leftovers rm -rf ${build_dir} echo "=== Building Arrow C++ libraries ===" -: ${ARROW_BUILD_TESTS:=OFF} +install_dir=${build_dir}/cpp-install +: ${ARROW_BUILD_TESTS:=ON} : ${ARROW_DATASET:=ON} -: ${ARROW_FILESYSTEM:=ON} -: ${ARROW_GANDIVA_JAVA:=ON} +export ARROW_DATASET : ${ARROW_GANDIVA:=ON} +export ARROW_GANDIVA : ${ARROW_ORC:=ON} +export ARROW_ORC : ${ARROW_PARQUET:=ON} -: ${ARROW_PLASMA_JAVA_CLIENT:=ON} : ${ARROW_PLASMA:=ON} -: ${ARROW_PYTHON:=OFF} +export ARROW_PLASMA : ${ARROW_S3:=ON} +: ${ARROW_USE_CCACHE:=OFF} : ${CMAKE_BUILD_TYPE:=Release} : ${CMAKE_UNITY_BUILD:=ON} +if [ "${ARROW_USE_CCACHE}" == "ON" ]; then + echo "=== ccache statistics before build ===" + ccache -s +fi + export ARROW_TEST_DATA="${arrow_dir}/testing/data" export PARQUET_TEST_DATA="${arrow_dir}/cpp/submodules/parquet-testing/data" export AWS_EC2_METADATA_DISABLED=TRUE @@ -51,37 +67,24 @@ mkdir -p "${build_dir}/cpp" pushd "${build_dir}/cpp" cmake \ - -DARROW_BOOST_USE_SHARED=OFF \ - -DARROW_BROTLI_USE_SHARED=OFF \ + -DARROW_BUILD_SHARED=OFF \ -DARROW_BUILD_TESTS=${ARROW_BUILD_TESTS} \ - -DARROW_BUILD_UTILITIES=OFF \ - -DARROW_BZ2_USE_SHARED=OFF \ + -DARROW_CSV=${ARROW_DATASET} \ -DARROW_DATASET=${ARROW_DATASET} \ - -DARROW_FILESYSTEM=${ARROW_FILESYSTEM} \ + -DARROW_DEPENDENCY_USE_SHARED=OFF \ -DARROW_GANDIVA=${ARROW_GANDIVA} \ - -DARROW_GANDIVA_JAVA=${ARROW_GANDIVA_JAVA} \ -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON \ - -DARROW_GFLAGS_USE_SHARED=OFF \ - -DARROW_GRPC_USE_SHARED=OFF \ - -DARROW_JNI=ON \ - -DARROW_LZ4_USE_SHARED=OFF \ - -DARROW_OPENSSL_USE_SHARED=OFF \ -DARROW_ORC=${ARROW_ORC} \ -DARROW_PARQUET=${ARROW_PARQUET} \ -DARROW_PLASMA=${ARROW_PLASMA} \ - -DARROW_PLASMA_JAVA_CLIENT=${ARROW_PLASMA_JAVA_CLIENT} \ - -DARROW_PROTOBUF_USE_SHARED=OFF \ - -DARROW_PYTHON=${ARROW_PYTHON} \ -DARROW_S3=${ARROW_S3} \ - -DARROW_SNAPPY_USE_SHARED=OFF \ - -DARROW_THRIFT_USE_SHARED=OFF \ - -DARROW_UTF8PROC_USE_SHARED=OFF \ - -DARROW_ZSTD_USE_SHARED=OFF \ + -DARROW_USE_CCACHE=${ARROW_USE_CCACHE} \ -DAWSSDK_SOURCE=BUNDLED \ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ -DCMAKE_INSTALL_LIBDIR=lib \ - -DCMAKE_INSTALL_PREFIX=${build_dir}/cpp \ + -DCMAKE_INSTALL_PREFIX=${install_dir} \ -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ + -DGTest_SOURCE=BUNDLED \ -DPARQUET_BUILD_EXAMPLES=OFF \ -DPARQUET_BUILD_EXECUTABLES=OFF \ -DPARQUET_REQUIRE_ENCRYPTION=OFF \ @@ -91,7 +94,16 @@ cmake \ cmake --build . 
--target install if [ "${ARROW_BUILD_TESTS}" == "ON" ]; then - ctest + # MinIO is required + exclude_tests="arrow-s3fs-test" + # unstable + exclude_tests="${exclude_tests}|arrow-compute-hash-join-node-test" + ctest \ + --exclude-regex "${exclude_tests}" \ + --label-regex unittest \ + --output-on-failure \ + --parallel $(sysctl -n hw.ncpu) \ + --timeout 300 fi popd @@ -99,18 +111,17 @@ popd ${arrow_dir}/ci/scripts/java_jni_build.sh \ ${arrow_dir} \ + ${install_dir} \ ${build_dir} \ ${dist_dir} +if [ "${ARROW_USE_CCACHE}" == "ON" ]; then + echo "=== ccache statistics after build ===" + ccache -s +fi -echo "=== Copying libraries to the distribution folder ===" -mkdir -p "${dist_dir}" -cp -L ${build_dir}/cpp/lib/libgandiva_jni.dylib ${dist_dir} -cp -L ${build_dir}/cpp/lib/libarrow_dataset_jni.dylib ${dist_dir} -cp -L ${build_dir}/cpp/lib/libarrow_orc_jni.dylib ${dist_dir} echo "=== Checking shared dependencies for libraries ===" - pushd ${dist_dir} archery linking check-dependencies \ --allow CoreFoundation \ @@ -122,9 +133,13 @@ archery linking check-dependencies \ --allow libcurl \ --allow libgandiva_jni \ --allow libncurses \ + --allow libobjc \ + --allow libplasma_java \ --allow libz \ + --allow libzstd \ libarrow_cdata_jni.dylib \ libarrow_dataset_jni.dylib \ libarrow_orc_jni.dylib \ - libgandiva_jni.dylib + libgandiva_jni.dylib \ + libplasma_java.dylib popd diff --git a/ci/scripts/java_jni_manylinux_aarch64_build.sh b/ci/scripts/java_jni_manylinux_aarch64_build.sh new file mode 100644 index 00000000000..b5c28f42685 --- /dev/null +++ b/ci/scripts/java_jni_manylinux_aarch64_build.sh @@ -0,0 +1,133 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
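Before the aarch64 variant of this script below, a note on the test-filtering idiom introduced above and reused in the manylinux and Windows scripts: exclusions are accumulated as a single alternation regex and handed to ctest, so adding one more unstable suite is a one-line append. A sketch with one hypothetical extra entry:

exclude_tests="arrow-s3fs-test"                                    # needs a running MinIO
exclude_tests="${exclude_tests}|arrow-compute-hash-join-node-test" # unstable
exclude_tests="${exclude_tests}|arrow-example-flaky-test"          # hypothetical extra exclusion
ctest \
  --exclude-regex "${exclude_tests}" \
  --label-regex unittest \
  --output-on-failure \
  --parallel $(sysctl -n hw.ncpu) \
  --timeout 300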
+ +set -ex + +arrow_dir=${1} +build_dir=${2} +# The directory where the final binaries will be stored when scripts finish +dist_dir=${3}/$(arch) + +echo "=== Clear output directories and leftovers ===" +# Clear output directories and leftovers +rm -rf ${build_dir} + +echo "=== Building Arrow C++ libraries ===" +devtoolset_version=$(rpm -qa "devtoolset-*-gcc" --queryformat %{VERSION} | \ + grep -o "^[0-9]*") +devtoolset_include_cpp="/opt/rh/devtoolset-${devtoolset_version}/root/usr/include/c++/${devtoolset_version}" +: ${ARROW_DATASET:=ON} +: ${ARROW_GANDIVA:=ON} +: ${ARROW_GANDIVA_JAVA:=ON} +: ${ARROW_FILESYSTEM:=ON} +: ${ARROW_JEMALLOC:=ON} +: ${ARROW_RPATH_ORIGIN:=ON} +: ${ARROW_ORC:=ON} +: ${ARROW_PARQUET:=ON} +: ${ARROW_PLASMA:=ON} +: ${ARROW_PLASMA_JAVA_CLIENT:=ON} +: ${ARROW_PYTHON:=OFF} +: ${ARROW_S3:=ON} +: ${ARROW_BUILD_TESTS:=OFF} +: ${CMAKE_BUILD_TYPE:=Release} +: ${CMAKE_UNITY_BUILD:=ON} +: ${VCPKG_ROOT:=/opt/vcpkg} +: ${VCPKG_FEATURE_FLAGS:=-manifests} +: ${VCPKG_TARGET_TRIPLET:=${VCPKG_DEFAULT_TRIPLET:-x64-linux-static-${CMAKE_BUILD_TYPE}}} +: ${GANDIVA_CXX_FLAGS:=-isystem;${devtoolset_include_cpp};-isystem;${devtoolset_include_cpp}/aarch64-redhat-linux;-isystem;-lpthread} + +export ARROW_TEST_DATA="${arrow_dir}/testing/data" +export PARQUET_TEST_DATA="${arrow_dir}/cpp/submodules/parquet-testing/data" +export AWS_EC2_METADATA_DISABLED=TRUE + +mkdir -p "${build_dir}/cpp" +pushd "${build_dir}/cpp" + +cmake \ + -DARROW_BOOST_USE_SHARED=OFF \ + -DARROW_BROTLI_USE_SHARED=OFF \ + -DARROW_BUILD_SHARED=ON \ + -DARROW_BUILD_TESTS=${ARROW_BUILD_TESTS} \ + -DARROW_BUILD_UTILITIES=OFF \ + -DARROW_BZ2_USE_SHARED=OFF \ + -DARROW_DATASET=${ARROW_DATASET} \ + -DARROW_DEPENDENCY_SOURCE="VCPKG" \ + -DARROW_FILESYSTEM=${ARROW_FILESYSTEM} \ + -DARROW_GANDIVA_JAVA=${ARROW_GANDIVA_JAVA} \ + -DARROW_GANDIVA_PC_CXX_FLAGS=${GANDIVA_CXX_FLAGS} \ + -DARROW_GANDIVA=${ARROW_GANDIVA} \ + -DARROW_GRPC_USE_SHARED=OFF \ + -DARROW_JEMALLOC=${ARROW_JEMALLOC} \ + -DARROW_JNI=ON \ + -DARROW_LZ4_USE_SHARED=OFF \ + -DARROW_OPENSSL_USE_SHARED=OFF \ + -DARROW_ORC=${ARROW_ORC} \ + -DARROW_PARQUET=${ARROW_PARQUET} \ + -DARROW_PLASMA_JAVA_CLIENT=${ARROW_PLASMA_JAVA_CLIENT} \ + -DARROW_PLASMA=${ARROW_PLASMA} \ + -DARROW_PROTOBUF_USE_SHARED=OFF \ + -DARROW_PYTHON=${ARROW_PYTHON} \ + -DARROW_RPATH_ORIGIN=${ARROW_RPATH_ORIGIN} \ + -DARROW_S3=${ARROW_S3} \ + -DARROW_SNAPPY_USE_SHARED=OFF \ + -DARROW_THRIFT_USE_SHARED=OFF \ + -DARROW_UTF8PROC_USE_SHARED=OFF \ + -DARROW_ZSTD_USE_SHARED=OFF \ + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ + -DCMAKE_INSTALL_LIBDIR=lib \ + -DCMAKE_INSTALL_PREFIX=${build_dir}/cpp \ + -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ + -DORC_SOURCE=BUNDLED \ + -DORC_PROTOBUF_EXECUTABLE=${VCPKG_ROOT}/installed/${VCPKG_TARGET_TRIPLET}/tools/protobuf/protoc \ + -DPARQUET_BUILD_EXAMPLES=OFF \ + -DPARQUET_BUILD_EXECUTABLES=OFF \ + -DPARQUET_REQUIRE_ENCRYPTION=OFF \ + -DPythonInterp_FIND_VERSION_MAJOR=3 \ + -DPythonInterp_FIND_VERSION=ON \ + -DVCPKG_MANIFEST_MODE=OFF \ + -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \ + -GNinja \ + ${arrow_dir}/cpp +ninja install + +if [ $ARROW_BUILD_TESTS = "ON" ]; then + ctest \ + --label-regex unittest \ + --output-on-failure \ + --parallel $(nproc) \ + --timeout 300 +fi + +popd + + +JAVA_JNI_CMAKE_ARGS="" +JAVA_JNI_CMAKE_ARGS="${JAVA_JNI_CMAKE_ARGS} -DVCPKG_MANIFEST_MODE=OFF" +JAVA_JNI_CMAKE_ARGS="${JAVA_JNI_CMAKE_ARGS} -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET}" +export JAVA_JNI_CMAKE_ARGS +bash ${arrow_dir}/ci/scripts/java_jni_build.sh \ + ${arrow_dir} \ + ${build_dir} 
\ + ${dist_dir} + + +echo "=== Copying libraries to the distribution folder ===" +cp -L ${build_dir}/cpp/lib/libgandiva_jni.so ${dist_dir} +cp -L ${build_dir}/cpp/lib/libarrow_dataset_jni.so ${dist_dir} +cp -L ${build_dir}/cpp/lib/libarrow_orc_jni.so ${dist_dir} diff --git a/ci/scripts/java_jni_manylinux_build.sh b/ci/scripts/java_jni_manylinux_build.sh index 008f19140ee..051ebe41d0d 100755 --- a/ci/scripts/java_jni_manylinux_build.sh +++ b/ci/scripts/java_jni_manylinux_build.sh @@ -22,7 +22,7 @@ set -ex arrow_dir=${1} build_dir=${2} # The directory where the final binaries will be stored when scripts finish -dist_dir=${3} +dist_dir=${3}/$(arch) echo "=== Clear output directories and leftovers ===" # Clear output directories and leftovers @@ -32,26 +32,32 @@ echo "=== Building Arrow C++ libraries ===" devtoolset_version=$(rpm -qa "devtoolset-*-gcc" --queryformat %{VERSION} | \ grep -o "^[0-9]*") devtoolset_include_cpp="/opt/rh/devtoolset-${devtoolset_version}/root/usr/include/c++/${devtoolset_version}" +: ${ARROW_BUILD_TESTS:=ON} : ${ARROW_DATASET:=ON} +export ARROW_DATASET : ${ARROW_GANDIVA:=ON} -: ${ARROW_GANDIVA_JAVA:=ON} -: ${ARROW_FILESYSTEM:=ON} +export ARROW_GANDIVA : ${ARROW_JEMALLOC:=ON} : ${ARROW_RPATH_ORIGIN:=ON} : ${ARROW_ORC:=ON} +export ARROW_ORC : ${ARROW_PARQUET:=ON} : ${ARROW_PLASMA:=ON} -: ${ARROW_PLASMA_JAVA_CLIENT:=ON} -: ${ARROW_PYTHON:=OFF} +export ARROW_PLASMA : ${ARROW_S3:=ON} -: ${ARROW_BUILD_TESTS:=OFF} -: ${CMAKE_BUILD_TYPE:=Release} +: ${ARROW_USE_CCACHE:=OFF} +: ${CMAKE_BUILD_TYPE:=release} : ${CMAKE_UNITY_BUILD:=ON} : ${VCPKG_ROOT:=/opt/vcpkg} : ${VCPKG_FEATURE_FLAGS:=-manifests} : ${VCPKG_TARGET_TRIPLET:=${VCPKG_DEFAULT_TRIPLET:-x64-linux-static-${CMAKE_BUILD_TYPE}}} : ${GANDIVA_CXX_FLAGS:=-isystem;${devtoolset_include_cpp};-isystem;${devtoolset_include_cpp}/x86_64-redhat-linux;-isystem;-lpthread} +if [ "${ARROW_USE_CCACHE}" == "ON" ]; then + echo "=== ccache statistics before build ===" + ccache -s +fi + export ARROW_TEST_DATA="${arrow_dir}/testing/data" export PARQUET_TEST_DATA="${arrow_dir}/cpp/submodules/parquet-testing/data" export AWS_EC2_METADATA_DISABLED=TRUE @@ -60,54 +66,47 @@ mkdir -p "${build_dir}/cpp" pushd "${build_dir}/cpp" cmake \ - -DARROW_BOOST_USE_SHARED=OFF \ - -DARROW_BROTLI_USE_SHARED=OFF \ - -DARROW_BUILD_SHARED=ON \ - -DARROW_BUILD_TESTS=${ARROW_BUILD_TESTS} \ - -DARROW_BUILD_UTILITIES=OFF \ - -DARROW_BZ2_USE_SHARED=OFF \ + -DARROW_BUILD_SHARED=OFF \ + -DARROW_BUILD_TESTS=ON \ + -DARROW_CSV=${ARROW_DATASET} \ -DARROW_DATASET=${ARROW_DATASET} \ -DARROW_DEPENDENCY_SOURCE="VCPKG" \ - -DARROW_FILESYSTEM=${ARROW_FILESYSTEM} \ - -DARROW_GANDIVA_JAVA=${ARROW_GANDIVA_JAVA} \ + -DARROW_DEPENDENCY_USE_SHARED=OFF \ -DARROW_GANDIVA_PC_CXX_FLAGS=${GANDIVA_CXX_FLAGS} \ -DARROW_GANDIVA=${ARROW_GANDIVA} \ - -DARROW_GRPC_USE_SHARED=OFF \ -DARROW_JEMALLOC=${ARROW_JEMALLOC} \ - -DARROW_JNI=ON \ - -DARROW_LZ4_USE_SHARED=OFF \ - -DARROW_OPENSSL_USE_SHARED=OFF \ -DARROW_ORC=${ARROW_ORC} \ -DARROW_PARQUET=${ARROW_PARQUET} \ - -DARROW_PLASMA_JAVA_CLIENT=${ARROW_PLASMA_JAVA_CLIENT} \ -DARROW_PLASMA=${ARROW_PLASMA} \ - -DARROW_PROTOBUF_USE_SHARED=OFF \ - -DARROW_PYTHON=${ARROW_PYTHON} \ -DARROW_RPATH_ORIGIN=${ARROW_RPATH_ORIGIN} \ -DARROW_S3=${ARROW_S3} \ - -DARROW_SNAPPY_USE_SHARED=OFF \ - -DARROW_THRIFT_USE_SHARED=OFF \ - -DARROW_UTF8PROC_USE_SHARED=OFF \ - -DARROW_ZSTD_USE_SHARED=OFF \ + -DARROW_USE_CCACHE=${ARROW_USE_CCACHE} \ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ -DCMAKE_INSTALL_LIBDIR=lib \ - -DCMAKE_INSTALL_PREFIX=${build_dir}/cpp \ + 
-DCMAKE_INSTALL_PREFIX=${ARROW_HOME} \ -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ + -DGTest_SOURCE=BUNDLED \ -DORC_SOURCE=BUNDLED \ -DORC_PROTOBUF_EXECUTABLE=${VCPKG_ROOT}/installed/${VCPKG_TARGET_TRIPLET}/tools/protobuf/protoc \ -DPARQUET_BUILD_EXAMPLES=OFF \ -DPARQUET_BUILD_EXECUTABLES=OFF \ -DPARQUET_REQUIRE_ENCRYPTION=OFF \ - -DPythonInterp_FIND_VERSION_MAJOR=3 \ - -DPythonInterp_FIND_VERSION=ON \ -DVCPKG_MANIFEST_MODE=OFF \ -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \ -GNinja \ ${arrow_dir}/cpp ninja install -if [ $ARROW_BUILD_TESTS = "ON" ]; then +if [ "${ARROW_BUILD_TESTS}" = "ON" ]; then + # MinIO is required + exclude_tests="arrow-s3fs-test" + # unstable + exclude_tests="${exclude_tests}|arrow-compute-hash-join-node-test" + exclude_tests="${exclude_tests}|arrow-dataset-scanner-test" + # strptime + exclude_tests="${exclude_tests}|arrow-utility-test" ctest \ + --exclude-regex "${exclude_tests}" \ --label-regex unittest \ --output-on-failure \ --parallel $(nproc) \ @@ -118,22 +117,22 @@ popd JAVA_JNI_CMAKE_ARGS="" -JAVA_JNI_CMAKE_ARGS="${JAVA_JNI_CMAKE_ARGS} -DVCPKG_MANIFEST_MODE=OFF" +JAVA_JNI_CMAKE_ARGS="${JAVA_JNI_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" JAVA_JNI_CMAKE_ARGS="${JAVA_JNI_CMAKE_ARGS} -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET}" export JAVA_JNI_CMAKE_ARGS ${arrow_dir}/ci/scripts/java_jni_build.sh \ ${arrow_dir} \ + ${ARROW_HOME} \ ${build_dir} \ ${dist_dir} +if [ "${ARROW_USE_CCACHE}" == "ON" ]; then + echo "=== ccache statistics after build ===" + ccache -s +fi -echo "=== Copying libraries to the distribution folder ===" -cp -L ${build_dir}/cpp/lib/libgandiva_jni.so ${dist_dir} -cp -L ${build_dir}/cpp/lib/libarrow_dataset_jni.so ${dist_dir} -cp -L ${build_dir}/cpp/lib/libarrow_orc_jni.so ${dist_dir} echo "=== Checking shared dependencies for libraries ===" - pushd ${dist_dir} archery linking check-dependencies \ --allow ld-linux-x86-64 \ @@ -149,5 +148,6 @@ archery linking check-dependencies \ libarrow_cdata_jni.so \ libarrow_dataset_jni.so \ libarrow_orc_jni.so \ - libgandiva_jni.so + libgandiva_jni.so \ + libplasma_java.so popd diff --git a/ci/scripts/java_jni_windows_build.sh b/ci/scripts/java_jni_windows_build.sh new file mode 100755 index 00000000000..ce445db578f --- /dev/null +++ b/ci/scripts/java_jni_windows_build.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
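One change above that is easy to miss: on manylinux the JNI CMake step now goes through the vcpkg toolchain file rather than only disabling manifest mode, so dependency resolution for the JNI glue matches the C++ build. A sketch of how those arguments land on a CMake command line, with the source directory as a placeholder and the triplet as an assumed example:

VCPKG_ROOT=/opt/vcpkg
VCPKG_TARGET_TRIPLET=x64-linux-static-release
cmake \
  -DCMAKE_TOOLCHAIN_FILE=${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake \
  -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \
  -GNinja \
  /path/to/arrow/java   # placeholder for ${arrow_dir}/java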
+ +set -ex + +arrow_dir=${1} +build_dir=${2} +# The directory where the final binaries will be stored when scripts finish +dist_dir=${3}/x86_64 + +echo "=== Clear output directories and leftovers ===" +# Clear output directories and leftovers +rm -rf ${build_dir} + +echo "=== Building Arrow C++ libraries ===" +install_dir=${build_dir}/cpp-install +: ${ARROW_BUILD_TESTS:=ON} +: ${ARROW_DATASET:=ON} +export ARROW_DATASET +: ${ARROW_ORC:=ON} +export ARROW_ORC +: ${ARROW_PARQUET:=ON} +: ${ARROW_S3:=ON} +: ${ARROW_USE_CCACHE:=OFF} +: ${CMAKE_BUILD_TYPE:=release} +: ${CMAKE_UNITY_BUILD:=ON} + +if [ "${ARROW_USE_CCACHE}" == "ON" ]; then + echo "=== ccache statistics before build ===" + ccache -s +fi + +export ARROW_TEST_DATA="${arrow_dir}/testing/data" +export PARQUET_TEST_DATA="${arrow_dir}/cpp/submodules/parquet-testing/data" +export AWS_EC2_METADATA_DISABLED=TRUE + +mkdir -p "${build_dir}/cpp" +pushd "${build_dir}/cpp" + +cmake \ + -DARROW_BUILD_SHARED=OFF \ + -DARROW_BUILD_TESTS=ON \ + -DARROW_CSV=${ARROW_DATASET} \ + -DARROW_DATASET=${ARROW_DATASET} \ + -DARROW_DEPENDENCY_USE_SHARED=OFF \ + -DARROW_ORC=${ARROW_ORC} \ + -DARROW_PARQUET=${ARROW_PARQUET} \ + -DARROW_S3=${ARROW_S3} \ + -DARROW_USE_CCACHE=${ARROW_USE_CCACHE} \ + -DARROW_WITH_BROTLI=ON \ + -DARROW_WITH_LZ4=ON \ + -DARROW_WITH_SNAPPY=ON \ + -DARROW_WITH_ZSTD=ON \ + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ + -DCMAKE_INSTALL_LIBDIR=lib \ + -DCMAKE_INSTALL_PREFIX=${install_dir} \ + -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ + -GNinja \ + ${arrow_dir}/cpp +ninja install + +if [ "${ARROW_BUILD_TESTS}" = "ON" ]; then + # MinIO is required + exclude_tests="arrow-s3fs-test" + # unstable + exclude_tests="${exclude_tests}|arrow-compute-hash-join-node-test" + exclude_tests="${exclude_tests}|arrow-dataset-scanner-test" + # strptime + exclude_tests="${exclude_tests}|arrow-utility-test" + ctest \ + --exclude-regex "${exclude_tests}" \ + --label-regex unittest \ + --output-on-failure \ + --parallel $(nproc) \ + --timeout 300 +fi + +popd + + +${arrow_dir}/ci/scripts/java_jni_build.sh \ + ${arrow_dir} \ + ${install_dir} \ + ${build_dir} \ + ${dist_dir} + +if [ "${ARROW_USE_CCACHE}" == "ON" ]; then + echo "=== ccache statistics after build ===" + ccache -s +fi + + +echo "=== Checking shared dependencies for libraries ===" +pushd ${dist_dir} +# TODO +# archery linking check-dependencies \ +# --allow libm \ +# --allow librt \ +# --allow libz \ +# libarrow_cdata_jni.dll \ +# libarrow_dataset_jni.dll \ +popd diff --git a/ci/scripts/java_test.sh b/ci/scripts/java_test.sh index 83ef26fdb1a..46577b69625 100755 --- a/ci/scripts/java_test.sh +++ b/ci/scripts/java_test.sh @@ -18,9 +18,12 @@ set -ex +if [[ "${ARROW_JAVA_TEST:-ON}" != "ON" ]]; then + exit +fi + arrow_dir=${1} source_dir=${1}/java -cpp_build_dir=${2}/cpp/${ARROW_BUILD_TYPE:-debug} java_jni_dist_dir=${3} # For JNI and Plasma tests @@ -35,20 +38,36 @@ pushd ${source_dir} ${mvn} test -if [ "${ARROW_JNI}" = "ON" ]; then - ${mvn} test -Parrow-jni -pl adapter/orc,gandiva,dataset -Darrow.cpp.build.dir=${cpp_build_dir} +projects=() +if [ "${ARROW_DATASET}" = "ON" ]; then + projects+=(gandiva) +fi +if [ "${ARROW_GANDIVA}" = "ON" ]; then + projects+=(gandiva) +fi +if [ "${ARROW_ORC}" = "ON" ]; then + projects+=(adapter/orc) +fi +if [ "${ARROW_PLASMA}" = "ON" ]; then + projects+=(plasma) fi +if [ "${#projects[@]}" -gt 0 ]; then + ${mvn} test \ + -Parrow-jni \ + -pl $(IFS=,; echo "${projects[*]}") \ + -Darrow.cpp.build.dir=${java_jni_dist_dir} -if [ "${ARROW_JAVA_CDATA}" = "ON" ]; then - ${mvn} test 
-Parrow-c-data -pl c -Darrow.c.jni.dist.dir=${java_jni_dist_dir} + if [ "${ARROW_PLASMA}" = "ON" ]; then + pushd ${source_dir}/plasma + java -cp target/test-classes:target/classes \ + -Djava.library.path=${java_jni_dist_dir}/$(arch) \ + org.apache.arrow.plasma.PlasmaClientTest + popd + fi fi -if [ "${ARROW_PLASMA}" = "ON" ]; then - pushd ${source_dir}/plasma - java -cp target/test-classes:target/classes \ - -Djava.library.path=${cpp_build_dir} \ - org.apache.arrow.plasma.PlasmaClientTest - popd +if [ "${ARROW_JAVA_CDATA}" = "ON" ]; then + ${mvn} test -Parrow-c-data -pl c -Darrow.c.jni.dist.dir=${java_jni_dist_dir} fi popd diff --git a/ci/scripts/msys2_setup.sh b/ci/scripts/msys2_setup.sh index fba0fa26045..60c77499b9a 100755 --- a/ci/scripts/msys2_setup.sh +++ b/ci/scripts/msys2_setup.sh @@ -27,29 +27,31 @@ case "${target}" in packages+=(${MINGW_PACKAGE_PREFIX}-aws-sdk-cpp) packages+=(${MINGW_PACKAGE_PREFIX}-boost) packages+=(${MINGW_PACKAGE_PREFIX}-brotli) + packages+=(${MINGW_PACKAGE_PREFIX}-bzip2) + packages+=(${MINGW_PACKAGE_PREFIX}-c-ares) + packages+=(${MINGW_PACKAGE_PREFIX}-cc) packages+=(${MINGW_PACKAGE_PREFIX}-ccache) packages+=(${MINGW_PACKAGE_PREFIX}-clang) packages+=(${MINGW_PACKAGE_PREFIX}-cmake) - packages+=(${MINGW_PACKAGE_PREFIX}-gcc) + packages+=(${MINGW_PACKAGE_PREFIX}-double-conversion) + packages+=(${MINGW_PACKAGE_PREFIX}-flatbuffers) packages+=(${MINGW_PACKAGE_PREFIX}-gflags) packages+=(${MINGW_PACKAGE_PREFIX}-grpc) packages+=(${MINGW_PACKAGE_PREFIX}-gtest) packages+=(${MINGW_PACKAGE_PREFIX}-libutf8proc) packages+=(${MINGW_PACKAGE_PREFIX}-libxml2) packages+=(${MINGW_PACKAGE_PREFIX}-lz4) - packages+=(${MINGW_PACKAGE_PREFIX}-make) packages+=(${MINGW_PACKAGE_PREFIX}-ninja) packages+=(${MINGW_PACKAGE_PREFIX}-nlohmann-json) + packages+=(${MINGW_PACKAGE_PREFIX}-openssl) packages+=(${MINGW_PACKAGE_PREFIX}-protobuf) - packages+=(${MINGW_PACKAGE_PREFIX}-python-cffi) - packages+=(${MINGW_PACKAGE_PREFIX}-python-numpy) - packages+=(${MINGW_PACKAGE_PREFIX}-python-pip) - packages+=(${MINGW_PACKAGE_PREFIX}-python-wheel) packages+=(${MINGW_PACKAGE_PREFIX}-rapidjson) packages+=(${MINGW_PACKAGE_PREFIX}-re2) packages+=(${MINGW_PACKAGE_PREFIX}-snappy) + packages+=(${MINGW_PACKAGE_PREFIX}-sqlite3) packages+=(${MINGW_PACKAGE_PREFIX}-thrift) packages+=(${MINGW_PACKAGE_PREFIX}-xsimd) + packages+=(${MINGW_PACKAGE_PREFIX}-uriparser) packages+=(${MINGW_PACKAGE_PREFIX}-zlib) packages+=(${MINGW_PACKAGE_PREFIX}-zstd) ;; diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh index 4e2990b84d6..2d5bd5dd9ff 100755 --- a/ci/scripts/python_test.sh +++ b/ci/scripts/python_test.sh @@ -20,11 +20,13 @@ set -ex arrow_dir=${1} +test_dir=${1}/python/build/dist export ARROW_SOURCE_DIR=${arrow_dir} export ARROW_TEST_DATA=${arrow_dir}/testing/data export PARQUET_TEST_DATA=${arrow_dir}/cpp/submodules/parquet-testing/data export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} +export DYLD_LIBRARY_PATH=${ARROW_HOME}/lib:${DYLD_LIBRARY_PATH:+:${DYLD_LIBRARY_PATH}} export ARROW_GDB_SCRIPT=${arrow_dir}/cpp/gdb_arrow.py # Enable some checks inside Python itself @@ -54,4 +56,5 @@ export PYARROW_TEST_ORC export PYARROW_TEST_PARQUET export PYARROW_TEST_S3 -pytest -r s -v ${PYTEST_ARGS} --pyargs pyarrow +# Testing PyArrow +pytest -r s ${PYTEST_ARGS} --pyargs pyarrow diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index a6e763b6523..7c7ef7745c0 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -34,15 +34,13 @@ rm 
-rf ${source_dir}/python/pyarrow/*.so.* echo "=== (${PYTHON_VERSION}) Set SDK, C++ and Wheel flags ===" export _PYTHON_HOST_PLATFORM="macosx-${MACOSX_DEPLOYMENT_TARGET}-${arch}" -export MACOSX_DEPLOYMENT_TARGET=${MACOSX_DEPLOYMENT_TARGET:-10.9} +export MACOSX_DEPLOYMENT_TARGET=${MACOSX_DEPLOYMENT_TARGET:-10.14} export SDKROOT=${SDKROOT:-$(xcrun --sdk macosx --show-sdk-path)} if [ $arch = "arm64" ]; then export CMAKE_OSX_ARCHITECTURES="arm64" elif [ $arch = "x86_64" ]; then export CMAKE_OSX_ARCHITECTURES="x86_64" -elif [ $arch = "universal2" ]; then - export CMAKE_OSX_ARCHITECTURES="x86_64;arm64" else echo "Unexpected architecture: $arch" exit 1 @@ -58,7 +56,7 @@ pip install \ --target $PIP_SITE_PACKAGES \ --platform $PIP_TARGET_PLATFORM \ -r ${source_dir}/python/requirements-wheel-build.txt -pip install "delocate>=0.9" +pip install "delocate>=0.10.3" echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" : ${ARROW_DATASET:=ON} @@ -96,25 +94,27 @@ cmake \ -DARROW_BUILD_SHARED=ON \ -DARROW_BUILD_STATIC=OFF \ -DARROW_BUILD_TESTS=OFF \ + -DARROW_COMPUTE=ON \ + -DARROW_CSV=ON \ -DARROW_DATASET=${ARROW_DATASET} \ -DARROW_DEPENDENCY_SOURCE="VCPKG" \ -DARROW_DEPENDENCY_USE_SHARED=OFF \ + -DARROW_FILESYSTEM=ON \ -DARROW_FLIGHT=${ARROW_FLIGHT} \ -DARROW_GANDIVA=${ARROW_GANDIVA} \ -DARROW_GCS=${ARROW_GCS} \ -DARROW_HDFS=${ARROW_HDFS} \ -DARROW_JEMALLOC=${ARROW_JEMALLOC} \ + -DARROW_JSON=ON \ -DARROW_MIMALLOC=${ARROW_MIMALLOC} \ -DARROW_ORC=${ARROW_ORC} \ -DARROW_PACKAGE_KIND="python-wheel-macos" \ -DARROW_PARQUET=${ARROW_PARQUET} \ - -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION} \ -DARROW_PLASMA=${ARROW_PLASMA} \ - -DARROW_PYTHON=ON \ -DARROW_RPATH_ORIGIN=ON \ - -DARROW_SUBSTRAIT=${ARROW_SUBSTRAIT} \ -DARROW_S3=${ARROW_S3} \ -DARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL} \ + -DARROW_SUBSTRAIT=${ARROW_SUBSTRAIT} \ -DARROW_TENSORFLOW=${ARROW_TENSORFLOW} \ -DARROW_USE_CCACHE=ON \ -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI} \ @@ -129,9 +129,9 @@ cmake \ -DCMAKE_INSTALL_PREFIX=${build_dir}/install \ -DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES} \ -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ - -DOPENSSL_USE_STATIC_LIBS=ON \ -DORC_PROTOBUF_EXECUTABLE=${VCPKG_ROOT}/installed/${VCPKG_TARGET_TRIPLET}/tools/protobuf/protoc \ -DORC_SOURCE=BUNDLED \ + -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION} \ -DVCPKG_MANIFEST_MODE=OFF \ -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \ -G ${CMAKE_GENERATOR} \ @@ -156,8 +156,9 @@ export PYARROW_WITH_PLASMA=${ARROW_PLASMA} export PYARROW_WITH_SUBSTRAIT=${ARROW_SUBSTRAIT} export PYARROW_WITH_S3=${ARROW_S3} export PYARROW_CMAKE_OPTIONS="-DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES} -DARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL}" +export ARROW_HOME=${build_dir}/install # PyArrow build configuration -export PKG_CONFIG_PATH=/usr/lib/pkgconfig:${build_dir}/install/lib/pkgconfig +export CMAKE_PREFIX_PATH=${build_dir}/install # Set PyArrow version explicitly export SETUPTOOLS_SCM_PRETEND_VERSION=${PYARROW_VERSION} diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index af17606199e..2aea55ed70f 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -85,45 +85,49 @@ fi mkdir /tmp/arrow-build pushd /tmp/arrow-build +# ARROW-17501: We can remove -DAWSSDK_SOURCE=BUNDLED once +# https://github.com/aws/aws-sdk-cpp/issues/1809 is fixed and vcpkg +# ships the fix. 
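The wheel scripts above and below share the same relocation of the PyArrow build configuration: instead of pointing pkg-config at the Arrow C++ install, they export ARROW_HOME and CMAKE_PREFIX_PATH so setup.py's CMake-based discovery finds the libraries. A minimal sketch of that environment, assuming an Arrow C++ tree already installed under /tmp/arrow-dist and an assumed example version pin:

export ARROW_HOME=/tmp/arrow-dist             # prefix where Arrow C++ was installed
export CMAKE_PREFIX_PATH=${ARROW_HOME}        # lets the CMake package discovery succeed
export SETUPTOOLS_SCM_PRETEND_VERSION=10.0.0  # assumed example; CI passes ${PYARROW_VERSION}
pushd /arrow/python
python setup.py bdist_wheel
popd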
cmake \ - -DARROW_BROTLI_USE_SHARED=OFF \ -DARROW_BUILD_SHARED=ON \ -DARROW_BUILD_STATIC=OFF \ -DARROW_BUILD_TESTS=OFF \ + -DARROW_COMPUTE=ON \ + -DARROW_CSV=ON \ -DARROW_DATASET=${ARROW_DATASET} \ -DARROW_DEPENDENCY_SOURCE="VCPKG" \ -DARROW_DEPENDENCY_USE_SHARED=OFF \ + -DARROW_FILESYSTEM=ON \ -DARROW_FLIGHT=${ARROW_FLIGHT} \ -DARROW_GANDIVA=${ARROW_GANDIVA} \ -DARROW_GCS=${ARROW_GCS} \ -DARROW_HDFS=${ARROW_HDFS} \ -DARROW_JEMALLOC=${ARROW_JEMALLOC} \ + -DARROW_JSON=ON \ -DARROW_MIMALLOC=${ARROW_MIMALLOC} \ -DARROW_ORC=${ARROW_ORC} \ -DARROW_PACKAGE_KIND="python-wheel-manylinux${MANYLINUX_VERSION}" \ -DARROW_PARQUET=${ARROW_PARQUET} \ - -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION} \ -DARROW_PLASMA=${ARROW_PLASMA} \ - -DARROW_PYTHON=ON \ -DARROW_RPATH_ORIGIN=ON \ - -DARROW_SUBSTRAIT=${ARROW_SUBSTRAIT} \ -DARROW_S3=${ARROW_S3} \ + -DARROW_SUBSTRAIT=${ARROW_SUBSTRAIT} \ -DARROW_TENSORFLOW=${ARROW_TENSORFLOW} \ -DARROW_USE_CCACHE=ON \ - -DARROW_UTF8PROC_USE_SHARED=OFF \ -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI} \ -DARROW_WITH_BZ2=${ARROW_WITH_BZ2} \ -DARROW_WITH_LZ4=${ARROW_WITH_LZ4} \ -DARROW_WITH_SNAPPY=${ARROW_WITH_SNAPPY} \ -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB} \ -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD} \ + -DAWSSDK_SOURCE=BUNDLED \ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_INSTALL_PREFIX=/tmp/arrow-dist \ -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ - -DOPENSSL_USE_STATIC_LIBS=ON \ -DORC_PROTOBUF_EXECUTABLE=${VCPKG_ROOT}/installed/${VCPKG_TARGET_TRIPLET}/tools/protobuf/protoc \ -DORC_SOURCE=BUNDLED \ + -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION} \ -DVCPKG_MANIFEST_MODE=OFF \ -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \ ${ARROW_EXTRA_CMAKE_FLAGS} \ @@ -151,8 +155,9 @@ export PYARROW_WITH_PARQUET_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION} export PYARROW_WITH_PLASMA=${ARROW_PLASMA} export PYARROW_WITH_SUBSTRAIT=${ARROW_SUBSTRAIT} export PYARROW_WITH_S3=${ARROW_S3} +export ARROW_HOME=/tmp/arrow-dist # PyArrow build configuration -export PKG_CONFIG_PATH=/usr/lib/pkgconfig:/tmp/arrow-dist/lib/pkgconfig +export CMAKE_PREFIX_PATH=/tmp/arrow-dist pushd /arrow/python python setup.py bdist_wheel diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index fb776185a5f..d137cd8a985 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -62,21 +62,23 @@ cmake ^ -DARROW_BUILD_SHARED=ON ^ -DARROW_BUILD_STATIC=OFF ^ -DARROW_BUILD_TESTS=OFF ^ + -DARROW_COMPUTE=ON ^ + -DARROW_CSV=ON ^ -DARROW_CXXFLAGS="/MP" ^ -DARROW_DATASET=%ARROW_DATASET% ^ -DARROW_DEPENDENCY_SOURCE=VCPKG ^ -DARROW_DEPENDENCY_USE_SHARED=OFF ^ + -DARROW_FILESYSTEM=ON ^ -DARROW_FLIGHT=%ARROW_FLIGHT% ^ -DARROW_GANDIVA=%ARROW_GANDIVA% ^ -DARROW_HDFS=%ARROW_HDFS% ^ + -DARROW_JSON=ON ^ -DARROW_MIMALLOC=%ARROW_MIMALLOC% ^ -DARROW_ORC=%ARROW_ORC% ^ -DARROW_PACKAGE_KIND="python-wheel-windows" ^ -DARROW_PARQUET=%ARROW_PARQUET% ^ - -DPARQUET_REQUIRE_ENCRYPTION=%PARQUET_REQUIRE_ENCRYPTION% ^ - -DARROW_PYTHON=ON ^ - -DARROW_SUBSTRAIT=%ARROW_SUBSTRAIT% ^ -DARROW_S3=%ARROW_S3% ^ + -DARROW_SUBSTRAIT=%ARROW_SUBSTRAIT% ^ -DARROW_TENSORFLOW=%ARROW_TENSORFLOW% ^ -DARROW_WITH_BROTLI=%ARROW_WITH_BROTLI% ^ -DARROW_WITH_BZ2=%ARROW_WITH_BZ2% ^ @@ -90,6 +92,7 @@ cmake ^ -DCMAKE_INSTALL_PREFIX=C:\arrow-dist ^ -DCMAKE_UNITY_BUILD=%CMAKE_UNITY_BUILD% ^ -DMSVC_LINK_VERBOSE=ON ^ + -DPARQUET_REQUIRE_ENCRYPTION=%PARQUET_REQUIRE_ENCRYPTION% ^ -DVCPKG_MANIFEST_MODE=OFF ^ -DVCPKG_TARGET_TRIPLET=%VCGPK_TARGET_TRIPLET% ^ -G 
"%CMAKE_GENERATOR%" ^ @@ -113,6 +116,7 @@ set PYARROW_WITH_PARQUET_ENCRYPTION=%PARQUET_REQUIRE_ENCRYPTION% set PYARROW_WITH_SUBSTRAIT=%ARROW_SUBSTRAIT% set PYARROW_WITH_S3=%ARROW_S3% set ARROW_HOME=C:\arrow-dist +set CMAKE_PREFIX_PATH=C:\arrow-dist pushd C:\arrow\python @REM bundle the msvc runtime diff --git a/ci/scripts/python_wheel_windows_test.bat b/ci/scripts/python_wheel_windows_test.bat index 2b7aad3abe9..2abf8ca50fe 100755 --- a/ci/scripts/python_wheel_windows_test.bat +++ b/ci/scripts/python_wheel_windows_test.bat @@ -17,7 +17,7 @@ @echo on -set PYARROW_TEST_CYTHON=OFF +set PYARROW_TEST_CYTHON=ON set PYARROW_TEST_DATASET=ON set PYARROW_TEST_FLIGHT=ON set PYARROW_TEST_GANDIVA=OFF diff --git a/ci/scripts/r_docker_configure.sh b/ci/scripts/r_docker_configure.sh index 2bc5a4806f5..1cbd5f0b5ea 100755 --- a/ci/scripts/r_docker_configure.sh +++ b/ci/scripts/r_docker_configure.sh @@ -57,7 +57,7 @@ if [ ${R_CUSTOM_CCACHE} = "true" ]; then CCACHE=ccache CC=\$(CCACHE) gcc\$(VER) CXX=\$(CCACHE) g++\$(VER) -CXX11=\$(CCACHE) g++\$(VER)" >> ~/.R/Makevars +CXX17=\$(CCACHE) g++\$(VER)" >> ~/.R/Makevars mkdir -p ~/.ccache/ echo "max_size = 5.0G @@ -67,52 +67,32 @@ sloppiness = include_file_ctime hash_dir = false" >> ~/.ccache/ccache.conf fi - -# Special hacking to try to reproduce quirks on fedora-clang-devel on CRAN -# which uses a bespoke clang compiled to use libc++ -# https://www.stats.ox.ac.uk/pub/bdr/Rconfig/r-devel-linux-x86_64-fedora-clang -if [ "$RHUB_PLATFORM" = "linux-x86_64-fedora-clang" ]; then - dnf install -y libcxx-devel - sed -i.bak -E -e 's/(CXX1?1? =.*)/\1 -stdlib=libc++/g' $(${R_BIN} RHOME)/etc/Makeconf - rm -rf $(${R_BIN} RHOME)/etc/Makeconf.bak - - sed -i.bak -E -e 's/(\-std=gnu\+\+)/-std=c++/g' $(${R_BIN} RHOME)/etc/Makeconf - rm -rf $(${R_BIN} RHOME)/etc/Makeconf.bak - - sed -i.bak -E -e 's/(CXXFLAGS = )(.*)/\1 -g -O3 -Wall -pedantic -frtti -fPIC/' $(${R_BIN} RHOME)/etc/Makeconf - rm -rf $(${R_BIN} RHOME)/etc/Makeconf.bak - - sed -i.bak -E -e 's/(LDFLAGS =.*)/\1 -stdlib=libc++/g' $(${R_BIN} RHOME)/etc/Makeconf - rm -rf $(${R_BIN} RHOME)/etc/Makeconf.bak -fi - # Special hacking to try to reproduce quirks on centos using non-default build # tooling. -if [[ "$DEVTOOLSET_VERSION" -gt 0 ]]; then +if [[ -n "$DEVTOOLSET_VERSION" ]]; then $PACKAGE_MANAGER install -y centos-release-scl $PACKAGE_MANAGER install -y "devtoolset-$DEVTOOLSET_VERSION" -fi - -if [ "$ARROW_S3" == "ON" ] || [ "$ARROW_GCS" == "ON" ] || [ "$ARROW_R_DEV" == "TRUE" ]; then - # Install curl and openssl for S3/GCS support - if [ "$PACKAGE_MANAGER" = "apt-get" ]; then - apt-get install -y libcurl4-openssl-dev libssl-dev - else - $PACKAGE_MANAGER install -y libcurl-devel openssl-devel - fi - # The Dockerfile should have put this file here - if [ -f "${ARROW_SOURCE_HOME}/ci/scripts/install_minio.sh" ] && [ "`which wget`" ]; then - ${ARROW_SOURCE_HOME}/ci/scripts/install_minio.sh latest /usr/local + # Enable devtoolset here so that `which gcc` finds the right compiler below + source /opt/rh/devtoolset-${DEVTOOLSET_VERSION}/enable + + # Build images which require the devtoolset don't have CXX17 variables + # set as the system compiler doesn't support C++17 + if [ ! 
"`{R_BIN} CMD config CXX17`" ]; then + mkdir -p ~/.R + echo "CC = $(which gcc) -fPIC" >> ~/.R/Makevars + echo "CXX17 = $(which g++) -fPIC" >> ~/.R/Makevars + echo "CXX17STD = -std=c++17" >> ~/.R/Makevars + echo "CXX17FLAGS = ${CXX11FLAGS}" >> ~/.R/Makevars fi +fi - if [ -f "${ARROW_SOURCE_HOME}/ci/scripts/install_gcs_testbench.sh" ] && [ "`which pip`" ]; then - ${ARROW_SOURCE_HOME}/ci/scripts/install_gcs_testbench.sh default - fi +if [ -f "${ARROW_SOURCE_HOME}/ci/scripts/r_install_system_dependencies.sh" ]; then + "${ARROW_SOURCE_HOME}/ci/scripts/r_install_system_dependencies.sh" fi -# Install rsync for bundling cpp source -$PACKAGE_MANAGER install -y rsync +# Install rsync for bundling cpp source and curl to make sure it is installed on all images +$PACKAGE_MANAGER install -y rsync curl # Workaround for html help install failure; see https://github.com/r-lib/devtools/issues/2084#issuecomment-530912786 Rscript -e 'x <- file.path(R.home("doc"), "html"); if (!file.exists(x)) {dir.create(x, recursive=TRUE); file.copy(system.file("html/R.css", package="stats"), x)}' diff --git a/ci/scripts/r_install_system_dependencies.sh b/ci/scripts/r_install_system_dependencies.sh new file mode 100755 index 00000000000..d824c3e81ed --- /dev/null +++ b/ci/scripts/r_install_system_dependencies.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -ex + +: ${ARROW_SOURCE_HOME:=/arrow} + +if [ "$ARROW_S3" == "ON" ] || [ "$ARROW_GCS" == "ON" ] || [ "$ARROW_R_DEV" == "TRUE" ]; then + # Figure out what package manager we have + if [ "`which dnf`" ]; then + PACKAGE_MANAGER=dnf + elif [ "`which yum`" ]; then + PACKAGE_MANAGER=yum + elif [ "`which zypper`" ]; then + PACKAGE_MANAGER=zypper + else + PACKAGE_MANAGER=apt-get + apt-get update + fi + + # Install curl and OpenSSL for S3/GCS support + case "$PACKAGE_MANAGER" in + apt-get) + apt-get install -y libcurl4-openssl-dev libssl-dev + ;; + *) + $PACKAGE_MANAGER install -y libcurl-devel openssl-devel + ;; + esac + + # The Dockerfile should have put this file here + if [ "$ARROW_S3" == "ON" ] && [ -f "${ARROW_SOURCE_HOME}/ci/scripts/install_minio.sh" ] && [ "`which wget`" ]; then + "${ARROW_SOURCE_HOME}/ci/scripts/install_minio.sh" latest /usr/local + fi + + if [ "$ARROW_GCS" == "ON" ] && [ -f "${ARROW_SOURCE_HOME}/ci/scripts/install_gcs_testbench.sh" ]; then + case "$PACKAGE_MANAGER" in + zypper) + # python3 is Python 3.6 on OpenSUSE 15.3. + # PyArrow supports Python 3.7 or later. 
+ $PACKAGE_MANAGER install -y python39-pip + ln -s /usr/bin/python3.9 /usr/local/bin/python + ln -s /usr/bin/pip3.9 /usr/local/bin/pip + ;; + *) + $PACKAGE_MANAGER install -y python3-pip + ln -s /usr/bin/python3 /usr/local/bin/python + ln -s /usr/bin/pip3 /usr/local/bin/pip + ;; + esac + "${ARROW_SOURCE_HOME}/ci/scripts/install_gcs_testbench.sh" default + fi +fi diff --git a/ci/scripts/r_revdepcheck.sh b/ci/scripts/r_revdepcheck.sh index b0a2bab64e3..f7527aed89c 100755 --- a/ci/scripts/r_revdepcheck.sh +++ b/ci/scripts/r_revdepcheck.sh @@ -19,11 +19,45 @@ set -ex : ${R_BIN:=R} - +# When revdep runs with > 1 worker the checks for {targets} time out for +# some reason. +: ${ARROW_REVDEP_WORKERS:=1} +# But we do want to use all cores while building arrow to speed up the +# installation so this is used to set MAKEFLAGS +: ${N_JOBS:=$(nproc)} source_dir=${1}/r # cpp building dependencies -apt install -y cmake +# TODO(assignUser) consolidate cpp system reqs across docker files +apt update -y -q && \ +apt install -y \ + cmake \ + libbrotli-dev \ + libbz2-dev \ + libc-ares-dev \ + libcurl4-openssl-dev \ + libgflags-dev \ + libgoogle-glog-dev \ + liblz4-dev \ + libprotobuf-dev \ + libprotoc-dev \ + libradospp-dev \ + libre2-dev \ + libsnappy-dev \ + libssl-dev \ + libthrift-dev \ + libutf8proc-dev \ + libzstd-dev \ + nlohmann-json3-dev \ + pkg-config \ + protobuf-compiler \ + python3-dev \ + python3-pip \ + python3-rados \ + rados-objclass-dev \ + rapidjson-dev \ + tzdata \ + wget # system dependencies needed for arrow's reverse dependencies apt install -y libxml2-dev \ @@ -42,16 +76,27 @@ apt install -y libxml2-dev \ libgeos-dev \ libproj-dev -pushd ${source_dir} + +# We have to be in source_dir so that cpp source detection works +pushd $source_dir printenv +# copy over cpp source +make sync-cpp + # By default, aws-sdk tries to contact a non-existing local ip host # to retrieve metadata. Disable this so that S3FileSystem tests run faster. export AWS_EC2_METADATA_DISABLED=TRUE # Set crancache dir so we can cache it -export CRANCACHE_DIR="/arrow/.crancache" +export CRANCACHE_DIR="${1}/.crancache" + +# One of the revdeps/system reqs creates an incomplete boost install +# in the cmake search path which breaks our build, so we don't use system boost +export EXTRA_CMAKE_FLAGS='-DBoost_SOURCE=BUNDLED' + +export MAKEFLAGS=-j$N_JOBS SCRIPT=" # We can't use RSPM binaries because we need source packages @@ -65,13 +110,12 @@ SCRIPT=" revdepcheck::revdep_check( quiet = FALSE, timeout = as.difftime(120, units = 'mins'), - num_workers = 1, + num_workers = $ARROW_REVDEP_WORKERS, env = c( ARROW_R_DEV = '$ARROW_R_DEV', - LIBARROW_DOWNLOAD = TRUE, - LIBARROW_MINIMAL = FALSE, revdepcheck::revdep_env_vars() )) + revdepcheck::revdep_report(all = TRUE) # Go through the summary and fail if any of the statuses include - diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh index 0328df2384b..d7df44e2e43 100755 --- a/ci/scripts/r_test.sh +++ b/ci/scripts/r_test.sh @@ -27,7 +27,7 @@ pushd ${source_dir} printenv # Run the nixlibs.R test suite, which is not included in the installed package - ${R_BIN} -e 'setwd("tools"); testthat::test_dir(".")' +${R_BIN} -e 'setwd("tools"); testthat::test_dir(".")' # Before release, we always copy the relevant parts of the cpp source into the # package. 
In some CI checks, we will use this version of the source: @@ -77,11 +77,6 @@ export ARROW_DEBUG_MEMORY_POOL=trap export TEXMFCONFIG=/tmp/texmf-config export TEXMFVAR=/tmp/texmf-var -if [[ "$DEVTOOLSET_VERSION" -gt 0 ]]; then - # enable the devtoolset version to use it - source /opt/rh/devtoolset-$DEVTOOLSET_VERSION/enable -fi - # Make sure we aren't writing to the home dir (CRAN _hates_ this but there is no official check) BEFORE=$(ls -alh ~/) @@ -92,14 +87,6 @@ SCRIPT="as_cran <- !identical(tolower(Sys.getenv('NOT_CRAN')), 'true') } else { args <- c('--no-manual', '--ignore-vignettes') build_args <- '--no-build-vignettes' - - if (nzchar(Sys.which('minio'))) { - message('Running minio for S3 tests (if build supports them)') - minio_dir <- tempfile() - dir.create(minio_dir) - pid_minio <- sys::exec_background('minio', c('server', minio_dir)) - on.exit(tools::pskill(pid_minio), add = TRUE) - } } if (requireNamespace('reticulate', quietly = TRUE) && reticulate::py_module_available('pyarrow')) { diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh index c361af1d267..c9395eb243f 100755 --- a/ci/scripts/r_windows_build.sh +++ b/ci/scripts/r_windows_build.sh @@ -23,26 +23,15 @@ set -ex # Make sure it is absolute and exported export ARROW_HOME="$(cd "${ARROW_HOME}" && pwd)" -if [ "$RTOOLS_VERSION" = "35" ]; then - # Use rtools-backports if building with rtools35 - curl https://raw.githubusercontent.com/r-windows/rtools-backports/master/pacman.conf > /etc/pacman.conf - pacman --noconfirm -Syy - # lib-4.9.3 is for libraries compiled with gcc 4.9 (Rtools 3.5) - RWINLIB_LIB_DIR="lib-4.9.3" - # This is the default (will build for each arch) but we can set up CI to - # do these in parallel - : ${MINGW_ARCH:="mingw32 mingw64"} -else - # Uncomment L38-41 if you're testing a new rtools dependency that hasn't yet sync'd to CRAN - # curl https://raw.githubusercontent.com/r-windows/rtools-packages/master/pacman.conf > /etc/pacman.conf - # curl -OSsl "http://repo.msys2.org/msys/x86_64/msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz" - # pacman -U --noconfirm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz && rm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz - # pacman --noconfirm -Scc - - pacman --noconfirm -Syy - RWINLIB_LIB_DIR="lib" - : ${MINGW_ARCH:="mingw32 mingw64 ucrt64"} -fi +# Uncomment L38-41 if you're testing a new rtools dependency that hasn't yet sync'd to CRAN +# curl https://raw.githubusercontent.com/r-windows/rtools-packages/master/pacman.conf > /etc/pacman.conf +# curl -OSsl "http://repo.msys2.org/msys/x86_64/msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz" +# pacman -U --noconfirm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz && rm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz +# pacman --noconfirm -Scc + +pacman --noconfirm -Syy +RWINLIB_LIB_DIR="lib" +: ${MINGW_ARCH:="mingw32 mingw64 ucrt64"} export MINGW_ARCH @@ -78,26 +67,19 @@ fi if [ -d mingw64/lib/ ]; then ls $MSYS_LIB_DIR/mingw64/lib/ # Make the rest of the directory structure - # lib-4.9.3 is for libraries compiled with gcc 4.9 (Rtools 3.5) - mkdir -p $DST_DIR/${RWINLIB_LIB_DIR}/x64 - # lib is for the new gcc 8 toolchain (Rtools 4.0) mkdir -p $DST_DIR/lib/x64 # Move the 64-bit versions of libarrow into the expected location - mv mingw64/lib/*.a $DST_DIR/${RWINLIB_LIB_DIR}/x64 - # These may be from https://dl.bintray.com/rtools/backports/ - cp $MSYS_LIB_DIR/mingw64/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/x64 + mv mingw64/lib/*.a $DST_DIR/lib/x64 # These are from https://dl.bintray.com/rtools/mingw{32,64}/ - cp 
$MSYS_LIB_DIR/mingw64/lib/lib{zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,aws*}.a $DST_DIR/lib/x64 + cp $MSYS_LIB_DIR/mingw64/lib/lib{thrift,snappy,zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,aws*,nghttp2}.a $DST_DIR/lib/x64 fi # Same for the 32-bit versions if [ -d mingw32/lib/ ]; then ls $MSYS_LIB_DIR/mingw32/lib/ - mkdir -p $DST_DIR/${RWINLIB_LIB_DIR}/i386 mkdir -p $DST_DIR/lib/i386 - mv mingw32/lib/*.a $DST_DIR/${RWINLIB_LIB_DIR}/i386 - cp $MSYS_LIB_DIR/mingw32/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/i386 - cp $MSYS_LIB_DIR/mingw32/lib/lib{zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,aws*}.a $DST_DIR/lib/i386 + mv mingw32/lib/*.a $DST_DIR/lib/i386 + cp $MSYS_LIB_DIR/mingw32/lib/lib{thrift,snappy,zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,aws*,nghttp2}.a $DST_DIR/lib/i386 fi # Do the same also for ucrt64 @@ -105,7 +87,7 @@ if [ -d ucrt64/lib/ ]; then ls $MSYS_LIB_DIR/ucrt64/lib/ mkdir -p $DST_DIR/lib/x64-ucrt mv ucrt64/lib/*.a $DST_DIR/lib/x64-ucrt - cp $MSYS_LIB_DIR/ucrt64/lib/lib{thrift,snappy,zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,aws*}.a $DST_DIR/lib/x64-ucrt + cp $MSYS_LIB_DIR/ucrt64/lib/lib{thrift,snappy,zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,aws*,nghttp2}.a $DST_DIR/lib/x64-ucrt fi # Create build artifact diff --git a/ci/vcpkg/ports.patch b/ci/vcpkg/ports.patch index b2eed47466d..c873bfbb06a 100644 --- a/ci/vcpkg/ports.patch +++ b/ci/vcpkg/ports.patch @@ -1,100 +1,28 @@ -diff --git a/ports/abseil/fix-universal2.patch b/ports/abseil/fix-universal2.patch -new file mode 100644 -index 0000000000..c729e7ae48 ---- /dev/null -+++ b/ports/abseil/fix-universal2.patch -@@ -0,0 +1,55 @@ -+diff --git a/absl/copts/AbseilConfigureCopts.cmake b/absl/copts/AbseilConfigureCopts.cmake -+index 942ce90a4..15d6c895f 100644 -+--- a/absl/copts/AbseilConfigureCopts.cmake -++++ b/absl/copts/AbseilConfigureCopts.cmake -+@@ -12,7 +12,49 @@ else() -+ set(ABSL_BUILD_DLL FALSE) -+ endif() -+ -+-if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|AMD64") -++if(APPLE AND CMAKE_CXX_COMPILER_ID MATCHES [[Clang]]) -++ # Some CMake targets (not known at the moment of processing) could be set to -++ # compile for multiple architectures as specified by the OSX_ARCHITECTURES -++ # property, which is target-specific. We should neither inspect nor rely on -++ # any CMake property or variable to detect an architecture, in particular: -++ # -++ # - CMAKE_OSX_ARCHITECTURES -++ # is just an initial value for OSX_ARCHITECTURES; set too early. -++ # -++ # - OSX_ARCHITECTURES -++ # is a per-target property; targets could be defined later, and their -++ # properties could be modified any time later. -++ # -++ # - CMAKE_SYSTEM_PROCESSOR -++ # does not reflect multiple architectures at all. -++ # -++ # When compiling for multiple architectures, a build system can invoke a -++ # compiler either -++ # -++ # - once: a single command line for multiple architectures (Ninja build) -++ # - twice: two command lines per each architecture (Xcode build system) -++ # -++ # If case of Xcode, it would be possible to set an Xcode-specific attributes -++ # like XCODE_ATTRIBUTE_OTHER_CPLUSPLUSFLAGS[arch=arm64] or similar. -++ # -++ # In both cases, the viable strategy is to pass all arguments at once, allowing -++ # the compiler to dispatch arch-specific arguments to a designated backend. 
-++ set(ABSL_RANDOM_RANDEN_COPTS "") -++ foreach(_arch IN ITEMS "x86_64" "arm64") -++ string(TOUPPER "${_arch}" _arch_uppercase) -++ string(REPLACE "X86_64" "X64" _arch_uppercase ${_arch_uppercase}) -++ foreach(_flag IN LISTS ABSL_RANDOM_HWAES_${_arch_uppercase}_FLAGS) -++ list(APPEND ABSL_RANDOM_RANDEN_COPTS "-Xarch_${_arch}" "${_flag}") -++ endforeach() -++ endforeach() -++ # If a compiler happens to deal with an argument for a currently unused -++ # architecture, it will warn about an unused command line argument. -++ option(ABSL_RANDOM_RANDEN_COPTS_WARNING OFF -++ "Warn if one of ABSL_RANDOM_RANDEN_COPTS is unused") -++ if(ABSL_RANDOM_RANDEN_COPTS AND NOT ABSL_RANDOM_RANDEN_COPTS_WARNING) -++ list(APPEND ABSL_RANDOM_RANDEN_COPTS "-Wno-unused-command-line-argument") -++ endif() -++elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|AMD64") -+ if (MSVC) -+ set(ABSL_RANDOM_RANDEN_COPTS "${ABSL_RANDOM_HWAES_MSVC_X64_FLAGS}") -+ else() -diff --git a/ports/abseil/portfile.cmake b/ports/abseil/portfile.cmake -index 1289eed36a..b010a69f13 100644 ---- a/ports/abseil/portfile.cmake -+++ b/ports/abseil/portfile.cmake -@@ -15,6 +15,7 @@ vcpkg_from_github( - # detection can cause ABI issues depending on which compiler options - # are enabled for consuming user code - fix-cxx-standard.patch -+ fix-universal2.patch - ) - - vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS diff --git a/ports/curl/portfile.cmake b/ports/curl/portfile.cmake -index f81d0c491d..e5ea9cef57 100644 +index 5a14562..924b1b7 100644 --- a/ports/curl/portfile.cmake +++ b/ports/curl/portfile.cmake -@@ -88,6 +88,10 @@ vcpkg_cmake_configure( - -DCMAKE_DISABLE_FIND_PACKAGE_Perl=ON - -DENABLE_DEBUG=ON +@@ -87,8 +87,11 @@ vcpkg_cmake_configure( + -DENABLE_MANUAL=OFF -DCURL_CA_FALLBACK=ON + -DCURL_USE_LIBPSL=OFF + -DCURL_CA_PATH=none + -DCURL_CA_BUNDLE=none -+ OPTIONS_DEBUG -+ ${EXTRA_ARGS_DEBUG} - OPTIONS_RELEASE - ${OPTIONS_RELEASE} OPTIONS_DEBUG + -DENABLE_DEBUG=ON ++ ${EXTRA_ARGS_DEBUG} + ) + vcpkg_cmake_install() + vcpkg_copy_pdbs() diff --git a/ports/snappy/portfile.cmake b/ports/snappy/portfile.cmake -index 45b8c706db..b409d8a7be 100644 +index df95a08..d740ce7 100644 --- a/ports/snappy/portfile.cmake +++ b/ports/snappy/portfile.cmake -@@ -4,6 +4,7 @@ vcpkg_from_github( - REF 1.1.9 - SHA512 f1f8a90f5f7f23310423574b1d8c9acb84c66ea620f3999d1060395205e5760883476837aba02f0aa913af60819e34c625d8308c18a5d7a9c4e190f35968b024 +@@ -9,6 +9,7 @@ vcpkg_from_github( HEAD_REF master -+ PATCHES "snappy-disable-bmi.patch" + PATCHES + fix_clang-cl_build.patch ++ "snappy-disable-bmi.patch" ) vcpkg_cmake_configure( @@ -123,3 +51,16 @@ index 0000000000..a57ce0c22f + } + + static inline bool LeftShiftOverflows(uint8_t value, uint32_t shift) { +diff --git a/scripts/cmake/vcpkg_find_acquire_program.cmake b/scripts/cmake/vcpkg_find_acquire_program.cmake +index 4611af6..d11936f 100644 +--- a/scripts/cmake/vcpkg_find_acquire_program.cmake ++++ b/scripts/cmake/vcpkg_find_acquire_program.cmake +@@ -239,7 +239,7 @@ function(vcpkg_find_acquire_program program) + set(paths_to_search "${DOWNLOADS}/tools/python/${tool_subdirectory}") + vcpkg_list(SET post_install_command "${CMAKE_COMMAND}" -E rm python310._pth) + else() +- set(program_name python3) ++ set(program_name python) + set(brew_package_name "python") + set(apt_package_name "python3") + endif() diff --git a/ci/vcpkg/universal2-osx-static-debug.cmake b/ci/vcpkg/universal2-osx-static-debug.cmake index 29e4b0e63c5..580b4604d52 100644 --- a/ci/vcpkg/universal2-osx-static-debug.cmake +++ 
b/ci/vcpkg/universal2-osx-static-debug.cmake @@ -21,6 +21,6 @@ set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64") -set(VCPKG_OSX_DEPLOYMENT_TARGET "10.13") +set(VCPKG_OSX_DEPLOYMENT_TARGET "10.14") set(VCPKG_BUILD_TYPE debug) diff --git a/ci/vcpkg/universal2-osx-static-release.cmake b/ci/vcpkg/universal2-osx-static-release.cmake index 8111169fab2..7247d0af351 100644 --- a/ci/vcpkg/universal2-osx-static-release.cmake +++ b/ci/vcpkg/universal2-osx-static-release.cmake @@ -21,6 +21,6 @@ set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64") -set(VCPKG_OSX_DEPLOYMENT_TARGET "10.13") +set(VCPKG_OSX_DEPLOYMENT_TARGET "10.14") set(VCPKG_BUILD_TYPE release) diff --git a/ci/vcpkg/vcpkg.json b/ci/vcpkg/vcpkg.json index d9d074e99b0..71c23165e61 100644 --- a/ci/vcpkg/vcpkg.json +++ b/ci/vcpkg/vcpkg.json @@ -43,6 +43,7 @@ "description": "Development dependencies", "dependencies": [ "benchmark", + "boost-process", "gtest" ] }, diff --git a/cpp/Brewfile b/cpp/Brewfile index 01149cf85fa..66f1bd332bb 100644 --- a/cpp/Brewfile +++ b/cpp/Brewfile @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. -brew "automake" brew "aws-sdk-cpp" brew "bash" brew "boost" @@ -26,12 +25,11 @@ brew "cmake" brew "flatbuffers" brew "git" brew "glog" +brew "googletest" brew "grpc" -brew "llvm" brew "llvm@14" brew "lz4" brew "ninja" -brew "numpy" brew "openssl@1.1" brew "protobuf" brew "python" @@ -39,4 +37,5 @@ brew "rapidjson" brew "snappy" brew "thrift" brew "wget" +brew "xsimd" brew "zstd" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9cc51737373..15bb7dcf84c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -47,7 +47,7 @@ if(POLICY CMP0074) cmake_policy(SET CMP0074 NEW) endif() -set(ARROW_VERSION "10.0.0-SNAPSHOT") +set(ARROW_VERSION "11.0.0") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") @@ -122,12 +122,13 @@ endif() set(ARROW_GDB_DIR "${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}/gdb") set(ARROW_FULL_GDB_DIR "${CMAKE_INSTALL_FULL_DATADIR}/${PROJECT_NAME}/gdb") set(ARROW_GDB_AUTO_LOAD_DIR "${CMAKE_INSTALL_DATADIR}/gdb/auto-load") -set(ARROW_CMAKE_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}") +set(ARROW_CMAKE_DIR "${CMAKE_INSTALL_LIBDIR}/cmake") set(ARROW_DOC_DIR "share/doc/${PROJECT_NAME}") set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support") set(ARROW_LLVM_VERSIONS + "15.0" "14.0" "13.0" "12.0" @@ -137,9 +138,6 @@ set(ARROW_LLVM_VERSIONS "9" "8" "7") -list(GET ARROW_LLVM_VERSIONS 0 ARROW_LLVM_VERSION_PRIMARY) -string(REGEX REPLACE "^([0-9]+)(\\..+)?" "\\1" ARROW_LLVM_VERSION_PRIMARY_MAJOR - "${ARROW_LLVM_VERSION_PRIMARY}") file(READ ${CMAKE_CURRENT_SOURCE_DIR}/../.env ARROW_ENV) string(REGEX MATCH "CLANG_TOOLS=[^\n]+" ARROW_ENV_CLANG_TOOLS_VERSION "${ARROW_ENV}") @@ -148,30 +146,6 @@ string(REGEX REPLACE "^CLANG_TOOLS=" "" ARROW_CLANG_TOOLS_VERSION string(REGEX REPLACE "^([0-9]+)(\\..+)?" 
"\\1" ARROW_CLANG_TOOLS_VERSION_MAJOR "${ARROW_CLANG_TOOLS_VERSION}") -if(APPLE) - find_program(BREW_BIN brew) - if(BREW_BIN) - execute_process(COMMAND ${BREW_BIN} --prefix - "llvm@${ARROW_LLVM_VERSION_PRIMARY_MAJOR}" - OUTPUT_VARIABLE LLVM_BREW_PREFIX - OUTPUT_STRIP_TRAILING_WHITESPACE) - if(NOT LLVM_BREW_PREFIX) - execute_process(COMMAND ${BREW_BIN} --prefix llvm - OUTPUT_VARIABLE LLVM_BREW_PREFIX - OUTPUT_STRIP_TRAILING_WHITESPACE) - endif() - - execute_process(COMMAND ${BREW_BIN} --prefix "llvm@${ARROW_CLANG_TOOLS_VERSION_MAJOR}" - OUTPUT_VARIABLE CLANG_TOOLS_BREW_PREFIX - OUTPUT_STRIP_TRAILING_WHITESPACE) - if(NOT CLANG_TOOLS_BREW_PREFIX) - execute_process(COMMAND ${BREW_BIN} --prefix llvm - OUTPUT_VARIABLE CLANG_TOOLS_BREW_PREFIX - OUTPUT_STRIP_TRAILING_WHITESPACE) - endif() - endif() -endif() - if(WIN32 AND NOT MINGW) # This is used to handle builds using e.g. clang in an MSVC setting. set(MSVC_TOOLCHAIN TRUE) @@ -190,16 +164,6 @@ if("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" set(CMAKE_EXPORT_COMPILE_COMMANDS 1) endif() -# ---------------------------------------------------------------------- -# cmake options -include(DefineOptions) - -if(ARROW_BUILD_SHARED AND NOT ARROW_POSITION_INDEPENDENT_CODE) - message(WARNING "Can't disable position-independent code to build shared libraries, enabling" - ) - set(ARROW_POSITION_INDEPENDENT_CODE ON) -endif() - # Needed for linting targets, etc. if(${CMAKE_VERSION} VERSION_LESS "3.12.0") find_package(PythonInterp) @@ -215,10 +179,49 @@ else() set(PYTHON_EXECUTABLE ${Python3_EXECUTABLE}) endif() +# ---------------------------------------------------------------------- +# cmake options +include(DefineOptions) + +if(ARROW_BUILD_SHARED AND NOT ARROW_POSITION_INDEPENDENT_CODE) + message(WARNING "Can't disable position-independent code to build shared libraries, enabling" + ) + set(ARROW_POSITION_INDEPENDENT_CODE ON) +endif() + +if(ARROW_USE_SCCACHE + AND NOT CMAKE_C_COMPILER_LAUNCHER + AND NOT CMAKE_CXX_COMPILER_LAUNCHER) + + find_program(SCCACHE_FOUND sccache) + + if(NOT SCCACHE_FOUND AND DEFINED ENV{SCCACHE_PATH}) + # cmake has problems finding sccache from within mingw + message(STATUS "Did not find sccache, using envvar fallback.") + set(SCCACHE_FOUND $ENV{SCCACHE_PATH}) + endif() + + # Only use sccache if a storage backend is configured + if(SCCACHE_FOUND + AND (DEFINED ENV{SCCACHE_AZURE_BLOB_CONTAINER} + OR DEFINED ENV{SCCACHE_BUCKET} + OR DEFINED ENV{SCCACHE_DIR} + OR DEFINED ENV{SCCACHE_GCS_BUCKET} + OR DEFINED ENV{SCCACHE_MEMCACHED} + OR DEFINED ENV{SCCACHE_REDIS} + )) + message(STATUS "Using sccache: ${SCCACHE_FOUND}") + set(CMAKE_C_COMPILER_LAUNCHER ${SCCACHE_FOUND}) + set(CMAKE_CXX_COMPILER_LAUNCHER ${SCCACHE_FOUND}) + endif() +endif() + if(ARROW_USE_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAUNCHER) + find_program(CCACHE_FOUND ccache) + if(CCACHE_FOUND) message(STATUS "Using ccache: ${CCACHE_FOUND}") set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE_FOUND}) @@ -226,7 +229,7 @@ if(ARROW_USE_CCACHE # ARROW-3985: let ccache preserve C++ comments, because some of them may be # meaningful to the compiler set(ENV{CCACHE_COMMENTS} "1") - endif(CCACHE_FOUND) + endif() endif() if(ARROW_USE_PRECOMPILED_HEADERS AND ${CMAKE_VERSION} VERSION_LESS "3.16.0") @@ -344,88 +347,6 @@ if(UNIX) add_custom_target(iwyu-all ${BUILD_SUPPORT_DIR}/iwyu/iwyu.sh all) endif(UNIX) -# -# Set up various options -# - -if(ARROW_BUILD_BENCHMARKS - OR ARROW_BUILD_TESTS - OR ARROW_BUILD_INTEGRATION - OR ARROW_FUZZING) - set(ARROW_JSON ON) - 
set(ARROW_TESTING ON) -endif() - -if(ARROW_GANDIVA) - set(ARROW_WITH_RE2 ON) -endif() - -if(ARROW_BUILD_INTEGRATION AND ARROW_FLIGHT) - set(ARROW_FLIGHT_SQL ON) -endif() - -if(ARROW_FLIGHT_SQL) - set(ARROW_FLIGHT ON) -endif() - -if(ARROW_CUDA - OR ARROW_FLIGHT - OR ARROW_PARQUET - OR ARROW_BUILD_TESTS - OR ARROW_BUILD_BENCHMARKS) - set(ARROW_IPC ON) -endif() - -if(ARROW_SUBSTRAIT) - set(ARROW_PARQUET ON) - set(ARROW_IPC ON) - set(ARROW_COMPUTE ON) - set(ARROW_DATASET ON) -endif() - -if(ARROW_SKYHOOK) - set(ARROW_DATASET ON) - set(ARROW_PARQUET ON) - set(ARROW_WITH_LZ4 ON) - set(ARROW_WITH_SNAPPY ON) -endif() - -if(ARROW_DATASET) - set(ARROW_COMPUTE ON) - set(ARROW_FILESYSTEM ON) -endif() - -if(ARROW_PARQUET) - set(ARROW_COMPUTE ON) -endif() - -if(ARROW_PYTHON) - set(ARROW_COMPUTE ON) - set(ARROW_CSV ON) - set(ARROW_DATASET ON) - set(ARROW_FILESYSTEM ON) - set(ARROW_HDFS ON) - set(ARROW_JSON ON) -endif() - -if(MSVC_TOOLCHAIN) - # ORC doesn't build on windows - set(ARROW_ORC OFF) - # Plasma using glog is not fully tested on windows. - set(ARROW_USE_GLOG OFF) -endif() - -if(ARROW_JNI) - set(ARROW_BUILD_STATIC ON) -endif() - -if(ARROW_ORC) - set(ARROW_WITH_LZ4 ON) - set(ARROW_WITH_SNAPPY ON) - set(ARROW_WITH_ZLIB ON) - set(ARROW_WITH_ZSTD ON) -endif() - # datetime code used by iOS requires zlib support if(IOS) set(ARROW_WITH_ZLIB ON) @@ -463,8 +384,9 @@ if(NOT ARROW_BUILD_EXAMPLES) set(NO_EXAMPLES 1) endif() -if(NOT ARROW_FUZZING) - set(NO_FUZZING 1) +if(ARROW_FUZZING) + # Fuzzing builds enable ASAN without setting our home-grown option for it. + add_definitions(-DADDRESS_SANITIZER) endif() if(ARROW_LARGE_MEMORY_TESTS) @@ -567,10 +489,11 @@ endif() include(BuildUtils) enable_testing() -# For arrow.pc. Requires.private and Libs.private are used when -# "pkg-config --libs --static arrow" is used. -set(ARROW_PC_REQUIRES_PRIVATE) -set(ARROW_PC_LIBS_PRIVATE) +# For arrow.pc. Cflags.private, Libs.private and Requires.private are +# used when "pkg-config --cflags --libs --static arrow" is used. +set(ARROW_PC_CFLAGS_PRIVATE " -DARROW_STATIC") +set(ARROW_PC_LIBS_PRIVATE "") +set(ARROW_PC_REQUIRES_PRIVATE "") include(ThirdpartyToolchain) @@ -582,10 +505,10 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARROW_CXXFLAGS}") # C++ specific flags. set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CXX_COMMON_FLAGS} ${ARROW_CXXFLAGS}") -# Remove --std=c++11 to avoid errors from C compilers -string(REPLACE "-std=c++11" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS}) +# Remove --std=c++17 to avoid errors from C compilers +string(REPLACE "-std=c++17" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS}) -# Add C++-only flags, like -std=c++11 +# Add C++-only flags, like -std=c++17 set(CMAKE_CXX_FLAGS "${CXX_ONLY_FLAGS} ${CMAKE_CXX_FLAGS}") # ASAN / TSAN / UBSAN @@ -603,6 +526,10 @@ endif() # CMAKE_CXX_FLAGS now fully assembled message(STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}") message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") +message(STATUS "CMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}: ${CMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}}" +) +message(STATUS "CMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}: ${CMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}}" +) include_directories(${CMAKE_CURRENT_BINARY_DIR}/src) include_directories(src) @@ -719,18 +646,19 @@ endif() # Note that arrow::hadoop is a header only target that refers # cpp/thirdparty/hadoop/include/. See # cpp/cmake_modules/ThirdpartyToolchain.cmake for details. 
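A minimal usage sketch of the pkg-config behaviour described in the arrow.pc change above (assuming arrow.pc is installed and on PKG_CONFIG_PATH; my_app.cc is a hypothetical placeholder, not a file from this patch):

  # Dynamic link: only the public Cflags/Libs/Requires fields of arrow.pc are used
  pkg-config --cflags --libs arrow
  # Static link: --static additionally pulls in Cflags.private, Libs.private and
  # Requires.private, e.g. the -DARROW_STATIC define set by ARROW_PC_CFLAGS_PRIVATE
  pkg-config --cflags --libs --static arrow
  # Hypothetical compile line using the static flags
  g++ my_app.cc $(pkg-config --cflags --libs --static arrow) -o my_app
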
-set(ARROW_LINK_LIBS arrow::flatbuffers arrow::hadoop) +set(ARROW_SHARED_LINK_LIBS arrow::flatbuffers arrow::hadoop) +set(ARROW_SHARED_INSTALL_INTERFACE_LIBS) set(ARROW_STATIC_LINK_LIBS arrow::flatbuffers arrow::hadoop) set(ARROW_STATIC_INSTALL_INTERFACE_LIBS) if(ARROW_USE_BOOST) - list(APPEND ARROW_LINK_LIBS Boost::headers) + list(APPEND ARROW_SHARED_LINK_LIBS Boost::headers) list(APPEND ARROW_STATIC_LINK_LIBS Boost::headers) endif() if(ARROW_USE_OPENSSL) set(ARROW_OPENSSL_LIBS OpenSSL::Crypto OpenSSL::SSL) - list(APPEND ARROW_LINK_LIBS ${ARROW_OPENSSL_LIBS}) + list(APPEND ARROW_SHARED_LINK_LIBS ${ARROW_OPENSSL_LIBS}) list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_OPENSSL_LIBS}) list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS ${ARROW_OPENSSL_LIBS}) endif() @@ -738,7 +666,7 @@ endif() if(ARROW_WITH_BROTLI) # Order is important for static linking set(ARROW_BROTLI_LIBS Brotli::brotlienc Brotli::brotlidec Brotli::brotlicommon) - list(APPEND ARROW_LINK_LIBS ${ARROW_BROTLI_LIBS}) + list(APPEND ARROW_SHARED_LINK_LIBS ${ARROW_BROTLI_LIBS}) list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_BROTLI_LIBS}) if(Brotli_SOURCE STREQUAL "SYSTEM") list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS ${ARROW_BROTLI_LIBS}) @@ -753,9 +681,9 @@ if(ARROW_WITH_BZ2) endif() if(ARROW_WITH_LZ4) - list(APPEND ARROW_STATIC_LINK_LIBS lz4::lz4) + list(APPEND ARROW_STATIC_LINK_LIBS LZ4::lz4) if(lz4_SOURCE STREQUAL "SYSTEM") - list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS lz4::lz4) + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS LZ4::lz4) endif() endif() @@ -781,7 +709,7 @@ if(ARROW_WITH_ZSTD) endif() if(ARROW_ORC) - list(APPEND ARROW_LINK_LIBS orc::liborc ${ARROW_PROTOBUF_LIBPROTOBUF}) + list(APPEND ARROW_SHARED_LINK_LIBS orc::liborc ${ARROW_PROTOBUF_LIBPROTOBUF}) list(APPEND ARROW_STATIC_LINK_LIBS orc::liborc ${ARROW_PROTOBUF_LIBPROTOBUF}) if(ORC_SOURCE STREQUAL "SYSTEM") list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS orc::liborc @@ -790,7 +718,7 @@ if(ARROW_ORC) endif() if(ARROW_GCS) - list(APPEND ARROW_LINK_LIBS google-cloud-cpp::storage) + list(APPEND ARROW_SHARED_LINK_LIBS google-cloud-cpp::storage) list(APPEND ARROW_STATIC_LINK_LIBS google-cloud-cpp::storage) if(google_cloud_cpp_storage_SOURCE STREQUAL "SYSTEM") list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS google-cloud-cpp::storage) @@ -798,7 +726,7 @@ if(ARROW_GCS) endif() if(ARROW_USE_GLOG) - list(APPEND ARROW_LINK_LIBS glog::glog) + list(APPEND ARROW_SHARED_LINK_LIBS glog::glog) list(APPEND ARROW_STATIC_LINK_LIBS glog::glog) if(GLOG_SOURCE STREQUAL "SYSTEM") list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS glog::glog) @@ -807,13 +735,26 @@ if(ARROW_USE_GLOG) endif() if(ARROW_S3) - list(APPEND ARROW_LINK_LIBS ${AWSSDK_LINK_LIBRARIES}) + list(APPEND ARROW_SHARED_LINK_LIBS ${AWSSDK_LINK_LIBRARIES}) list(APPEND ARROW_STATIC_LINK_LIBS ${AWSSDK_LINK_LIBRARIES}) + if(AWSSDK_SOURCE STREQUAL "SYSTEM") + list(APPEND + ARROW_STATIC_INSTALL_INTERFACE_LIBS + aws-cpp-sdk-identity-management + aws-cpp-sdk-sts + aws-cpp-sdk-cognito-identity + aws-cpp-sdk-s3 + aws-cpp-sdk-core) + elseif(AWSSDK_SOURCE STREQUAL "BUNDLED") + if(UNIX) + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS CURL::libcurl) + endif() + endif() endif() if(ARROW_WITH_OPENTELEMETRY) list(APPEND - ARROW_LINK_LIBS + ARROW_SHARED_LINK_LIBS opentelemetry-cpp::trace opentelemetry-cpp::ostream_span_exporter opentelemetry-cpp::otlp_http_exporter) @@ -822,10 +763,21 @@ if(ARROW_WITH_OPENTELEMETRY) opentelemetry-cpp::trace opentelemetry-cpp::ostream_span_exporter opentelemetry-cpp::otlp_http_exporter) + if(opentelemetry_SOURCE 
STREQUAL "SYSTEM") + list(APPEND + ARROW_STATIC_INSTALL_INTERFACE_LIBS + opentelemetry-cpp::trace + opentelemetry-cpp::ostream_span_exporter + opentelemetry-cpp::otlp_http_exporter) + endif() + if(Protobuf_SOURCE STREQUAL "SYSTEM") + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS ${ARROW_PROTOBUF_LIBPROTOBUF}) + endif() + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS CURL::libcurl) endif() if(ARROW_WITH_UTF8PROC) - list(APPEND ARROW_LINK_LIBS utf8proc::utf8proc) + list(APPEND ARROW_SHARED_LINK_LIBS utf8proc::utf8proc) list(APPEND ARROW_STATIC_LINK_LIBS utf8proc::utf8proc) if(utf8proc_SOURCE STREQUAL "SYSTEM") list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS utf8proc::utf8proc) @@ -833,7 +785,7 @@ if(ARROW_WITH_UTF8PROC) endif() if(ARROW_WITH_RE2) - list(APPEND ARROW_LINK_LIBS re2::re2) + list(APPEND ARROW_SHARED_LINK_LIBS re2::re2) list(APPEND ARROW_STATIC_LINK_LIBS re2::re2) if(re2_SOURCE STREQUAL "SYSTEM") list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS re2::re2) @@ -841,12 +793,12 @@ if(ARROW_WITH_RE2) endif() if(ARROW_WITH_RAPIDJSON) - list(APPEND ARROW_LINK_LIBS rapidjson::rapidjson) + list(APPEND ARROW_SHARED_LINK_LIBS rapidjson::rapidjson) list(APPEND ARROW_STATIC_LINK_LIBS rapidjson::rapidjson) endif() if(ARROW_USE_XSIMD) - list(APPEND ARROW_LINK_LIBS xsimd) + list(APPEND ARROW_SHARED_LINK_LIBS xsimd) list(APPEND ARROW_STATIC_LINK_LIBS xsimd) endif() @@ -863,6 +815,9 @@ add_dependencies(arrow_test_dependencies toolchain-tests) if(ARROW_STATIC_LINK_LIBS) add_dependencies(arrow_dependencies ${ARROW_STATIC_LINK_LIBS}) if(ARROW_HDFS OR ARROW_ORC) + if(Protobuf_SOURCE STREQUAL "SYSTEM") + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS ${ARROW_PROTOBUF_LIBPROTOBUF}) + endif() if(NOT MSVC_TOOLCHAIN) list(APPEND ARROW_STATIC_LINK_LIBS ${CMAKE_DL_LIBS}) list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS ${CMAKE_DL_LIBS}) @@ -873,8 +828,7 @@ endif() set(ARROW_SHARED_PRIVATE_LINK_LIBS ${ARROW_STATIC_LINK_LIBS}) if(NOT MSVC_TOOLCHAIN) - list(APPEND ARROW_LINK_LIBS ${CMAKE_DL_LIBS}) - list(APPEND ARROW_SHARED_INSTALL_INTERFACE_LIBS ${CMAKE_DL_LIBS}) + list(APPEND ARROW_SHARED_LINK_LIBS ${CMAKE_DL_LIBS}) endif() set(ARROW_TEST_LINK_TOOLCHAIN arrow::flatbuffers GTest::gtest_main GTest::gtest @@ -890,11 +844,11 @@ if(ARROW_BUILD_BENCHMARKS) toolchain-benchmarks) endif() -set(ARROW_TEST_STATIC_LINK_LIBS arrow_testing_static arrow_static ${ARROW_LINK_LIBS} - ${ARROW_TEST_LINK_TOOLCHAIN}) +set(ARROW_TEST_STATIC_LINK_LIBS arrow_testing_static arrow_static + ${ARROW_SHARED_LINK_LIBS} ${ARROW_TEST_LINK_TOOLCHAIN}) -set(ARROW_TEST_SHARED_LINK_LIBS arrow_testing_shared arrow_shared ${ARROW_LINK_LIBS} - ${ARROW_TEST_LINK_TOOLCHAIN}) +set(ARROW_TEST_SHARED_LINK_LIBS arrow_testing_shared arrow_shared + ${ARROW_SHARED_LINK_LIBS} ${ARROW_TEST_LINK_TOOLCHAIN}) if(NOT MSVC) set(ARROW_TEST_SHARED_LINK_LIBS ${ARROW_TEST_SHARED_LINK_LIBS} ${CMAKE_DL_LIBS}) @@ -934,13 +888,13 @@ if(ARROW_BUILD_BENCHMARKS) endif() if(ARROW_JEMALLOC) - list(APPEND ARROW_LINK_LIBS jemalloc) - list(APPEND ARROW_STATIC_LINK_LIBS jemalloc) + list(APPEND ARROW_SHARED_LINK_LIBS jemalloc::jemalloc) + list(APPEND ARROW_STATIC_LINK_LIBS jemalloc::jemalloc) endif() if(ARROW_MIMALLOC) add_definitions(-DARROW_MIMALLOC) - list(APPEND ARROW_LINK_LIBS mimalloc::mimalloc) + list(APPEND ARROW_SHARED_LINK_LIBS mimalloc::mimalloc) list(APPEND ARROW_STATIC_LINK_LIBS mimalloc::mimalloc) endif() @@ -964,7 +918,7 @@ if(NOT WIN32 AND NOT APPLE) list(APPEND ARROW_SYSTEM_LINK_LIBS rt) endif() -list(APPEND ARROW_LINK_LIBS ${ARROW_SYSTEM_LINK_LIBS}) +list(APPEND 
ARROW_SHARED_LINK_LIBS ${ARROW_SYSTEM_LINK_LIBS}) list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_SYSTEM_LINK_LIBS}) list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS ${ARROW_SYSTEM_LINK_LIBS}) @@ -986,15 +940,6 @@ if(ARROW_PARQUET) endif() endif() -if(ARROW_JNI) - if(ARROW_ORC) - add_subdirectory(../java/adapter/orc/src/main/cpp ./java/orc/jni) - endif() - if(ARROW_DATASET) - add_subdirectory(../java/dataset/src/main/cpp ./java/dataset/jni) - endif() -endif() - if(ARROW_GANDIVA) add_subdirectory(src/gandiva) endif() diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json index 46eef600024..8adbb53bb86 100644 --- a/cpp/CMakePresets.json +++ b/cpp/CMakePresets.json @@ -117,12 +117,48 @@ "ARROW_GANDIVA": "ON" } }, + { + "name": "features-python-minimal", + "inherits": [ + "features-minimal" + ], + "hidden": true, + "cacheVariables": { + "ARROW_COMPUTE": "ON", + "ARROW_CSV": "ON", + "ARROW_FILESYSTEM": "ON", + "ARROW_JSON": "ON" + } + }, { "name": "features-python", - "inherits": "features-main", + "inherits": [ + "features-main" + ], "hidden": true, "cacheVariables": { - "ARROW_PYTHON": "ON" + "ARROW_COMPUTE": "ON", + "ARROW_CSV": "ON", + "ARROW_DATASET": "ON", + "ARROW_FILESYSTEM": "ON", + "ARROW_JSON": "ON", + "ARROW_ORC": "ON" + } + }, + { + "name": "features-python-maximal", + "inherits": [ + "features-cuda", + "features-filesystems", + "features-flight", + "features-gandiva", + "features-main", + "features-python-minimal" + ], + "hidden": true, + "cacheVariables": { + "ARROW_ORC": "ON", + "PARQUET_REQUIRE_ENCRYPTION": "ON" } }, { @@ -133,10 +169,9 @@ "features-filesystems", "features-flight", "features-gandiva", - "features-python" + "features-python-maximal" ], "hidden": true, - "displayName": "Debug build with everything enabled (except benchmarks and CUDA)", "cacheVariables": { "ARROW_BUILD_EXAMPLES": "ON", "ARROW_BUILD_UTILITIES": "ON", @@ -194,10 +229,22 @@ "displayName": "Debug build with tests and Gandiva", "cacheVariables": {} }, + { + "name": "ninja-debug-python-minimal", + "inherits": ["base-debug", "features-python-minimal"], + "displayName": "Debug build for PyArrow with minimal features", + "cacheVariables": {} + }, { "name": "ninja-debug-python", "inherits": ["base-debug", "features-python"], - "displayName": "Debug build with tests and Python support", + "displayName": "Debug build for PyArrow with common features (for backward compatibility)", + "cacheVariables": {} + }, + { + "name": "ninja-debug-python-maximal", + "inherits": ["base-debug", "features-python-maximal"], + "displayName": "Debug build for PyArrow with everything enabled (except CUDA)", "cacheVariables": {} }, { @@ -243,10 +290,22 @@ "displayName": "Release build with Gandiva", "cacheVariables": {} }, + { + "name": "ninja-release-python-minimal", + "inherits": ["base-release", "features-python-minimal"], + "displayName": "Release build for PyArrow with minimal features", + "cacheVariables": {} + }, { "name": "ninja-release-python", "inherits": ["base-release", "features-python"], - "displayName": "Release build with Python support", + "displayName": "Release build for PyArrow with common features (for backward compatibility)", + "cacheVariables": {} + }, + { + "name": "ninja-release-python-maximal", + "inherits": ["base-release", "features-python-maximal"], + "displayName": "Release build for PyArrow with everything enabled (except CUDA)", "cacheVariables": {} }, { diff --git a/cpp/apidoc/Doxyfile b/cpp/apidoc/Doxyfile index 8978dba534b..dabf8cf8c0b 100644 --- a/cpp/apidoc/Doxyfile +++ b/cpp/apidoc/Doxyfile @@ 
-2169,9 +2169,12 @@ INCLUDE_FILE_PATTERNS = PREDEFINED = __attribute__(x)= \ __declspec(x)= \ PARQUET_EXPORT= \ + GANDIVA_EXPORT= \ ARROW_EXPORT= \ ARROW_DS_EXPORT= \ + ARROW_ENGINE_EXPORT= \ ARROW_FLIGHT_EXPORT= \ + ARROW_FLIGHT_SQL_EXPORT= \ ARROW_EXTERN_TEMPLATE= \ ARROW_DEPRECATED(x)= diff --git a/cpp/build-support/cpplint.py b/cpp/build-support/cpplint.py index a40c538e79c..cf1859bb6d4 100755 --- a/cpp/build-support/cpplint.py +++ b/cpp/build-support/cpplint.py @@ -41,6 +41,11 @@ same line, but it is far from perfect (in either direction). """ +# cpplint predates fstrings +# pylint: disable=consider-using-f-string + +# pylint: disable=invalid-name + import codecs import copy import getopt @@ -52,46 +57,40 @@ import sre_compile import string import sys +import sysconfig import unicodedata import xml.etree.ElementTree -# if empty, use defaults -_header_extensions = set([]) - # if empty, use defaults _valid_extensions = set([]) +__VERSION__ = '1.6.1' -# Files with any of these extensions are considered to be -# header files (and will undergo different style checks). -# This set can be extended by using the --headers -# option (also supported in CPPLINT.cfg) -def GetHeaderExtensions(): - if not _header_extensions: - return set(['h', 'hpp', 'hxx', 'h++', 'cuh']) - return _header_extensions - -# The allowed extensions for file names -# This is set by --extensions flag -def GetAllExtensions(): - if not _valid_extensions: - return GetHeaderExtensions().union(set(['c', 'cc', 'cpp', 'cxx', 'c++', 'cu'])) - return _valid_extensions - -def GetNonHeaderExtensions(): - return GetAllExtensions().difference(GetHeaderExtensions()) +try: + # -- pylint: disable=used-before-assignment + xrange # Python 2 +except NameError: + # -- pylint: disable=redefined-builtin + xrange = range # Python 3 _USAGE = """ -Syntax: cpplint.py [--verbose=#] [--output=emacs|eclipse|vs7|junit] +Syntax: cpplint.py [--verbose=#] [--output=emacs|eclipse|vs7|junit|sed|gsed] [--filter=-x,+y,...] - [--counting=total|toplevel|detailed] [--repository=path] - [--root=subdir] [--linelength=digits] [--recursive] + [--counting=total|toplevel|detailed] [--root=subdir] + [--repository=path] + [--linelength=digits] [--headers=x,y,...] + [--recursive] [--exclude=path] - [--headers=ext1,ext2] [--extensions=hpp,cpp,...] + [--includeorder=default|standardcfirst] + [--quiet] + [--version] [file] ... + Style checker for C/C++ source files. + This is a fork of the Google style checker with minor extensions. + The style guidelines this tries to follow are those in https://google.github.io/styleguide/cppguide.html @@ -110,11 +109,16 @@ def GetNonHeaderExtensions(): Flags: - output=emacs|eclipse|vs7|junit - By default, the output is formatted to ease emacs parsing. Output - compatible with eclipse (eclipse), Visual Studio (vs7), and JUnit - XML parsers such as those used in Jenkins and Bamboo may also be - used. Other formats are unsupported. + output=emacs|eclipse|vs7|junit|sed|gsed + By default, the output is formatted to ease emacs parsing. Visual Studio + compatible output (vs7) may also be used. Further support exists for + eclipse (eclipse), and JUnit (junit). XML parsers such as those used + in Jenkins and Bamboo may also be used. + The sed format outputs sed commands that should fix some of the errors. + Note that this requires gnu sed. If that is installed as gsed on your + system (common e.g. on macOS with homebrew) you can use the gsed output + format. 
Sed commands are written to stdout, not stderr, so you should be + able to pipe output straight to a shell to run the fixes. verbose=# Specify a number 0-5 to restrict errors to certain verbosity levels. @@ -122,19 +126,18 @@ def GetNonHeaderExtensions(): likely to be false positives. quiet - Suppress output other than linting errors, such as information about - which files have been processed and excluded. + Don't print anything if no errors are found. filter=-x,+y,... Specify a comma-separated list of category-filters to apply: only error messages whose category names pass the filters will be printed. (Category names are printed with the message and look like "[whitespace/indent]".) Filters are evaluated left to right. - "-FOO" and "FOO" means "do not print categories that start with FOO". + "-FOO" means "do not print categories that start with FOO". "+FOO" means "do print categories that start with FOO". Examples: --filter=-whitespace,+whitespace/braces - --filter=whitespace,runtime/printf,+runtime/printf_format + --filter=-whitespace,-runtime/printf,+runtime/printf_format --filter=-,+build/include_what_you_use To see a list of all the categories used in cpplint, pass no arg: @@ -172,19 +175,21 @@ def GetNonHeaderExtensions(): Bob => SRC_CHROME_BROWSER_UI_BROWSER_H_ root=subdir - The root directory used for deriving header guard CPP variables. This - directory is relative to the top level directory of the repository which - by default is determined by searching for a directory that contains .git, - .hg, or .svn but can also be controlled with the --repository flag. If - the specified directory does not exist, this flag is ignored. + The root directory used for deriving header guard CPP variable. + This directory is relative to the top level directory of the repository + which by default is determined by searching for a directory that contains + .git, .hg, or .svn but can also be controlled with the --repository flag. + If the specified directory does not exist, this flag is ignored. Examples: - Assuming that src is the top level directory of the repository, the - header guard CPP variables for src/chrome/browser/ui/browser.h are: + Assuming that src is the top level directory of the repository (and + cwd=top/src), the header guard CPP variables for + src/chrome/browser/ui/browser.h are: No flag => CHROME_BROWSER_UI_BROWSER_H_ --root=chrome => BROWSER_UI_BROWSER_H_ --root=chrome/browser => UI_BROWSER_H_ + --root=.. => SRC_CHROME_BROWSER_UI_BROWSER_H_ linelength=digits This is the allowed line length for the project. The default value is @@ -216,13 +221,24 @@ def GetNonHeaderExtensions(): Examples: --extensions=%s - headers=extension,extension,... - The allowed header extensions that cpplint will consider to be header files - (by default, only files with extensions %s - will be assumed to be headers) + includeorder=default|standardcfirst + For the build/include_order rule, the default is to blindly assume angle + bracket includes with file extension are c-system-headers (default), + even knowing this will have false classifications. + The default is established at google. + standardcfirst means to instead use an allow-list of known c headers and + treat all others as separate group of "other system headers". The C headers + included are those of the C-standard lib and closely related ones. + + headers=x,y,... + The header extensions that cpplint will treat as .h in checks. Values are + automatically added to --extensions list. 
+ (by default, only files with extensions %s will be assumed to be headers) Examples: --headers=%s + --headers=hpp,hxx + --headers=hpp cpplint.py supports per-directory configurations specified in CPPLINT.cfg files. CPPLINT.cfg file can contain a number of key=value pairs. @@ -233,6 +249,7 @@ def GetNonHeaderExtensions(): exclude_files=regex linelength=80 root=subdir + headers=x,y,... "set noparent" option prevents cpplint from traversing directory tree upwards looking for more .cfg files in parent directories. This option @@ -246,13 +263,16 @@ def GetNonHeaderExtensions(): a file name. If the expression matches, the file is skipped and not run through the linter. - "linelength" specifies the allowed line length for the project. + "linelength" allows to specify the allowed line length for the project. The "root" option is similar in function to the --root flag (see example - above). + above). Paths are relative to the directory of the CPPLINT.cfg. + + The "headers" option is similar in function to the --headers flag + (see example above). CPPLINT.cfg has an effect on files in the same directory and all - subdirectories, unless overridden by a nested configuration file. + sub-directories, unless overridden by a nested configuration file. Example file: filter=-build/include_order,+build/include_alpha @@ -261,11 +281,8 @@ def GetNonHeaderExtensions(): The above example disables build/include_order warning and enables build/include_alpha as well as excludes all .cc from being processed by linter, in the current directory (where the .cfg - file is located) and all subdirectories. -""" % (list(GetAllExtensions()), - ','.join(list(GetAllExtensions())), - GetHeaderExtensions(), - ','.join(GetHeaderExtensions())) + file is located) and all sub-directories. +""" # We categorize each error message we print. Here are the categories. # We want an explicit list so we can list them all in cpplint --filter=. @@ -286,6 +303,7 @@ def GetNonHeaderExtensions(): 'build/include_alpha', 'build/include_order', 'build/include_what_you_use', + 'build/namespaces_headers', 'build/namespaces_literals', 'build/namespaces', 'build/printf_format', @@ -342,6 +360,13 @@ def GetNonHeaderExtensions(): 'whitespace/todo', ] +# keywords to use with --outputs which generate stdout for machine processing +_MACHINE_OUTPUTS = [ + 'junit', + 'sed', + 'gsed' +] + # These error categories are no longer enforced by cpplint, but for backwards- # compatibility they may still appear in NOLINT comments. _LEGACY_ERROR_CATEGORIES = [ @@ -349,6 +374,12 @@ def GetNonHeaderExtensions(): 'readability/function', ] +# These prefixes for categories should be ignored since they relate to other +# tools which also use the NOLINT syntax, e.g. clang-tidy. +_OTHER_NOLINT_CATEGORY_PREFIXES = [ + 'clang-analyzer', + ] + # The default state of the category filter. This is overridden by the --filter= # flag. By default all errors are on, so only add here categories that should be # off by default (i.e., categories that must be enabled by the --filter= flags). 
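A hedged invocation sketch of the cpplint options documented in the usage text above (the foo.cc filename is a placeholder; the gsed output format requires GNU sed, e.g. installed as gsed via Homebrew on macOS):

  # Filters are evaluated left to right: drop all whitespace checks,
  # then re-enable whitespace/braces
  python cpp/build-support/cpplint.py --filter=-whitespace,+whitespace/braces foo.cc
  # The sed/gsed formats write fix-up commands to stdout, so they can be
  # piped straight to a shell to apply the fixes
  python cpp/build-support/cpplint.py --output=gsed foo.cc | sh
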
@@ -477,6 +508,18 @@ def GetNonHeaderExtensions(): 'utility', 'valarray', 'vector', + # 17.6.1.2 C++14 headers + 'shared_mutex', + # 17.6.1.2 C++17 headers + 'any', + 'charconv', + 'codecvt', + 'execution', + 'filesystem', + 'memory_resource', + 'optional', + 'string_view', + 'variant', # 17.6.1.2 C++ headers for C library facilities 'cassert', 'ccomplex', @@ -506,6 +549,186 @@ def GetNonHeaderExtensions(): 'cwctype', ]) +# C headers +_C_HEADERS = frozenset([ + # System C headers + 'assert.h', + 'complex.h', + 'ctype.h', + 'errno.h', + 'fenv.h', + 'float.h', + 'inttypes.h', + 'iso646.h', + 'limits.h', + 'locale.h', + 'math.h', + 'setjmp.h', + 'signal.h', + 'stdalign.h', + 'stdarg.h', + 'stdatomic.h', + 'stdbool.h', + 'stddef.h', + 'stdint.h', + 'stdio.h', + 'stdlib.h', + 'stdnoreturn.h', + 'string.h', + 'tgmath.h', + 'threads.h', + 'time.h', + 'uchar.h', + 'wchar.h', + 'wctype.h', + # additional POSIX C headers + 'aio.h', + 'arpa/inet.h', + 'cpio.h', + 'dirent.h', + 'dlfcn.h', + 'fcntl.h', + 'fmtmsg.h', + 'fnmatch.h', + 'ftw.h', + 'glob.h', + 'grp.h', + 'iconv.h', + 'langinfo.h', + 'libgen.h', + 'monetary.h', + 'mqueue.h', + 'ndbm.h', + 'net/if.h', + 'netdb.h', + 'netinet/in.h', + 'netinet/tcp.h', + 'nl_types.h', + 'poll.h', + 'pthread.h', + 'pwd.h', + 'regex.h', + 'sched.h', + 'search.h', + 'semaphore.h', + 'setjmp.h', + 'signal.h', + 'spawn.h', + 'strings.h', + 'stropts.h', + 'syslog.h', + 'tar.h', + 'termios.h', + 'trace.h', + 'ulimit.h', + 'unistd.h', + 'utime.h', + 'utmpx.h', + 'wordexp.h', + # additional GNUlib headers + 'a.out.h', + 'aliases.h', + 'alloca.h', + 'ar.h', + 'argp.h', + 'argz.h', + 'byteswap.h', + 'crypt.h', + 'endian.h', + 'envz.h', + 'err.h', + 'error.h', + 'execinfo.h', + 'fpu_control.h', + 'fstab.h', + 'fts.h', + 'getopt.h', + 'gshadow.h', + 'ieee754.h', + 'ifaddrs.h', + 'libintl.h', + 'mcheck.h', + 'mntent.h', + 'obstack.h', + 'paths.h', + 'printf.h', + 'pty.h', + 'resolv.h', + 'shadow.h', + 'sysexits.h', + 'ttyent.h', + # Additional linux glibc headers + 'dlfcn.h', + 'elf.h', + 'features.h', + 'gconv.h', + 'gnu-versions.h', + 'lastlog.h', + 'libio.h', + 'link.h', + 'malloc.h', + 'memory.h', + 'netash/ash.h', + 'netatalk/at.h', + 'netax25/ax25.h', + 'neteconet/ec.h', + 'netipx/ipx.h', + 'netiucv/iucv.h', + 'netpacket/packet.h', + 'netrom/netrom.h', + 'netrose/rose.h', + 'nfs/nfs.h', + 'nl_types.h', + 'nss.h', + 're_comp.h', + 'regexp.h', + 'sched.h', + 'sgtty.h', + 'stab.h', + 'stdc-predef.h', + 'stdio_ext.h', + 'syscall.h', + 'termio.h', + 'thread_db.h', + 'ucontext.h', + 'ustat.h', + 'utmp.h', + 'values.h', + 'wait.h', + 'xlocale.h', + # Hardware specific headers + 'arm_neon.h', + 'emmintrin.h', + 'xmmintin.h', + ]) + +# Folders of C libraries so commonly used in C++, +# that they have parity with standard C libraries. +C_STANDARD_HEADER_FOLDERS = frozenset([ + # standard C library + "sys", + # glibc for linux + "arpa", + "asm-generic", + "bits", + "gnu", + "net", + "netinet", + "protocols", + "rpc", + "rpcsvc", + "scsi", + # linux kernel header + "drm", + "linux", + "misc", + "mtd", + "rdma", + "sound", + "video", + "xen", + ]) + # Type names _TYPES = re.compile( r'^(?:' @@ -592,9 +815,10 @@ def GetNonHeaderExtensions(): # _IncludeState.CheckNextIncludeOrder(). 
_C_SYS_HEADER = 1 _CPP_SYS_HEADER = 2 -_LIKELY_MY_HEADER = 3 -_POSSIBLE_MY_HEADER = 4 -_OTHER_HEADER = 5 +_OTHER_SYS_HEADER = 3 +_LIKELY_MY_HEADER = 4 +_POSSIBLE_MY_HEADER = 5 +_OTHER_HEADER = 6 # These constants define the current inline assembly state _NO_ASM = 0 # Outside of inline assembly block @@ -614,6 +838,22 @@ def GetNonHeaderExtensions(): # Match string that indicates we're working on a Linux Kernel file. _SEARCH_KERNEL_FILE = re.compile(r'\b(?:LINT_KERNEL_FILE)') +# Commands for sed to fix the problem +_SED_FIXUPS = { + 'Remove spaces around =': r's/ = /=/', + 'Remove spaces around !=': r's/ != /!=/', + 'Remove space before ( in if (': r's/if (/if(/', + 'Remove space before ( in for (': r's/for (/for(/', + 'Remove space before ( in while (': r's/while (/while(/', + 'Remove space before ( in switch (': r's/switch (/switch(/', + 'Should have a space between // and comment': r's/\/\//\/\/ /', + 'Missing space before {': r's/\([^ ]\){/\1 {/', + 'Tab found, replace by spaces': r's/\t/ /g', + 'Line ends in whitespace. Consider deleting these extra spaces.': r's/\s*$//', + 'You don\'t need a ; after a }': r's/};/}/', + 'Missing space after ,': r's/,\([^ ]\)/, \1/g', +} + _regexp_compile_cache = {} # {str, set(int)}: a map from error categories to sets of linenumbers @@ -623,6 +863,7 @@ def GetNonHeaderExtensions(): # The root directory used for deriving header guard CPP variable. # This is set by --root flag. _root = None +_root_debug = False # The top level repository directory. If set, _root is calculated relative to # this directory instead of the directory containing version control artifacts. @@ -632,27 +873,26 @@ def GetNonHeaderExtensions(): # Files to exclude from linting. This is set by the --exclude flag. _excludes = None -# Whether to suppress PrintInfo messages +# Whether to supress all PrintInfo messages, UNRELATED to --quiet flag _quiet = False # The allowed line length of files. # This is set by --linelength flag. _line_length = 80 -try: - xrange(1, 0) -except NameError: - # -- pylint: disable=redefined-builtin - xrange = range +# This allows to use different include order rule than default +_include_order = "default" try: + # -- pylint: disable=used-before-assignment unicode except NameError: # -- pylint: disable=redefined-builtin basestring = unicode = str try: - long(2) + # -- pylint: disable=used-before-assignment + long except NameError: # -- pylint: disable=redefined-builtin long = int @@ -673,12 +913,58 @@ def unicode_escape_decode(x): else: return x +# Treat all headers starting with 'h' equally: .h, .hpp, .hxx etc. +# This is set by --headers flag. +_hpp_headers = set([]) + # {str, bool}: a map from error categories to booleans which indicate if the # category should be suppressed for every line. _global_error_suppressions = {} +def ProcessHppHeadersOption(val): + global _hpp_headers + try: + _hpp_headers = {ext.strip() for ext in val.split(',')} + except ValueError: + PrintUsage('Header extensions must be comma separated list.') + +def ProcessIncludeOrderOption(val): + if val is None or val == "default": + pass + elif val == "standardcfirst": + global _include_order + _include_order = val + else: + PrintUsage('Invalid includeorder value %s. 
Expected default|standardcfirst') +def IsHeaderExtension(file_extension): + return file_extension in GetHeaderExtensions() +def GetHeaderExtensions(): + if _hpp_headers: + return _hpp_headers + if _valid_extensions: + return {h for h in _valid_extensions if 'h' in h} + return set(['h', 'hh', 'hpp', 'hxx', 'h++', 'cuh']) + +# The allowed extensions for file names +# This is set by --extensions flag +def GetAllExtensions(): + return GetHeaderExtensions().union(_valid_extensions or set( + ['c', 'cc', 'cpp', 'cxx', 'c++', 'cu'])) + +def ProcessExtensionsOption(val): + global _valid_extensions + try: + extensions = [ext.strip() for ext in val.split(',')] + _valid_extensions = set(extensions) + except ValueError: + PrintUsage('Extensions should be a comma-separated list of values;' + 'for example: extensions=hpp,cpp\n' + 'This could not be parsed: "%s"' % (val,)) + +def GetNonHeaderExtensions(): + return GetAllExtensions().difference(GetHeaderExtensions()) def ParseNolintSuppressions(filename, raw_line, linenum, error): """Updates the global list of line error-suppressions. @@ -707,6 +993,9 @@ def ParseNolintSuppressions(filename, raw_line, linenum, error): category = category[1:-1] if category in _ERROR_CATEGORIES: _error_suppressions.setdefault(category, set()).add(suppressed_line) + elif any(c for c in _OTHER_NOLINT_CATEGORY_PREFIXES if category.startswith(c)): + # Ignore any categories from other tools. + pass elif category not in _LEGACY_ERROR_CATEGORIES: error(filename, linenum, 'readability/nolint', 5, 'Unknown NOLINT error category: %s' % category) @@ -812,11 +1101,13 @@ class _IncludeState(object): _MY_H_SECTION = 1 _C_SECTION = 2 _CPP_SECTION = 3 - _OTHER_H_SECTION = 4 + _OTHER_SYS_SECTION = 4 + _OTHER_H_SECTION = 5 _TYPE_NAMES = { _C_SYS_HEADER: 'C system header', _CPP_SYS_HEADER: 'C++ system header', + _OTHER_SYS_HEADER: 'other system header', _LIKELY_MY_HEADER: 'header this file implements', _POSSIBLE_MY_HEADER: 'header this file may implement', _OTHER_HEADER: 'other header', @@ -826,6 +1117,7 @@ class _IncludeState(object): _MY_H_SECTION: 'a header this file implements', _C_SECTION: 'C system header', _CPP_SECTION: 'C++ system header', + _OTHER_SYS_SECTION: 'other system header', _OTHER_H_SECTION: 'other header', } @@ -939,6 +1231,12 @@ def CheckNextIncludeOrder(self, header_type): else: self._last_header = '' return error_message + elif header_type == _OTHER_SYS_HEADER: + if self._section <= self._OTHER_SYS_SECTION: + self._section = self._OTHER_SYS_SECTION + else: + self._last_header = '' + return error_message elif header_type == _LIKELY_MY_HEADER: if self._section <= self._MY_H_SECTION: self._section = self._MY_H_SECTION @@ -973,12 +1271,15 @@ def __init__(self): self._filters_backup = self.filters[:] self.counting = 'total' # In what way are we counting errors? self.errors_by_category = {} # string to int dict storing error counts + self.quiet = False # Suppress non-error messagess? # output format: # "emacs" - format that emacs can parse (default) # "eclipse" - format that eclipse can parse # "vs7" - format that Microsoft Visual Studio 7 can parse # "junit" - format that Jenkins, Bamboo, etc can parse + # "sed" - returns a gnu sed command to fix the problem + # "gsed" - like sed, but names the command gsed, e.g. 
for macOS homebrew users self.output_format = 'emacs' # For JUnit output, save errors and failures until the end so that they @@ -990,6 +1291,12 @@ def SetOutputFormat(self, output_format): """Sets the output format for errors.""" self.output_format = output_format + def SetQuiet(self, quiet): + """Sets the module's quiet settings, and returns the previous setting.""" + last_quiet = self.quiet + self.quiet = quiet + return last_quiet + def SetVerboseLevel(self, level): """Sets the module's verbosity, and returns the previous setting.""" last_verbose_level = self.verbose_level @@ -1061,8 +1368,10 @@ def PrintErrorCounts(self): self.PrintInfo('Total errors found: %d\n' % self.error_count) def PrintInfo(self, message): - if not _quiet and self.output_format != 'junit': - sys.stderr.write(message) + # _quiet does not represent --quiet flag. + # Hide infos from stdout to keep stdout pure for machine consumption + if not _quiet and self.output_format not in _MACHINE_OUTPUTS: + sys.stdout.write(message) def PrintError(self, message): if self.output_format == 'junit': @@ -1079,9 +1388,9 @@ def FormatJUnitXML(self): num_failures = len(self._junit_failures) testsuite = xml.etree.ElementTree.Element('testsuite') - testsuite.attrib['name'] = 'cpplint' testsuite.attrib['errors'] = str(num_errors) testsuite.attrib['failures'] = str(num_failures) + testsuite.attrib['name'] = 'cpplint' if num_errors == 0 and num_failures == 0: testsuite.attrib['tests'] = str(1) @@ -1130,6 +1439,14 @@ def _SetOutputFormat(output_format): """Sets the module's output format.""" _cpplint_state.SetOutputFormat(output_format) +def _Quiet(): + """Return's the module's quiet setting.""" + return _cpplint_state.quiet + +def _SetQuiet(quiet): + """Set the module's quiet status, and return previous setting.""" + return _cpplint_state.SetQuiet(quiet) + def _VerboseLevel(): """Returns the module's verbosity setting.""" @@ -1267,7 +1584,7 @@ def RepositoryName(self): If we have a real absolute path name here we can try to do something smart: detecting the root of the checkout and truncating /path/to/checkout from the name so that we get header guards that don't include things like - "C:\Documents and Settings\..." or "/home/username/..." in them and thus + "C:\\Documents and Settings\\..." or "/home/username/..." in them and thus people on different computers who have checked the source out to different locations won't see bogus errors. 
""" @@ -1405,14 +1722,21 @@ def Error(filename, linenum, category, confidence, message): if _ShouldPrintError(category, confidence, linenum): _cpplint_state.IncrementErrorCount(category) if _cpplint_state.output_format == 'vs7': - _cpplint_state.PrintError('%s(%s): warning: %s [%s] [%d]\n' % ( - filename, linenum, message, category, confidence)) + _cpplint_state.PrintError('%s(%s): error cpplint: [%s] %s [%d]\n' % ( + filename, linenum, category, message, confidence)) elif _cpplint_state.output_format == 'eclipse': sys.stderr.write('%s:%s: warning: %s [%s] [%d]\n' % ( filename, linenum, message, category, confidence)) elif _cpplint_state.output_format == 'junit': - _cpplint_state.AddJUnitFailure(filename, linenum, message, category, - confidence) + _cpplint_state.AddJUnitFailure(filename, linenum, message, category, + confidence) + elif _cpplint_state.output_format in ['sed', 'gsed']: + if message in _SED_FIXUPS: + sys.stdout.write(_cpplint_state.output_format + " -i '%s%s' %s # %s [%s] [%d]\n" % ( + linenum, _SED_FIXUPS[message], filename, message, category, confidence)) + else: + sys.stderr.write('# %s:%s: "%s" [%s] [%d]\n' % ( + filename, linenum, message, category, confidence)) else: final_message = '%s:%s: %s [%s] [%d]\n' % ( filename, linenum, message, category, confidence) @@ -1553,7 +1877,7 @@ def FindNextMultiLineCommentEnd(lines, lineix): def RemoveMultiLineCommentsFromRange(lines, begin, end): """Clears a range of lines for multi-line comments.""" - # Having // dummy comments makes the lines non-empty, so we will not get + # Having // comments makes the lines non-empty, so we will not get # unnecessary blank line warnings later in the code. for i in range(begin, end): lines[i] = '/**/' @@ -1608,6 +1932,7 @@ def __init__(self, lines): self.raw_lines = lines self.num_lines = len(lines) self.lines_without_raw_strings = CleanseRawStrings(lines) + # # pylint: disable=consider-using-enumerate for linenum in range(len(self.lines_without_raw_strings)): self.lines.append(CleanseComments( self.lines_without_raw_strings[linenum])) @@ -1927,8 +2252,8 @@ def CheckForCopyright(filename, lines, error): """Logs an error if no Copyright message appears at the top of the file.""" # We'll say it should occur by line 10. Don't forget there's a - # dummy line at the front. - for line in range(1, min(len(lines), 11)): + # placeholder line at the front. + for line in xrange(1, min(len(lines), 11)): if re.search(r'Copyright', lines[line], re.I): break else: # means no copyright line was found error(filename, 0, 'legal/copyright', 5, @@ -1951,6 +2276,30 @@ def GetIndentLevel(line): else: return 0 +def PathSplitToList(path): + """Returns the path split into a list by the separator. + + Args: + path: An absolute or relative path (e.g. '/a/b/c/' or '../a') + + Returns: + A list of path components (e.g. ['a', 'b', 'c]). + """ + lst = [] + while True: + (head, tail) = os.path.split(path) + if head == path: # absolute paths end + lst.append(head) + break + if tail == path: # relative paths end + lst.append(tail) + break + + path = head + lst.append(tail) + + lst.reverse() + return lst def GetHeaderGuardCPPVariable(filename): """Returns the CPP variable that should be used as a header guard. @@ -1973,13 +2322,59 @@ def GetHeaderGuardCPPVariable(filename): fileinfo = FileInfo(filename) file_path_from_root = fileinfo.RepositoryName() - if _root: - suffix = os.sep - # On Windows using directory separator will leave us with - # "bogus escape error" unless we properly escape regex. 
- if suffix == '\\': - suffix += '\\' - file_path_from_root = re.sub('^' + _root + suffix, '', file_path_from_root) + + def FixupPathFromRoot(): + if _root_debug: + sys.stderr.write("\n_root fixup, _root = '%s', repository name = '%s'\n" + % (_root, fileinfo.RepositoryName())) + + # Process the file path with the --root flag if it was set. + if not _root: + if _root_debug: + sys.stderr.write("_root unspecified\n") + return file_path_from_root + + def StripListPrefix(lst, prefix): + # f(['x', 'y'], ['w, z']) -> None (not a valid prefix) + if lst[:len(prefix)] != prefix: + return None + # f(['a, 'b', 'c', 'd'], ['a', 'b']) -> ['c', 'd'] + return lst[(len(prefix)):] + + # root behavior: + # --root=subdir , lstrips subdir from the header guard + maybe_path = StripListPrefix(PathSplitToList(file_path_from_root), + PathSplitToList(_root)) + + if _root_debug: + sys.stderr.write(("_root lstrip (maybe_path=%s, file_path_from_root=%s," + + " _root=%s)\n") % (maybe_path, file_path_from_root, _root)) + + if maybe_path: + return os.path.join(*maybe_path) + + # --root=.. , will prepend the outer directory to the header guard + full_path = fileinfo.FullName() + # adapt slashes for windows + root_abspath = os.path.abspath(_root).replace('\\', '/') + + maybe_path = StripListPrefix(PathSplitToList(full_path), + PathSplitToList(root_abspath)) + + if _root_debug: + sys.stderr.write(("_root prepend (maybe_path=%s, full_path=%s, " + + "root_abspath=%s)\n") % (maybe_path, full_path, root_abspath)) + + if maybe_path: + return os.path.join(*maybe_path) + + if _root_debug: + sys.stderr.write("_root ignore, returning %s\n" % (file_path_from_root)) + + # --root=FAKE_DIR is ignored + return file_path_from_root + + file_path_from_root = FixupPathFromRoot() return re.sub(r'[^a-zA-Z0-9]', '_', file_path_from_root).upper() + '_' @@ -2095,22 +2490,28 @@ def CheckHeaderFileIncluded(filename, include_state, error): return for ext in GetHeaderExtensions(): - basefilename = filename[0:len(filename) - len(fileinfo.Extension())] - headerfile = basefilename + '.' + ext - if not os.path.exists(headerfile): - continue - headername = FileInfo(headerfile).RepositoryName() - first_include = None - for section_list in include_state.include_list: - for f in section_list: - if headername in f[0] or f[0] in headername: - return - if not first_include: - first_include = f[1] + basefilename = filename[0:len(filename) - len(fileinfo.Extension())] + headerfile = basefilename + '.' + ext + if not os.path.exists(headerfile): + continue + headername = FileInfo(headerfile).RepositoryName() + first_include = None + include_uses_unix_dir_aliases = False + for section_list in include_state.include_list: + for f in section_list: + include_text = f[0] + if "./" in include_text: + include_uses_unix_dir_aliases = True + if headername in include_text or include_text in headername: + return + if not first_include: + first_include = f[1] + + message = '%s should include its header file %s' % (fileinfo.RepositoryName(), headername) + if include_uses_unix_dir_aliases: + message += ". Relative paths like . and .. are not allowed." - error(filename, first_include, 'build/include', 5, - '%s should include its header file %s' % (fileinfo.RepositoryName(), - headername)) + error(filename, first_include, 'build/include', 5, message) def CheckForBadCharacters(filename, lines, error): @@ -2761,7 +3162,7 @@ def Update(self, filename, clean_lines, linenum, error): # }; class_decl_match = Match( r'^(\s*(?:template\s*<[\w\s<>,:=]*>\s*)?' 
- r'(class|struct)\s+(?:[A-Z_]+\s+)*(\w+(?:::\w+)*))' + r'(class|struct)\s+(?:[a-zA-Z0-9_]+\s+)*(\w+(?:::\w+)*))' r'(.*)$', line) if (class_decl_match and (not self.stack or self.stack[-1].open_parentheses == 0)): @@ -2983,7 +3384,8 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum, # Look for single-argument constructors that aren't marked explicit. # Technically a valid construct, but against style. explicit_constructor_match = Match( - r'\s+(?:inline\s+)?(explicit\s+)?(?:inline\s+)?%s\s*' + r'\s+(?:(?:inline|constexpr)\s+)*(explicit\s+)?' + r'(?:(?:inline|constexpr)\s+)*%s\s*' r'\(((?:[^()]|\([^()]*\))*)\)' % re.escape(base_classname), line) @@ -3028,7 +3430,8 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum, Search(r'\bstd\s*::\s*initializer_list\b', constructor_args[0])) copy_constructor = bool( onearg_constructor and - Match(r'(const\s+)?%s(\s*<[^>]*>)?(\s+const)?\s*(?:<\w+>\s*)?&' + Match(r'((const\s+(volatile\s+)?)?|(volatile\s+(const\s+)?))?' + r'%s(\s*<[^>]*>)?(\s+const)?\s*(?:<\w+>\s*)?&' % re.escape(base_classname), constructor_args[0].strip())) if (not is_marked_explicit and @@ -3087,7 +3490,7 @@ def CheckSpacingForFunctionCall(filename, clean_lines, linenum, error): # Note that we assume the contents of [] to be short enough that # they'll never need to wrap. if ( # Ignore control structures. - not Search(r'\b(if|for|while|switch|return|new|delete|catch|sizeof)\b', + not Search(r'\b(if|elif|for|while|switch|return|new|delete|catch|sizeof)\b', fncall) and # Ignore pointers/references to functions. not Search(r' \([^)]+\)\([^)]*(\)|,$)', fncall) and @@ -3103,8 +3506,6 @@ def CheckSpacingForFunctionCall(filename, clean_lines, linenum, error): not Search(r'_{0,2}asm_{0,2}\s+_{0,2}volatile_{0,2}\s+\(', fncall) and not Search(r'#\s*define|typedef|using\s+\w+\s*=', fncall) and not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall) and - not Search(r'\b(' + '|'.join(_ALT_TOKEN_REPLACEMENT.keys()) + r')\b\s+\(', - fncall) and not Search(r'\bcase\s+\(', fncall)): # TODO(unknown): Space after an operator function seem to be a common # error, silence those for now by restricting them to highest verbosity. @@ -3196,13 +3597,13 @@ def CheckForFunctionLengths(filename, clean_lines, linenum, if starting_func: body_found = False - for start_linenum in range(linenum, clean_lines.NumLines()): + for start_linenum in xrange(linenum, clean_lines.NumLines()): start_line = lines[start_linenum] joined_line += ' ' + start_line.lstrip() if Search(r'(;|})', start_line): # Declarations and trivial functions body_found = True break # ... ignore - elif Search(r'{', start_line): + if Search(r'{', start_line): body_found = True function = Search(r'((\w|:)*)\(', line).group(1) if Match(r'TEST', function): # Handle TEST... macros @@ -3281,36 +3682,6 @@ def CheckComment(line, filename, linenum, next_line_start, error): 'Should have a space between // and comment') -def CheckAccess(filename, clean_lines, linenum, nesting_state, error): - """Checks for improper use of DISALLOW* macros. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - nesting_state: A NestingState instance which maintains information about - the current stack of nested blocks being parsed. - error: The function to call with any errors found. 
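The explicit-constructor pattern rewritten above now tolerates 'inline' and 'constexpr' on either side of 'explicit'. A quick standalone check of that regex (cpplint's Match() is just re.match):

import re

base_classname = 'Foo'
explicit_constructor_re = (
    r'\s+(?:(?:inline|constexpr)\s+)*(explicit\s+)?'
    r'(?:(?:inline|constexpr)\s+)*%s\s*'
    r'\(((?:[^()]|\([^()]*\))*)\)' % re.escape(base_classname))

for decl in ('  constexpr explicit Foo(int x)',  # marked explicit, group(1) set
             '  constexpr Foo(int x)'):          # single-argument, not explicit
    m = re.match(explicit_constructor_re, decl)
    print(decl.strip(), '->', 'explicit' if (m and m.group(1)) else 'not explicit')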
- """ - line = clean_lines.elided[linenum] # get rid of comments and strings - - matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|' - r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line) - if not matched: - return - if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo): - if nesting_state.stack[-1].access != 'private': - error(filename, linenum, 'readability/constructors', 3, - '%s must be in the private: section' % matched.group(1)) - - else: - # Found DISALLOW* macro outside a class declaration, or perhaps it - # was used inside a function when it should have been part of the - # class declaration. We could issue a warning here, but it - # probably resulted in a compiler error already. - pass - - def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): """Checks for the correctness of various spacing issues in the code. @@ -3425,9 +3796,10 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): # get rid of comments and strings line = clean_lines.elided[linenum] - # You shouldn't have spaces before your brackets, except maybe after - # 'delete []' or 'return []() {};' - if Search(r'\w\s+\[', line) and not Search(r'(?:delete|return)\s+\[', line): + # You shouldn't have spaces before your brackets, except for C++11 attributes + # or maybe after 'delete []', 'return []() {};', or 'auto [abc, ...] = ...;'. + if (Search(r'\w\s+\[(?!\[)', line) and + not Search(r'(?:auto&?|delete|return)\s+\[', line)): error(filename, linenum, 'whitespace/braces', 5, 'Extra space before [') @@ -3945,11 +4317,11 @@ def CheckBraces(filename, clean_lines, linenum, error): # its line, and the line after that should have an indent level equal to or # lower than the if. We also check for ambiguous if/else nesting without # braces. - if_else_match = Search(r'\b(if\s*\(|else\b)', line) + if_else_match = Search(r'\b(if\s*(|constexpr)\s*\(|else\b)', line) if if_else_match and not Match(r'\s*#', line): if_indent = GetIndentLevel(line) endline, endlinenum, endpos = line, linenum, if_else_match.end() - if_match = Search(r'\bif\s*\(', line) + if_match = Search(r'\bif\s*(|constexpr)\s*\(', line) if if_match: # This could be a multiline if condition, so find the end first. pos = if_match.end() - 1 @@ -4008,9 +4380,9 @@ def CheckTrailingSemicolon(filename, clean_lines, linenum, error): # Block bodies should not be followed by a semicolon. Due to C++11 # brace initialization, there are more places where semicolons are - # required than not, so we use explicitly list the allowed rules - # rather than listing the disallowed ones. These are the places - # where "};" should be replaced by just "}": + # required than not, so we explicitly list the allowed rules rather + # than listing the disallowed ones. These are the places where "};" + # should be replaced by just "}": # 1. Some flavor of block following closing parenthesis: # for (;;) {}; # while (...) 
{}; @@ -4434,6 +4806,16 @@ def GetLineWidth(line): if unicodedata.east_asian_width(uc) in ('W', 'F'): width += 2 elif not unicodedata.combining(uc): + # Issue 337 + # https://mail.python.org/pipermail/python-list/2012-August/628809.html + if (sys.version_info.major, sys.version_info.minor) <= (3, 2): + # https://github.com/python/cpython/blob/2.7/Include/unicodeobject.h#L81 + is_wide_build = sysconfig.get_config_var("Py_UNICODE_SIZE") >= 4 + # https://github.com/python/cpython/blob/2.7/Objects/unicodeobject.c#L564 + is_low_surrogate = 0xDC00 <= ord(uc) <= 0xDFFF + if not is_wide_build and is_low_surrogate: + width -= 1 + width += 1 return width else: @@ -4481,7 +4863,7 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, # if(match($0, " <<")) complain = 0; # if(match(prev, " +for \\(")) complain = 0; # if(prevodd && match(prevprev, " +for \\(")) complain = 0; - scope_or_label_pattern = r'\s*\w+\s*:\s*\\?$' + scope_or_label_pattern = r'\s*(?:public|private|protected|signals)(?:\s+(?:slots\s*)?)?:\s*\\?$' classinfo = nesting_state.InnermostClass() initial_spaces = 0 cleansed_line = clean_lines.elided[linenum] @@ -4507,7 +4889,7 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, # Check if the line is a header guard. is_header_guard = False - if file_extension in GetHeaderExtensions(): + if IsHeaderExtension(file_extension): cppvar = GetHeaderGuardCPPVariable(filename) if (line.startswith('#ifndef %s' % cppvar) or line.startswith('#define %s' % cppvar) or @@ -4553,7 +4935,6 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, CheckBraces(filename, clean_lines, linenum, error) CheckTrailingSemicolon(filename, clean_lines, linenum, error) CheckEmptyBlockBody(filename, clean_lines, linenum, error) - CheckAccess(filename, clean_lines, linenum, nesting_state, error) CheckSpacing(filename, clean_lines, linenum, nesting_state, error) CheckOperatorSpacing(filename, clean_lines, linenum, error) CheckParenthesisSpacing(filename, clean_lines, linenum, error) @@ -4606,13 +4987,14 @@ def _DropCommonSuffixes(filename): return os.path.splitext(filename)[0] -def _ClassifyInclude(fileinfo, include, is_system): +def _ClassifyInclude(fileinfo, include, used_angle_brackets, include_order="default"): """Figures out what kind of header 'include' is. Args: fileinfo: The current file cpplint is running over. A FileInfo instance. include: The path to a #included file. - is_system: True if the #include used <> rather than "". + used_angle_brackets: True if the #include used <> rather than "". + include_order: "default" or other value allowed in program arguments Returns: One of the _XXX_HEADER constants. @@ -4622,6 +5004,8 @@ def _ClassifyInclude(fileinfo, include, is_system): _C_SYS_HEADER >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'string', True) _CPP_SYS_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/foo.h', True, "standardcfirst") + _OTHER_SYS_HEADER >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/foo.h', False) _LIKELY_MY_HEADER >>> _ClassifyInclude(FileInfo('foo/foo_unknown_extension.cc'), @@ -4632,17 +5016,24 @@ def _ClassifyInclude(fileinfo, include, is_system): """ # This is a list of all standard c++ header files, except # those already checked for above. - is_cpp_h = include in _CPP_HEADERS + is_cpp_header = include in _CPP_HEADERS + + # Mark include as C header if in list or in a known folder for standard-ish C headers. 
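The GetLineWidth() change above only affects old narrow-Unicode Python builds, where characters outside the BMP are stored as surrogate pairs; the surrounding width logic is unchanged. A reduced, modern-Python version of that logic, for reference:

import unicodedata

def display_width(line):
    width = 0
    for ch in line:
        if unicodedata.east_asian_width(ch) in ('W', 'F'):
            width += 2                     # fullwidth CJK counts as two columns
        elif not unicodedata.combining(ch):
            width += 1                     # combining marks count as zero
    return width

print(display_width('int x = 0;'))  # 10
print(display_width('テスト'))        # 6: three fullwidth characters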
+ is_std_c_header = (include_order == "default") or (include in _C_HEADERS + # additional linux glibc header folders + or Search(r'(?:%s)\/.*\.h' % "|".join(C_STANDARD_HEADER_FOLDERS), include)) # Headers with C++ extensions shouldn't be considered C system headers - if is_system and os.path.splitext(include)[1] in ['.hpp', '.hxx', '.h++']: - is_system = False + include_ext = os.path.splitext(include)[1] + is_system = used_angle_brackets and not include_ext in ['.hh', '.hpp', '.hxx', '.h++'] if is_system: - if is_cpp_h: + if is_cpp_header: return _CPP_SYS_HEADER - else: + if is_std_c_header: return _C_SYS_HEADER + else: + return _OTHER_SYS_HEADER # If the target file and the include we're checking share a # basename when we drop common extensions, and the include @@ -4696,10 +5087,12 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): # # We also make an exception for Lua headers, which follow google # naming convention but not the include convention. - match = Match(r'#include\s*"([^/]+\.h)"', line) - if match and not _THIRD_PARTY_HEADERS_PATTERN.match(match.group(1)): - error(filename, linenum, 'build/include_subdir', 4, - 'Include the directory when naming .h files') + match = Match(r'#include\s*"([^/]+\.(.*))"', line) + if match: + if (IsHeaderExtension(match.group(2)) and + not _THIRD_PARTY_HEADERS_PATTERN.match(match.group(1))): + error(filename, linenum, 'build/include_subdir', 4, + 'Include the directory when naming header files') # we shouldn't include a file more than once. actually, there are a # handful of instances where doing so is okay, but in general it's @@ -4707,7 +5100,7 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): match = _RE_PATTERN_INCLUDE.search(line) if match: include = match.group(2) - is_system = (match.group(1) == '<') + used_angle_brackets = (match.group(1) == '<') duplicate_line = include_state.FindHeader(include) if duplicate_line >= 0: error(filename, linenum, 'build/include', 4, @@ -4722,7 +5115,19 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): 'Do not include .' + extension + ' files from other packages') return - if not _THIRD_PARTY_HEADERS_PATTERN.match(include): + # We DO want to include a 3rd party looking header if it matches the + # filename. Otherwise we get an erroneous error "...should include its + # header" error later. + third_src_header = False + for ext in GetHeaderExtensions(): + basefilename = filename[0:len(filename) - len(fileinfo.Extension())] + headerfile = basefilename + '.' + ext + headername = FileInfo(headerfile).RepositoryName() + if headername in include or include in headername: + third_src_header = True + break + + if third_src_header or not _THIRD_PARTY_HEADERS_PATTERN.match(include): include_state.include_list[-1].append((include, linenum)) # We want to ensure that headers appear in the right order: @@ -4737,7 +5142,7 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): # track of the highest type seen, and complains if we see a # lower type after that. error_message = include_state.CheckNextIncludeOrder( - _ClassifyInclude(fileinfo, include, is_system)) + _ClassifyInclude(fileinfo, include, used_angle_brackets, _include_order)) if error_message: error(filename, linenum, 'build/include_order', 4, '%s. Should be: %s.h, c system, c++ system, other.' 
% @@ -4876,7 +5281,7 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, CheckGlobalStatic(filename, clean_lines, linenum, error) CheckPrintf(filename, clean_lines, linenum, error) - if file_extension in GetHeaderExtensions(): + if IsHeaderExtension(file_extension): # TODO(unknown): check that 1-arg constructors are explicit. # How to tell it's a constructor? # (handled in CheckForNonStandardConstructs for now) @@ -4988,10 +5393,10 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, # Check for use of unnamed namespaces in header files. Registration # macros are typically OK, so we allow use of "namespace {" on lines # that end with backslashes. - if (file_extension in GetHeaderExtensions() + if (IsHeaderExtension(file_extension) and Search(r'\bnamespace\s*{', line) and line[-1] != '\\'): - error(filename, linenum, 'build/namespaces', 4, + error(filename, linenum, 'build/namespaces_headers', 4, 'Do not use unnamed namespaces in header files. See ' 'https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces' ' for more information.') @@ -5282,9 +5687,9 @@ def CheckForNonConstReference(filename, clean_lines, linenum, # We also accept & in static_assert, which looks like a function but # it's actually a declaration expression. allowed_functions = (r'(?:[sS]wap(?:<\w:+>)?|' - r'operator\s*[<>][<>]|' - r'static_assert|COMPILE_ASSERT' - r')\s*\(') + r'operator\s*[<>][<>]|' + r'static_assert|COMPILE_ASSERT' + r')\s*\(') if Search(allowed_functions, line): return elif not Search(r'\S+\([^)]*$', line): @@ -5368,7 +5773,7 @@ def CheckCasts(filename, clean_lines, linenum, error): if not expecting_function: CheckCStyleCast(filename, clean_lines, linenum, 'static_cast', - r'\((int|float|double|bool|char|u?int(16|32|64))\)', error) + r'\((int|float|double|bool|char|u?int(16|32|64)|size_t)\)', error) # This doesn't catch all cases. Consider (const char * const)"hello". # @@ -5460,7 +5865,8 @@ def CheckCStyleCast(filename, clean_lines, linenum, cast_type, pattern, error): return False # operator++(int) and operator--(int) - if context.endswith(' operator++') or context.endswith(' operator--'): + if (context.endswith(' operator++') or context.endswith(' operator--') or + context.endswith('::operator++') or context.endswith('::operator--')): return False # A single unnamed argument for a function tends to look like old style cast. @@ -5521,11 +5927,11 @@ def ExpectingFunctionArgs(clean_lines, linenum): )), ('', ('numeric_limits',)), ('', ('list',)), - ('', ('map', 'multimap',)), + ('', ('multimap',)), ('', ('allocator', 'make_shared', 'make_unique', 'shared_ptr', 'unique_ptr', 'weak_ptr')), ('', ('queue', 'priority_queue',)), - ('', ('set', 'multiset',)), + ('', ('multiset',)), ('', ('stack',)), ('', ('char_traits', 'basic_string',)), ('', ('tuple',)), @@ -5554,11 +5960,21 @@ def ExpectingFunctionArgs(clean_lines, linenum): for _header, _templates in _HEADERS_MAYBE_TEMPLATES: for _template in _templates: # Match max(..., ...), max(..., ...), but not foo->max, foo.max or - # type::max(). + # 'type::max()'. 
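Among the CheckCasts() changes above, size_t joins the primitive types whose C-style casts are flagged as static_cast candidates. Checking the updated pattern in isolation:

import re

c_style_cast_pattern = r'\((int|float|double|bool|char|u?int(16|32|64)|size_t)\)'
for snippet in ('n = (size_t)length;',
                'n = static_cast<size_t>(length);'):
    hit = re.search(c_style_cast_pattern, snippet)
    print(snippet, '->', 'flag C-style cast' if hit else 'ok')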
_re_pattern_headers_maybe_templates.append( (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'), _template, _header)) +# Match set, but not foo->set, foo.set +_re_pattern_headers_maybe_templates.append( + (re.compile(r'[^>.]\bset\s*\<'), + 'set<>', + '')) +# Match 'map var' and 'std::map(...)', but not 'map(...)'' +_re_pattern_headers_maybe_templates.append( + (re.compile(r'(std\b::\bmap\s*\<)|(^(std\b::\b)map\b\(\s*\<)'), + 'map<>', + '')) # Other scripts may reach in and modify this pattern. _re_pattern_templates = [] @@ -5604,7 +6020,7 @@ def FilesBelongToSameModule(filename_cc, filename_h): return (False, '') fileinfo_h = FileInfo(filename_h) - if not fileinfo_h.Extension().lstrip('.') in GetHeaderExtensions(): + if not IsHeaderExtension(fileinfo_h.Extension().lstrip('.')): return (False, '') filename_cc = filename_cc[:-(len(fileinfo_cc.Extension()))] @@ -5641,18 +6057,19 @@ def UpdateIncludeState(filename, include_dict, io=codecs): """ headerfile = None try: - headerfile = io.open(filename, 'r', 'utf8', 'replace') + with io.open(filename, 'r', 'utf8', 'replace') as headerfile: + linenum = 0 + for line in headerfile: + linenum += 1 + clean_line = CleanseComments(line) + match = _RE_PATTERN_INCLUDE.search(clean_line) + if match: + include = match.group(2) + include_dict.setdefault(include, linenum) + return True except IOError: return False - linenum = 0 - for line in headerfile: - linenum += 1 - clean_line = CleanseComments(line) - match = _RE_PATTERN_INCLUDE.search(clean_line) - if match: - include = match.group(2) - include_dict.setdefault(include, linenum) - return True + def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, @@ -5676,7 +6093,7 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, required = {} # A map of header name to linenumber and the template entity. # Example of required: { '': (1219, 'less<>') } - for linenum in range(clean_lines.NumLines()): + for linenum in xrange(clean_lines.NumLines()): line = clean_lines.elided[linenum] if not line or line[0] == '#': continue @@ -6082,10 +6499,10 @@ def ProcessFileData(filename, file_extension, lines, error, RemoveMultiLineComments(filename, lines, error) clean_lines = CleansedLines(lines) - if file_extension in GetHeaderExtensions(): + if IsHeaderExtension(file_extension): CheckForHeaderGuard(filename, clean_lines, error) - for line in range(clean_lines.NumLines()): + for line in xrange(clean_lines.NumLines()): ProcessLine(filename, file_extension, clean_lines, line, include_state, function_state, nesting_state, error, extra_check_functions) @@ -6128,7 +6545,7 @@ def ProcessConfigOverrides(filename): continue try: - with open(cfg_file) as file_handle: + with codecs.open(cfg_file, 'r', 'utf8', 'replace') as file_handle: for line in file_handle: line, _, _ = line.partition('#') # Remove comments. if not line.strip(): @@ -6151,37 +6568,30 @@ def ProcessConfigOverrides(filename): if base_name: pattern = re.compile(val) if pattern.match(base_name): - _cpplint_state.PrintInfo('Ignoring "%s": file excluded by ' - '"%s". File path component "%s" matches pattern "%s"\n' % - (filename, cfg_file, base_name, val)) + if _cpplint_state.quiet: + # Suppress "Ignoring file" warning when using --quiet. + return False + _cpplint_state.PrintInfo('Ignoring "%s": file excluded by "%s". 
' + 'File path component "%s" matches ' + 'pattern "%s"\n' % + (filename, cfg_file, base_name, val)) return False elif name == 'linelength': global _line_length try: - _line_length = int(val) + _line_length = int(val) except ValueError: - _cpplint_state.PrintError('Line length must be numeric.') + _cpplint_state.PrintError('Line length must be numeric.') elif name == 'extensions': - global _valid_extensions - try: - extensions = [ext.strip() for ext in val.split(',')] - _valid_extensions = set(extensions) - except ValueError: - sys.stderr.write('Extensions should be a comma-separated list of values;' - 'for example: extensions=hpp,cpp\n' - 'This could not be parsed: "%s"' % (val,)) - elif name == 'headers': - global _header_extensions - try: - extensions = [ext.strip() for ext in val.split(',')] - _header_extensions = set(extensions) - except ValueError: - sys.stderr.write('Extensions should be a comma-separated list of values;' - 'for example: extensions=hpp,cpp\n' - 'This could not be parsed: "%s"' % (val,)) + ProcessExtensionsOption(val) elif name == 'root': global _root - _root = val + # root directories are specified relative to CPPLINT.cfg dir. + _root = os.path.join(os.path.dirname(cfg_file), val) + elif name == 'headers': + ProcessHppHeadersOption(val) + elif name == 'includeorder': + ProcessIncludeOrderOption(val) else: _cpplint_state.PrintError( 'Invalid configuration option (%s) in file %s\n' % @@ -6195,7 +6605,7 @@ def ProcessConfigOverrides(filename): # Apply all the accumulated filters in reverse order (top-level directory # config options having the least priority). for cfg_filter in reversed(cfg_filters): - _AddFilters(cfg_filter) + _AddFilters(cfg_filter) return True @@ -6216,6 +6626,7 @@ def ProcessFile(filename, vlevel, extra_check_functions=None): _SetVerboseLevel(vlevel) _BackupFilters() + old_errors = _cpplint_state.error_count if not ProcessConfigOverrides(filename): _RestoreFilters() @@ -6237,7 +6648,8 @@ def ProcessFile(filename, vlevel, extra_check_functions=None): codecs.getwriter('utf8'), 'replace').read().split('\n') else: - lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n') + with codecs.open(filename, 'r', 'utf8', 'replace') as target_file: + lines = target_file.read().split('\n') # Remove trailing '\r'. # The -1 accounts for the extra trailing blank line we get from split() @@ -6284,7 +6696,10 @@ def ProcessFile(filename, vlevel, extra_check_functions=None): Error(filename, linenum, 'whitespace/newline', 1, 'Unexpected \\r (^M) found; better to use only \\n') - _cpplint_state.PrintInfo('Done processing %s\n' % filename) + # Suppress printing anything if --quiet was passed unless the error + # count has increased after processing this file. + if not _cpplint_state.quiet or old_errors != _cpplint_state.error_count: + _cpplint_state.PrintInfo('Done processing %s\n' % filename) _RestoreFilters() @@ -6294,13 +6709,21 @@ def PrintUsage(message): Args: message: The optional error message. 
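One behavioural change buried in the ProcessConfigOverrides() hunk above: a root= value in CPPLINT.cfg is now resolved relative to the directory containing that CPPLINT.cfg rather than the current working directory. A small illustration (the paths below are hypothetical):

import os

def resolve_root(cfg_file, val):
    # Mirrors: _root = os.path.join(os.path.dirname(cfg_file), val)
    return os.path.join(os.path.dirname(cfg_file), val)

print(resolve_root('/home/dev/arrow/cpp/CPPLINT.cfg', 'src'))
# -> /home/dev/arrow/cpp/src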
""" - sys.stderr.write(_USAGE) + sys.stderr.write(_USAGE % (sorted(list(GetAllExtensions())), + ','.join(sorted(list(GetAllExtensions()))), + sorted(GetHeaderExtensions()), + ','.join(sorted(GetHeaderExtensions())))) if message: sys.exit('\nFATAL ERROR: ' + message) else: sys.exit(0) +def PrintVersion(): + sys.stdout.write('Cpplint fork (https://github.com/cpplint/cpplint)\n') + sys.stdout.write('cpplint ' + __VERSION__ + '\n') + sys.stdout.write('Python ' + sys.version + '\n') + sys.exit(0) def PrintCategories(): """Prints a list of all the error-categories used by error messages. @@ -6324,6 +6747,8 @@ def ParseArguments(args): """ try: (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=', + 'v=', + 'version', 'counting=', 'filter=', 'root=', @@ -6331,27 +6756,33 @@ def ParseArguments(args): 'linelength=', 'extensions=', 'exclude=', + 'recursive', 'headers=', - 'quiet', - 'recursive']) + 'includeorder=', + 'quiet']) except getopt.GetoptError: PrintUsage('Invalid arguments.') verbosity = _VerboseLevel() output_format = _OutputFormat() filters = '' + quiet = _Quiet() counting_style = '' recursive = False for (opt, val) in opts: if opt == '--help': PrintUsage(None) + if opt == '--version': + PrintVersion() elif opt == '--output': - if val not in ('emacs', 'vs7', 'eclipse', 'junit'): + if val not in ('emacs', 'vs7', 'eclipse', 'junit', 'sed', 'gsed'): PrintUsage('The only allowed output formats are emacs, vs7, eclipse ' - 'and junit.') + 'sed, gsed and junit.') output_format = val - elif opt == '--verbose': + elif opt == '--quiet': + quiet = True + elif opt == '--verbose' or opt == '--v': verbosity = int(val) elif opt == '--filter': filters = val @@ -6379,22 +6810,13 @@ def ParseArguments(args): _excludes = set() _excludes.update(glob.glob(val)) elif opt == '--extensions': - global _valid_extensions - try: - _valid_extensions = set(val.split(',')) - except ValueError: - PrintUsage('Extensions must be comma separated list.') + ProcessExtensionsOption(val) elif opt == '--headers': - global _header_extensions - try: - _header_extensions = set(val.split(',')) - except ValueError: - PrintUsage('Extensions must be comma separated list.') + ProcessHppHeadersOption(val) elif opt == '--recursive': recursive = True - elif opt == '--quiet': - global _quiet - _quiet = True + elif opt == '--includeorder': + ProcessIncludeOrderOption(val) if not filenames: PrintUsage('No files were specified.') @@ -6406,10 +6828,12 @@ def ParseArguments(args): filenames = _FilterExcludedFiles(filenames) _SetOutputFormat(output_format) + _SetQuiet(quiet) _SetVerboseLevel(verbosity) _SetFilters(filters) _SetCountingStyle(counting_style) + filenames.sort() return filenames def _ExpandDirectories(filenames): @@ -6426,30 +6850,50 @@ def _ExpandDirectories(filenames): """ expanded = set() for filename in filenames: - if not os.path.isdir(filename): - expanded.add(filename) - continue + if not os.path.isdir(filename): + expanded.add(filename) + continue - for root, _, files in os.walk(filename): - for loopfile in files: - fullname = os.path.join(root, loopfile) - if fullname.startswith('.' + os.path.sep): - fullname = fullname[len('.' + os.path.sep):] - expanded.add(fullname) + for root, _, files in os.walk(filename): + for loopfile in files: + fullname = os.path.join(root, loopfile) + if fullname.startswith('.' + os.path.sep): + fullname = fullname[len('.' 
+ os.path.sep):] + expanded.add(fullname) filtered = [] for filename in expanded: - if os.path.splitext(filename)[1][1:] in GetAllExtensions(): - filtered.append(filename) - + if os.path.splitext(filename)[1][1:] in GetAllExtensions(): + filtered.append(filename) return filtered -def _FilterExcludedFiles(filenames): +def _FilterExcludedFiles(fnames): """Filters out files listed in the --exclude command line switch. File paths in the switch are evaluated relative to the current working directory """ exclude_paths = [os.path.abspath(f) for f in _excludes] - return [f for f in filenames if os.path.abspath(f) not in exclude_paths] + # because globbing does not work recursively, exclude all subpath of all excluded entries + return [f for f in fnames + if not any(e for e in exclude_paths + if _IsParentOrSame(e, os.path.abspath(f)))] + +def _IsParentOrSame(parent, child): + """Return true if child is subdirectory of parent. + Assumes both paths are absolute and don't contain symlinks. + """ + parent = os.path.normpath(parent) + child = os.path.normpath(child) + if parent == child: + return True + + prefix = os.path.commonprefix([parent, child]) + if prefix != parent: + return False + # Note: os.path.commonprefix operates on character basis, so + # take extra care of situations like '/foo/ba' and '/foo/bar/baz' + child_suffix = child[len(prefix):] + child_suffix = child_suffix.lstrip(os.sep) + return child == os.path.join(prefix, child_suffix) def main(): filenames = ParseArguments(sys.argv[1:]) @@ -6462,7 +6906,9 @@ def main(): _cpplint_state.ResetErrorCounts() for filename in filenames: ProcessFile(filename, _cpplint_state.verbose_level) - _cpplint_state.PrintErrorCounts() + # If --quiet is passed, suppress printing error count unless there are errors. + if not _cpplint_state.quiet or _cpplint_state.error_count > 0: + _cpplint_state.PrintErrorCounts() if _cpplint_state.output_format == 'junit': sys.stderr.write(_cpplint_state.FormatJUnitXML()) diff --git a/cpp/build-support/lint_exclusions.txt b/cpp/build-support/lint_exclusions.txt index 73cbd884f44..195c3dee36a 100644 --- a/cpp/build-support/lint_exclusions.txt +++ b/cpp/build-support/lint_exclusions.txt @@ -1,5 +1,7 @@ -*_generated* *.grpc.fb.* +*.pb.* +*RcppExports.cpp* +*_generated* *arrowExports.cpp* *parquet_constants.* *parquet_types.* @@ -7,7 +9,6 @@ *pyarrow_lib.h *python/config.h *python/platform.h -*RcppExports.cpp* *thirdparty/* *vendored/* *windows_compatibility.h diff --git a/cpp/build-support/run-test.sh b/cpp/build-support/run-test.sh index d2d327cfddd..7f68abd31c3 100755 --- a/cpp/build-support/run-test.sh +++ b/cpp/build-support/run-test.sh @@ -109,8 +109,7 @@ function run_test() { # XML output from gtest. We assume that gtest knows better than us and our # regexes in most cases, but for certain errors we delete the resulting xml # file and let our own post-processing step regenerate it. - export GREP=$(which egrep) - if zgrep --silent "ThreadSanitizer|Leak check.*detected leaks" $LOGFILE ; then + if grep -E -q "ThreadSanitizer|Leak check.*detected leaks" $LOGFILE ; then echo ThreadSanitizer or leak check failures in $LOGFILE STATUS=1 rm -f $XMLFILE @@ -157,7 +156,7 @@ function post_process_tests() { # If we have a LeakSanitizer report, and XML reporting is configured, add a new test # case result to the XML file for the leak report. Otherwise Jenkins won't show # us which tests had LSAN errors. 
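The new _IsParentOrSame() helper above lets --exclude drop every file underneath an excluded directory, not just exact matches. It deliberately avoids trusting os.path.commonprefix alone, which compares characters rather than path components. A standalone copy of the same logic:

import os

def is_parent_or_same(parent, child):
    parent = os.path.normpath(parent)
    child = os.path.normpath(child)
    if parent == child:
        return True
    prefix = os.path.commonprefix([parent, child])
    if prefix != parent:
        return False
    # Re-join to make sure the match ended on a path component boundary.
    child_suffix = child[len(prefix):].lstrip(os.sep)
    return child == os.path.join(prefix, child_suffix)

print(is_parent_or_same('/foo/bar', '/foo/bar/baz.cc'))  # True: inside the excluded dir
print(is_parent_or_same('/foo/ba', '/foo/bar/baz.cc'))   # False: only a string prefix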
- if zgrep --silent "ERROR: LeakSanitizer: detected memory leaks" $LOGFILE ; then + if grep -E -q "ERROR: LeakSanitizer: detected memory leaks" $LOGFILE ; then echo Test had memory leaks. Editing XML perl -p -i -e ' if (m##) { diff --git a/cpp/build-support/update-flatbuffers.sh b/cpp/build-support/update-flatbuffers.sh index b1116a1cbf7..52da752068a 100755 --- a/cpp/build-support/update-flatbuffers.sh +++ b/cpp/build-support/update-flatbuffers.sh @@ -33,9 +33,6 @@ OUT_DIR="$SOURCE_DIR/generated" FILES=($(find $FORMAT_DIR -name '*.fbs')) FILES+=("$SOURCE_DIR/arrow/ipc/feather.fbs") -# add compute ir files -FILES+=($(find "$TOP/experimental/computeir" -name '*.fbs')) - $FLATC --cpp --cpp-std c++11 \ --scoped-enums \ -o "$OUT_DIR" \ diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 888ca19af58..89172ccf66e 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -73,7 +73,7 @@ endfunction() # Based on MIT-licensed # https://gist.github.com/cristianadam/ef920342939a89fae3e8a85ca9459b49 -function(create_merged_static_lib output_target) +function(arrow_create_merged_static_lib output_target) set(options) set(one_value_args NAME ROOT) set(multi_value_args TO_MERGE) @@ -136,17 +136,37 @@ function(create_merged_static_lib output_target) message(FATAL_ERROR "Unknown bundle scenario!") endif() - add_custom_command(COMMAND ${BUNDLE_COMMAND} - OUTPUT ${output_lib_path} - COMMENT "Bundling ${output_lib_path}" - VERBATIM) + add_custom_target(${output_target}_merge ALL + ${BUNDLE_COMMAND} + DEPENDS ${ARG_ROOT} ${ARG_TO_MERGE} + BYPRODUCTS ${output_lib_path} + COMMENT "Bundling ${output_lib_path}" + VERBATIM) message(STATUS "Creating bundled static library target ${output_target} at ${output_lib_path}" ) - add_custom_target(${output_target} ALL DEPENDS ${output_lib_path}) - add_dependencies(${output_target} ${ARG_ROOT} ${ARG_TO_MERGE}) - install(FILES ${output_lib_path} DESTINATION ${CMAKE_INSTALL_LIBDIR}) + add_library(${output_target} STATIC IMPORTED) + set_target_properties(${output_target} PROPERTIES IMPORTED_LOCATION ${output_lib_path}) + add_dependencies(${output_target} ${output_target}_merge) +endfunction() + +function(arrow_install_cmake_package PACKAGE_NAME EXPORT_NAME) + set(CONFIG_CMAKE "${PACKAGE_NAME}Config.cmake") + set(BUILT_CONFIG_CMAKE "${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_CMAKE}") + configure_package_config_file("${CONFIG_CMAKE}.in" "${BUILT_CONFIG_CMAKE}" + INSTALL_DESTINATION "${ARROW_CMAKE_DIR}/${PACKAGE_NAME}") + set(CONFIG_VERSION_CMAKE "${PACKAGE_NAME}ConfigVersion.cmake") + set(BUILT_CONFIG_VERSION_CMAKE "${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_VERSION_CMAKE}") + write_basic_package_version_file("${BUILT_CONFIG_VERSION_CMAKE}" + COMPATIBILITY SameMajorVersion) + install(FILES "${BUILT_CONFIG_CMAKE}" "${BUILT_CONFIG_VERSION_CMAKE}" + DESTINATION "${ARROW_CMAKE_DIR}/${PACKAGE_NAME}") + set(TARGETS_CMAKE "${PACKAGE_NAME}Targets.cmake") + install(EXPORT ${EXPORT_NAME} + DESTINATION "${ARROW_CMAKE_DIR}/${PACKAGE_NAME}" + NAMESPACE "${PACKAGE_NAME}::" + FILE "${TARGETS_CMAKE}") endfunction() # \arg OUTPUTS list to append built targets to @@ -156,9 +176,12 @@ function(ADD_ARROW_LIB LIB_NAME) BUILD_SHARED BUILD_STATIC CMAKE_PACKAGE_NAME + INSTALL_ARCHIVE_DIR + INSTALL_LIBRARY_DIR + INSTALL_RUNTIME_DIR PKG_CONFIG_NAME - SHARED_LINK_FLAGS - PRECOMPILED_HEADER_LIB) + PRECOMPILED_HEADER_LIB + SHARED_LINK_FLAGS) set(multi_value_args SOURCES PRECOMPILED_HEADERS @@ -169,6 +192,7 @@ function(ADD_ARROW_LIB LIB_NAME) 
EXTRA_INCLUDES PRIVATE_INCLUDES DEPENDENCIES + DEFINITIONS SHARED_INSTALL_INTERFACE_LIBS STATIC_INSTALL_INTERFACE_LIBS OUTPUT_PATH) @@ -227,6 +251,9 @@ function(ADD_ARROW_LIB LIB_NAME) if(ARG_DEPENDENCIES) add_dependencies(${LIB_NAME}_objlib ${ARG_DEPENDENCIES}) endif() + if(ARG_DEFINITIONS) + target_compile_definitions(${LIB_NAME}_objlib PRIVATE ${ARG_DEFINITIONS}) + endif() if(ARG_PRECOMPILED_HEADER_LIB) reuse_precompiled_header_lib(${LIB_NAME}_objlib ${ARG_PRECOMPILED_HEADER_LIB}) endif() @@ -234,7 +261,6 @@ function(ADD_ARROW_LIB LIB_NAME) target_precompile_headers(${LIB_NAME}_objlib PRIVATE ${ARG_PRECOMPILED_HEADERS}) endif() set(LIB_DEPS $) - set(LIB_INCLUDES) set(EXTRA_DEPS) if(ARG_OUTPUTS) @@ -247,21 +273,45 @@ function(ADD_ARROW_LIB LIB_NAME) if(ARG_PRIVATE_INCLUDES) target_include_directories(${LIB_NAME}_objlib PRIVATE ${ARG_PRIVATE_INCLUDES}) endif() - target_link_libraries(${LIB_NAME}_objlib - PRIVATE ${ARG_SHARED_LINK_LIBS} ${ARG_SHARED_PRIVATE_LINK_LIBS} - ${ARG_STATIC_LINK_LIBS}) + if(BUILD_SHARED) + if(ARG_SHARED_LINK_LIBS) + target_link_libraries(${LIB_NAME}_objlib PRIVATE ${ARG_SHARED_LINK_LIBS}) + endif() + if(ARG_SHARED_PRIVATE_LINK_LIBS) + target_link_libraries(${LIB_NAME}_objlib PRIVATE ${ARG_SHARED_PRIVATE_LINK_LIBS}) + endif() + endif() + if(BUILD_STATIC AND ARG_STATIC_LINK_LIBS) + target_link_libraries(${LIB_NAME}_objlib PRIVATE ${ARG_STATIC_LINK_LIBS}) + endif() else() # Prepare arguments for separate compilation of static and shared libs below # TODO: add PCH directives set(LIB_DEPS ${ARG_SOURCES}) set(EXTRA_DEPS ${ARG_DEPENDENCIES}) + endif() - if(ARG_EXTRA_INCLUDES) - set(LIB_INCLUDES ${ARG_EXTRA_INCLUDES}) - endif() + if(ARG_EXTRA_INCLUDES) + set(LIB_INCLUDES ${ARG_EXTRA_INCLUDES}) + else() + set(LIB_INCLUDES "") endif() - set(RUNTIME_INSTALL_DIR bin) + if(ARG_INSTALL_ARCHIVE_DIR) + set(INSTALL_ARCHIVE_DIR ${ARG_INSTALL_ARCHIVE_DIR}) + else() + set(INSTALL_ARCHIVE_DIR ${CMAKE_INSTALL_LIBDIR}) + endif() + if(ARG_INSTALL_LIBRARY_DIR) + set(INSTALL_LIBRARY_DIR ${ARG_INSTALL_LIBRARY_DIR}) + else() + set(INSTALL_LIBRARY_DIR ${CMAKE_INSTALL_LIBDIR}) + endif() + if(ARG_INSTALL_RUNTIME_DIR) + set(INSTALL_RUNTIME_DIR ${ARG_INSTALL_RUNTIME_DIR}) + else() + set(INSTALL_RUNTIME_DIR bin) + endif() if(BUILD_SHARED) add_library(${LIB_NAME}_shared SHARED ${LIB_DEPS}) @@ -269,6 +319,10 @@ function(ADD_ARROW_LIB LIB_NAME) add_dependencies(${LIB_NAME}_shared ${EXTRA_DEPS}) endif() + if(ARG_DEFINITIONS) + target_compile_definitions(${LIB_NAME}_shared PRIVATE ${ARG_DEFINITIONS}) + endif() + if(ARG_PRECOMPILED_HEADER_LIB) reuse_precompiled_header_lib(${LIB_NAME}_shared ${ARG_PRECOMPILED_HEADER_LIB}) endif() @@ -309,11 +363,9 @@ function(ADD_ARROW_LIB LIB_NAME) SOVERSION "${ARROW_SO_VERSION}") target_link_libraries(${LIB_NAME}_shared - LINK_PUBLIC - "$" - "$" - LINK_PRIVATE - ${ARG_SHARED_PRIVATE_LINK_LIBS}) + PUBLIC "$" + "$" + PRIVATE ${ARG_SHARED_PRIVATE_LINK_LIBS}) if(USE_OBJLIB) # Ensure that dependencies are built before compilation of objects in @@ -348,9 +400,9 @@ function(ADD_ARROW_LIB LIB_NAME) install(TARGETS ${LIB_NAME}_shared ${INSTALL_IS_OPTIONAL} EXPORT ${LIB_NAME}_targets - RUNTIME DESTINATION ${RUNTIME_INSTALL_DIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${INSTALL_ARCHIVE_DIR} + LIBRARY DESTINATION ${INSTALL_LIBRARY_DIR} + RUNTIME DESTINATION ${INSTALL_RUNTIME_DIR} INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() @@ -361,6 +413,10 @@ function(ADD_ARROW_LIB LIB_NAME) 
add_dependencies(${LIB_NAME}_static ${EXTRA_DEPS}) endif() + if(ARG_DEFINITIONS) + target_compile_definitions(${LIB_NAME}_static PRIVATE ${ARG_DEFINITIONS}) + endif() + if(ARG_PRECOMPILED_HEADER_LIB) reuse_precompiled_header_lib(${LIB_NAME}_static ${ARG_PRECOMPILED_HEADER_LIB}) endif() @@ -394,13 +450,14 @@ function(ADD_ARROW_LIB LIB_NAME) OUTPUT_NAME ${LIB_NAME_STATIC}) if(ARG_STATIC_INSTALL_INTERFACE_LIBS) - target_link_libraries(${LIB_NAME}_static LINK_PUBLIC - "$") + target_link_libraries(${LIB_NAME}_static + INTERFACE "$" + ) endif() if(ARG_STATIC_LINK_LIBS) - target_link_libraries(${LIB_NAME}_static LINK_PRIVATE - "$") + target_link_libraries(${LIB_NAME}_static + PUBLIC "$") if(USE_OBJLIB) # Ensure that dependencies are built before compilation of objects in # object library, rather than only before the final link step @@ -414,34 +471,15 @@ function(ADD_ARROW_LIB LIB_NAME) install(TARGETS ${LIB_NAME}_static ${INSTALL_IS_OPTIONAL} EXPORT ${LIB_NAME}_targets - RUNTIME DESTINATION ${RUNTIME_INSTALL_DIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${INSTALL_ARCHIVE_DIR} + LIBRARY DESTINATION ${INSTALL_LIBRARY_DIR} + RUNTIME DESTINATION ${INSTALL_RUNTIME_DIR} INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() if(ARG_CMAKE_PACKAGE_NAME) - arrow_install_cmake_find_module("${ARG_CMAKE_PACKAGE_NAME}") - - set(TARGETS_CMAKE "${ARG_CMAKE_PACKAGE_NAME}Targets.cmake") - install(EXPORT ${LIB_NAME}_targets - FILE "${TARGETS_CMAKE}" - DESTINATION "${ARROW_CMAKE_DIR}") - - set(CONFIG_CMAKE "${ARG_CMAKE_PACKAGE_NAME}Config.cmake") - set(BUILT_CONFIG_CMAKE "${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_CMAKE}") - configure_package_config_file("${CONFIG_CMAKE}.in" "${BUILT_CONFIG_CMAKE}" - INSTALL_DESTINATION "${ARROW_CMAKE_DIR}") - install(FILES "${BUILT_CONFIG_CMAKE}" DESTINATION "${ARROW_CMAKE_DIR}") - - set(CONFIG_VERSION_CMAKE "${ARG_CMAKE_PACKAGE_NAME}ConfigVersion.cmake") - set(BUILT_CONFIG_VERSION_CMAKE "${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_VERSION_CMAKE}") - write_basic_package_version_file( - "${BUILT_CONFIG_VERSION_CMAKE}" - VERSION ${${PROJECT_NAME}_VERSION} - COMPATIBILITY AnyNewerVersion) - install(FILES "${BUILT_CONFIG_VERSION_CMAKE}" DESTINATION "${ARROW_CMAKE_DIR}") + arrow_install_cmake_package(${ARG_CMAKE_PACKAGE_NAME} ${LIB_NAME}_targets) endif() if(ARG_PKG_CONFIG_NAME) @@ -624,7 +662,8 @@ function(ADD_TEST_CASE REL_TEST_NAME) LABELS EXTRA_LABELS TEST_ARGUMENTS - PREFIX) + PREFIX + DEFINITIONS) cmake_parse_arguments(ARG "${options}" "${one_value_args}" @@ -695,6 +734,10 @@ function(ADD_TEST_CASE REL_TEST_NAME) add_dependencies(${TEST_NAME} ${ARG_EXTRA_DEPENDENCIES}) endif() + if(ARG_DEFINITIONS) + target_compile_definitions(${TEST_NAME} PRIVATE ${ARG_DEFINITIONS}) + endif() + if(ARROW_TEST_MEMCHECK AND NOT ARG_NO_VALGRIND) add_test(${TEST_NAME} bash @@ -846,7 +889,7 @@ function(ADD_FUZZ_TARGET REL_FUZZING_NAME) message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") endif() - if(NO_FUZZING) + if(NOT ARROW_FUZZING) return() endif() @@ -901,16 +944,15 @@ function(ARROW_INSTALL_ALL_HEADERS PATH) endfunction() function(ARROW_ADD_PKG_CONFIG MODULE) - configure_file(${MODULE}.pc.in "${CMAKE_CURRENT_BINARY_DIR}/${MODULE}.pc" @ONLY) + configure_file(${MODULE}.pc.in "${CMAKE_CURRENT_BINARY_DIR}/${MODULE}.pc.generate.in" + @ONLY) + file(GENERATE + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${MODULE}.pc" + INPUT "${CMAKE_CURRENT_BINARY_DIR}/${MODULE}.pc.generate.in") install(FILES 
"${CMAKE_CURRENT_BINARY_DIR}/${MODULE}.pc" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") endfunction() -function(ARROW_INSTALL_CMAKE_FIND_MODULE MODULE) - install(FILES "${ARROW_SOURCE_DIR}/cmake_modules/Find${MODULE}.cmake" - DESTINATION "${ARROW_CMAKE_DIR}") -endfunction() - # Implementations of lisp "car" and "cdr" functions macro(ARROW_CAR var) set(${var} ${ARGV1}) diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index d5590a95ee4..040a6f58296 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -50,6 +50,18 @@ function(list_join lst glue out) endfunction() macro(define_option name description default) + set(options) + set(one_value_args) + set(multi_value_args DEPENDS) + cmake_parse_arguments(ARG + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN}) + if(ARG_UNPARSED_ARGUMENTS) + message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") + endif() + check_description_length(${name} ${description}) list_join(description "\n" multiline_description) @@ -59,6 +71,7 @@ macro(define_option name description default) set("${name}_OPTION_DESCRIPTION" ${description}) set("${name}_OPTION_DEFAULT" ${default}) set("${name}_OPTION_TYPE" "bool") + set("${name}_OPTION_DEPENDS" ${ARG_DEPENDS}) endmacro() macro(define_option_string name description default) @@ -81,6 +94,48 @@ macro(define_option_string name description default) endif() endmacro() +# Topological sort by Tarjan's algorithm. +set(ARROW_BOOL_OPTION_DEPENDENCIES_TSORTED) +macro(tsort_bool_option_dependencies_visit option_name) + if("${${option_name}_TSORT_STATUS}" STREQUAL "VISITING") + message(FATAL_ERROR "Cyclic option dependency is detected: ${option_name}") + elseif("${${option_name}_TSORT_STATUS}" STREQUAL "") + set(${option_name}_TSORT_STATUS "VISITING") + foreach(needed_option_name ${${option_name}_OPTION_DEPENDS}) + tsort_bool_option_dependencies_visit(${needed_option_name}) + endforeach() + set(${option_name}_TSORT_STATUS "VISITED") + list(INSERT ARROW_BOOL_OPTION_DEPENDENCIES_TSORTED 0 ${option_name}) + endif() +endmacro() +macro(tsort_bool_option_dependencies) + foreach(category ${ARROW_OPTION_CATEGORIES}) + foreach(option_name ${ARROW_${category}_OPTION_NAMES}) + if("${${option_name}_OPTION_TYPE}" STREQUAL "bool") + if("${${option_name}_TSORT_STATUS}" STREQUAL "") + tsort_bool_option_dependencies_visit(${option_name}) + endif() + endif() + endforeach() + endforeach() +endmacro() + +macro(resolve_option_dependencies) + if(MSVC_TOOLCHAIN) + # Plasma using glog is not fully tested on windows. 
+ set(ARROW_USE_GLOG OFF) + endif() + + tsort_bool_option_dependencies() + foreach(option_name ${ARROW_BOOL_OPTION_DEPENDENCIES_TSORTED}) + if(${${option_name}}) + foreach(depended_option_name ${${option_name}_OPTION_DEPENDS}) + set(${depended_option_name} ON) + endforeach() + endif() + endforeach() +endmacro() + # Top level cmake dir if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") set(ARROW_DEFINE_OPTIONS_DEFAULT ON) @@ -114,6 +169,9 @@ if(ARROW_DEFINE_OPTIONS) define_option(ARROW_USE_CCACHE "Use ccache when compiling (if available)" ON) + define_option(ARROW_USE_SCCACHE "Use sccache when compiling (if available),;\ +takes precedence over ccache if a storage backend is configured" ON) + define_option(ARROW_USE_LD_GOLD "Use ld.gold for linking on Linux (if available)" OFF) define_option(ARROW_USE_PRECOMPILED_HEADERS "Use precompiled headers when compiling" @@ -127,6 +185,10 @@ if(ARROW_DEFINE_OPTIONS) "AVX2" "AVX512" "NEON" + "SVE" # size agnostic SVE + "SVE128" # fixed size SVE + "SVE256" # " + "SVE512" # " "DEFAULT") define_option_string(ARROW_RUNTIME_SIMD_LEVEL @@ -138,17 +200,6 @@ if(ARROW_DEFINE_OPTIONS) "AVX512" "MAX") - # Arm64 architectures and extensions can lead to exploding combinations. - # So set it directly through cmake command line. - # - # If you change this, you need to change the definition in - # python/CMakeLists.txt too. - define_option_string(ARROW_ARMV8_ARCH - "Arm64 arch and extensions" - "armv8-a" # Default - "armv8-a" - "armv8-a+crc+crypto") - define_option(ARROW_ALTIVEC "Build with Altivec if compiler has support" ON) define_option(ARROW_RPATH_ORIGIN "Build Arrow libraries with RATH set to \$ORIGIN" OFF) @@ -158,19 +209,34 @@ if(ARROW_DEFINE_OPTIONS) define_option(ARROW_GGDB_DEBUG "Pass -ggdb flag to debug builds" ON) + define_option(ARROW_WITH_MUSL "Whether the system libc is musl or not" OFF) + #---------------------------------------------------------------------- set_option_category("Test and benchmark") define_option(ARROW_BUILD_EXAMPLES "Build the Arrow examples" OFF) - define_option(ARROW_BUILD_TESTS "Build the Arrow googletest unit tests" OFF) + define_option(ARROW_BUILD_TESTS + "Build the Arrow googletest unit tests" + OFF + DEPENDS + ARROW_IPC + ARROW_TESTING) define_option(ARROW_ENABLE_TIMING_TESTS "Enable timing-sensitive tests" ON) - define_option(ARROW_BUILD_INTEGRATION "Build the Arrow integration test executables" - OFF) + define_option(ARROW_BUILD_INTEGRATION + "Build the Arrow integration test executables" + OFF + DEPENDS + ARROW_TESTING) - define_option(ARROW_BUILD_BENCHMARKS "Build the Arrow micro benchmarks" OFF) + define_option(ARROW_BUILD_BENCHMARKS + "Build the Arrow micro benchmarks" + OFF + DEPENDS + ARROW_IPC + ARROW_TESTING) # Reference benchmarks are used to compare to naive implementation, or # discover various hardware limits. 
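define_option() now accepts DEPENDS, and the tsort_bool_option_dependencies/resolve_option_dependencies macros above enable dependencies transitively: options are topologically sorted by a DFS with cycle detection, then every option that is ON switches its dependencies ON. A minimal Python sketch of the same algorithm (the dependency table is truncated to a few real options for illustration):

def resolve_option_dependencies(options, depends):
    order, state = [], {}

    def visit(name):                      # DFS with cycle detection
        if state.get(name) == 'VISITING':
            raise ValueError('cyclic option dependency: ' + name)
        if name not in state:
            state[name] = 'VISITING'
            for dep in depends.get(name, ()):
                visit(dep)
            state[name] = 'VISITED'
            order.insert(0, name)         # dependents end up before their dependencies

    for name in options:
        visit(name)
    for name in order:                    # propagate ON to dependencies, transitively
        if options[name]:
            for dep in depends.get(name, ()):
                options[dep] = True
    return options

options = {'ARROW_BUILD_TESTS': True, 'ARROW_TESTING': False,
           'ARROW_IPC': False, 'ARROW_JSON': False}
depends = {'ARROW_BUILD_TESTS': ['ARROW_IPC', 'ARROW_TESTING'],
           'ARROW_TESTING': ['ARROW_JSON']}
print(resolve_option_dependencies(options, depends))
# ARROW_BUILD_TESTS=ON pulls in ARROW_IPC, ARROW_TESTING and, transitively, ARROW_JSON.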
@@ -195,7 +261,11 @@ if(ARROW_DEFINE_OPTIONS) "shared" "static") - define_option(ARROW_FUZZING "Build Arrow Fuzzing executables" OFF) + define_option(ARROW_FUZZING + "Build Arrow Fuzzing executables" + OFF + DEPENDS + ARROW_TESTING) define_option(ARROW_LARGE_MEMORY_TESTS "Enable unit tests which use large memory" OFF) @@ -230,20 +300,39 @@ if(ARROW_DEFINE_OPTIONS) define_option(ARROW_CSV "Build the Arrow CSV Parser Module" OFF) - define_option(ARROW_CUDA "Build the Arrow CUDA extensions (requires CUDA toolkit)" OFF) - - define_option(ARROW_DATASET "Build the Arrow Dataset Modules" OFF) + define_option(ARROW_CUDA + "Build the Arrow CUDA extensions (requires CUDA toolkit)" + OFF + DEPENDS + ARROW_IPC) - define_option(ARROW_SUBSTRAIT "Build the Arrow Substrait Consumer Module" OFF) + define_option(ARROW_DATASET + "Build the Arrow Dataset Modules" + OFF + DEPENDS + ARROW_COMPUTE + ARROW_FILESYSTEM) define_option(ARROW_FILESYSTEM "Build the Arrow Filesystem Layer" OFF) define_option(ARROW_FLIGHT - "Build the Arrow Flight RPC System (requires GRPC, Protocol Buffers)" OFF) - - define_option(ARROW_FLIGHT_SQL "Build the Arrow Flight SQL extension" OFF) - - define_option(ARROW_GANDIVA "Build the Gandiva libraries" OFF) + "Build the Arrow Flight RPC System (requires GRPC, Protocol Buffers)" + OFF + DEPENDS + ARROW_IPC) + + define_option(ARROW_FLIGHT_SQL + "Build the Arrow Flight SQL extension" + OFF + DEPENDS + ARROW_FLIGHT) + + define_option(ARROW_GANDIVA + "Build the Gandiva libraries" + OFF + DEPENDS + ARROW_WITH_RE2 + ARROW_WITH_UTF8PROC) define_option(ARROW_GCS "Build Arrow with GCS support (requires the GCloud SDK for C++)" OFF) @@ -264,29 +353,66 @@ if(ARROW_DEFINE_OPTIONS) define_option(ARROW_JEMALLOC ${ARROW_JEMALLOC_DESCRIPTION} ON) endif() - define_option(ARROW_JNI "Build the Arrow JNI lib" OFF) - define_option(ARROW_JSON "Build Arrow with JSON support (requires RapidJSON)" OFF) define_option(ARROW_MIMALLOC "Build the Arrow mimalloc-based allocator" OFF) - define_option(ARROW_PARQUET "Build the Parquet libraries" OFF) - - define_option(ARROW_ORC "Build the Arrow ORC adapter" OFF) + define_option(ARROW_PARQUET + "Build the Parquet libraries" + OFF + DEPENDS + ARROW_COMPUTE + ARROW_IPC) + + define_option(ARROW_ORC + "Build the Arrow ORC adapter" + OFF + DEPENDS + ARROW_WITH_LZ4 + ARROW_WITH_SNAPPY + ARROW_WITH_ZLIB + ARROW_WITH_ZSTD) define_option(ARROW_PLASMA "Build the plasma object store along with Arrow" OFF) - define_option(ARROW_PLASMA_JAVA_CLIENT "Build the plasma object store java client" OFF) - - define_option(ARROW_PYTHON "Build the Arrow CPython extensions" OFF) + define_option(ARROW_PYTHON + "Build some components needed by PyArrow.;\ +(This is a deprecated option. 
Use CMake presets instead.)" + OFF + DEPENDS + ARROW_COMPUTE + ARROW_CSV + ARROW_DATASET + ARROW_FILESYSTEM + ARROW_HDFS + ARROW_JSON) define_option(ARROW_S3 "Build Arrow with S3 support (requires the AWS SDK for C++)" OFF) - define_option(ARROW_SKYHOOK "Build the Skyhook libraries" OFF) + define_option(ARROW_SKYHOOK + "Build the Skyhook libraries" + OFF + DEPENDS + ARROW_DATASET + ARROW_PARQUET + ARROW_WITH_LZ4 + ARROW_WITH_SNAPPY) + + define_option(ARROW_SUBSTRAIT + "Build the Arrow Substrait Consumer Module" + OFF + DEPENDS + ARROW_DATASET + ARROW_IPC + ARROW_PARQUET) define_option(ARROW_TENSORFLOW "Build Arrow with TensorFlow support enabled" OFF) - define_option(ARROW_TESTING "Build the Arrow testing libraries" OFF) + define_option(ARROW_TESTING + "Build the Arrow testing libraries" + OFF + DEPENDS + ARROW_JSON) #---------------------------------------------------------------------- set_option_category("Thirdparty toolchain") @@ -465,8 +591,6 @@ Always OFF if building binaries" OFF) #---------------------------------------------------------------------- set_option_category("Gandiva") - define_option(ARROW_GANDIVA_JAVA "Build the Gandiva JNI wrappers" OFF) - # ARROW-3860: Temporary workaround define_option(ARROW_GANDIVA_STATIC_LIBSTDCPP "Include -static-libstdc++ -static-libgcc when linking with;Gandiva static libraries" @@ -490,6 +614,8 @@ that have not been built" option(ARROW_BUILD_CONFIG_SUMMARY_JSON "Summarize build configuration in a JSON file" ON) + + resolve_option_dependencies() endif() macro(validate_config) @@ -555,6 +681,9 @@ macro(config_summary_message) endforeach() + if(ARROW_PYTHON) + message(WARNING "ARROW_PYTHON is deprecated. Use CMake presets instead.") + endif() endmacro() macro(config_summary_json) diff --git a/cpp/cmake_modules/FindAWSSDKAlt.cmake b/cpp/cmake_modules/FindAWSSDKAlt.cmake new file mode 100644 index 00000000000..611184aa1d1 --- /dev/null +++ b/cpp/cmake_modules/FindAWSSDKAlt.cmake @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set(find_package_args) +if(AWSSDKAlt_FIND_VERSION) + list(APPEND find_package_args ${AWSSDKAlt_FIND_VERSION}) +endif() +if(AWSSDKAlt_FIND_QUIETLY) + list(APPEND find_package_args QUIET) +endif() +# See https://aws.amazon.com/blogs/developer/developer-experience-of-the-aws-sdk-for-c-now-simplified-by-cmake/ +# Workaround to force AWS CMake configuration to look for shared libraries +if(DEFINED ENV{CONDA_PREFIX}) + if(DEFINED BUILD_SHARED_LIBS) + set(BUILD_SHARED_LIBS_WAS_SET TRUE) + set(BUILD_SHARED_LIBS_KEEP ${BUILD_SHARED_LIBS}) + else() + set(BUILD_SHARED_LIBS_WAS_SET FALSE) + endif() + set(BUILD_SHARED_LIBS ON) +endif() +find_package(AWSSDK ${find_package_args} + COMPONENTS config + s3 + transfer + identity-management + sts) +# Restore previous value of BUILD_SHARED_LIBS +if(DEFINED ENV{CONDA_PREFIX}) + if(BUILD_SHARED_LIBS_WAS_SET) + set(BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS_KEEP}) + else() + unset(BUILD_SHARED_LIBS) + endif() +endif() +set(AWSSDKAlt_FOUND ${AWSSDK_FOUND}) diff --git a/cpp/cmake_modules/FindArrow.cmake b/cpp/cmake_modules/FindArrow.cmake deleted file mode 100644 index 9d2faaf5819..00000000000 --- a/cpp/cmake_modules/FindArrow.cmake +++ /dev/null @@ -1,466 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# - Find Arrow (arrow/api.h, libarrow.a, libarrow.so) -# This module defines -# ARROW_FOUND, whether Arrow has been found -# ARROW_FULL_SO_VERSION, full shared object version of found Arrow "100.0.0" -# ARROW_IMPORT_LIB, path to libarrow's import library (Windows only) -# ARROW_INCLUDE_DIR, directory containing headers -# ARROW_LIBS, deprecated. Use ARROW_LIB_DIR instead -# ARROW_LIB_DIR, directory containing Arrow libraries -# ARROW_SHARED_IMP_LIB, deprecated. Use ARROW_IMPORT_LIB instead -# ARROW_SHARED_LIB, path to libarrow's shared library -# ARROW_SO_VERSION, shared object version of found Arrow such as "100" -# ARROW_STATIC_LIB, path to libarrow.a -# ARROW_VERSION, version of found Arrow -# ARROW_VERSION_MAJOR, major version of found Arrow -# ARROW_VERSION_MINOR, minor version of found Arrow -# ARROW_VERSION_PATCH, patch version of found Arrow - -if(DEFINED ARROW_FOUND) - return() -endif() - -find_package(PkgConfig) -include(FindPackageHandleStandardArgs) - -if(WIN32 AND NOT MINGW) - # This is used to handle builds using e.g. clang in an MSVC setting. 
- set(MSVC_TOOLCHAIN TRUE) -else() - set(MSVC_TOOLCHAIN FALSE) -endif() - -set(ARROW_SEARCH_LIB_PATH_SUFFIXES) -if(CMAKE_LIBRARY_ARCHITECTURE) - list(APPEND ARROW_SEARCH_LIB_PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}") -endif() -list(APPEND - ARROW_SEARCH_LIB_PATH_SUFFIXES - "lib64" - "lib32" - "lib" - "bin") -set(ARROW_CONFIG_SUFFIXES - "_RELEASE" - "_RELWITHDEBINFO" - "_MINSIZEREL" - "_DEBUG" - "") -if(CMAKE_BUILD_TYPE) - string(TOUPPER ${CMAKE_BUILD_TYPE} ARROW_CONFIG_SUFFIX_PREFERRED) - set(ARROW_CONFIG_SUFFIX_PREFERRED "_${ARROW_CONFIG_SUFFIX_PREFERRED}") - list(INSERT ARROW_CONFIG_SUFFIXES 0 "${ARROW_CONFIG_SUFFIX_PREFERRED}") -endif() - -if(NOT DEFINED ARROW_MSVC_STATIC_LIB_SUFFIX) - if(MSVC_TOOLCHAIN) - set(ARROW_MSVC_STATIC_LIB_SUFFIX "_static") - else() - set(ARROW_MSVC_STATIC_LIB_SUFFIX "") - endif() -endif() - -# Internal function. -# -# Set shared library name for ${base_name} to ${output_variable}. -# -# Example: -# arrow_build_shared_library_name(ARROW_SHARED_LIBRARY_NAME arrow) -# # -> ARROW_SHARED_LIBRARY_NAME=libarrow.so on Linux -# # -> ARROW_SHARED_LIBRARY_NAME=libarrow.dylib on macOS -# # -> ARROW_SHARED_LIBRARY_NAME=arrow.dll with MSVC on Windows -# # -> ARROW_SHARED_LIBRARY_NAME=libarrow.dll with MinGW on Windows -function(arrow_build_shared_library_name output_variable base_name) - set(${output_variable} - "${CMAKE_SHARED_LIBRARY_PREFIX}${base_name}${CMAKE_SHARED_LIBRARY_SUFFIX}" - PARENT_SCOPE) -endfunction() - -# Internal function. -# -# Set import library name for ${base_name} to ${output_variable}. -# This is useful only for MSVC build. Import library is used only -# with MSVC build. -# -# Example: -# arrow_build_import_library_name(ARROW_IMPORT_LIBRARY_NAME arrow) -# # -> ARROW_IMPORT_LIBRARY_NAME=arrow on Linux (meaningless) -# # -> ARROW_IMPORT_LIBRARY_NAME=arrow on macOS (meaningless) -# # -> ARROW_IMPORT_LIBRARY_NAME=arrow.lib with MSVC on Windows -# # -> ARROW_IMPORT_LIBRARY_NAME=libarrow.dll.a with MinGW on Windows -function(arrow_build_import_library_name output_variable base_name) - set(${output_variable} - "${CMAKE_IMPORT_LIBRARY_PREFIX}${base_name}${CMAKE_IMPORT_LIBRARY_SUFFIX}" - PARENT_SCOPE) -endfunction() - -# Internal function. -# -# Set static library name for ${base_name} to ${output_variable}. -# -# Example: -# arrow_build_static_library_name(ARROW_STATIC_LIBRARY_NAME arrow) -# # -> ARROW_STATIC_LIBRARY_NAME=libarrow.a on Linux -# # -> ARROW_STATIC_LIBRARY_NAME=libarrow.a on macOS -# # -> ARROW_STATIC_LIBRARY_NAME=arrow.lib with MSVC on Windows -# # -> ARROW_STATIC_LIBRARY_NAME=libarrow.dll.a with MinGW on Windows -function(arrow_build_static_library_name output_variable base_name) - set(${output_variable} - "${CMAKE_STATIC_LIBRARY_PREFIX}${base_name}${ARROW_MSVC_STATIC_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}" - PARENT_SCOPE) -endfunction() - -# Internal function. -# -# Set macro value for ${macro_name} in ${header_content} to ${output_variable}. -# -# Example: -# arrow_extract_macro_value(version_major -# "ARROW_VERSION_MAJOR" -# "#define ARROW_VERSION_MAJOR 1.0.0") -# # -> version_major=1.0.0 -function(arrow_extract_macro_value output_variable macro_name header_content) - string(REGEX MATCH "#define +${macro_name} +[^\r\n]+" macro_definition - "${header_content}") - string(REGEX REPLACE "^#define +${macro_name} +(.+)$" "\\1" macro_value - "${macro_definition}") - set(${output_variable} - "${macro_value}" - PARENT_SCOPE) -endfunction() - -# Internal macro only for arrow_find_package. -# -# Find package in HOME. 
-macro(arrow_find_package_home) - find_path(${prefix}_include_dir "${header_path}" - PATHS "${home}" - PATH_SUFFIXES "include" - NO_DEFAULT_PATH) - set(include_dir "${${prefix}_include_dir}") - set(${prefix}_INCLUDE_DIR - "${include_dir}" - PARENT_SCOPE) - - if(MSVC_TOOLCHAIN) - set(CMAKE_SHARED_LIBRARY_SUFFIXES_ORIGINAL ${CMAKE_FIND_LIBRARY_SUFFIXES}) - # .dll isn't found by find_library with MSVC because .dll isn't included in - # CMAKE_FIND_LIBRARY_SUFFIXES. - list(APPEND CMAKE_FIND_LIBRARY_SUFFIXES "${CMAKE_SHARED_LIBRARY_SUFFIX}") - endif() - find_library(${prefix}_shared_lib - NAMES "${shared_lib_name}" - PATHS "${home}" - PATH_SUFFIXES ${ARROW_SEARCH_LIB_PATH_SUFFIXES} - NO_DEFAULT_PATH) - if(MSVC_TOOLCHAIN) - set(CMAKE_SHARED_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES_ORIGINAL}) - endif() - set(shared_lib "${${prefix}_shared_lib}") - set(${prefix}_SHARED_LIB - "${shared_lib}" - PARENT_SCOPE) - if(shared_lib) - add_library(${target_shared} SHARED IMPORTED) - set_target_properties(${target_shared} PROPERTIES IMPORTED_LOCATION "${shared_lib}") - if(include_dir) - set_target_properties(${target_shared} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "${include_dir}") - endif() - find_library(${prefix}_import_lib - NAMES "${import_lib_name}" - PATHS "${home}" - PATH_SUFFIXES ${ARROW_SEARCH_LIB_PATH_SUFFIXES} - NO_DEFAULT_PATH) - set(import_lib "${${prefix}_import_lib}") - set(${prefix}_IMPORT_LIB - "${import_lib}" - PARENT_SCOPE) - if(import_lib) - set_target_properties(${target_shared} PROPERTIES IMPORTED_IMPLIB "${import_lib}") - endif() - endif() - - find_library(${prefix}_static_lib - NAMES "${static_lib_name}" - PATHS "${home}" - PATH_SUFFIXES ${ARROW_SEARCH_LIB_PATH_SUFFIXES} - NO_DEFAULT_PATH) - set(static_lib "${${prefix}_static_lib}") - set(${prefix}_STATIC_LIB - "${static_lib}" - PARENT_SCOPE) - if(static_lib) - add_library(${target_static} STATIC IMPORTED) - set_target_properties(${target_static} PROPERTIES IMPORTED_LOCATION "${static_lib}") - if(include_dir) - set_target_properties(${target_static} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "${include_dir}") - endif() - endif() -endmacro() - -# Internal macro only for arrow_find_package. -# -# Find package by CMake package configuration. -macro(arrow_find_package_cmake_package_configuration) - find_package(${cmake_package_name} CONFIG) - if(${cmake_package_name}_FOUND) - set(${prefix}_USE_CMAKE_PACKAGE_CONFIG - TRUE - PARENT_SCOPE) - if(TARGET ${target_shared}) - foreach(suffix ${ARROW_CONFIG_SUFFIXES}) - get_target_property(shared_lib ${target_shared} IMPORTED_LOCATION${suffix}) - if(shared_lib) - # Remove shared library version: - # libarrow.so.100.0.0 -> libarrow.so - # Because ARROW_HOME and pkg-config approaches don't add - # shared library version. - string(REGEX REPLACE "(${CMAKE_SHARED_LIBRARY_SUFFIX})[.0-9]+$" "\\1" - shared_lib "${shared_lib}") - set(${prefix}_SHARED_LIB - "${shared_lib}" - PARENT_SCOPE) - break() - endif() - endforeach() - endif() - if(TARGET ${target_static}) - foreach(suffix ${ARROW_CONFIG_SUFFIXES}) - get_target_property(static_lib ${target_static} IMPORTED_LOCATION${suffix}) - if(static_lib) - set(${prefix}_STATIC_LIB - "${static_lib}" - PARENT_SCOPE) - break() - endif() - endforeach() - endif() - endif() -endmacro() - -# Internal macro only for arrow_find_package. -# -# Find package by pkg-config. 
-macro(arrow_find_package_pkg_config) - pkg_check_modules(${prefix}_PC ${pkg_config_name}) - if(${prefix}_PC_FOUND) - set(${prefix}_USE_PKG_CONFIG - TRUE - PARENT_SCOPE) - - set(include_dir "${${prefix}_PC_INCLUDEDIR}") - set(lib_dir "${${prefix}_PC_LIBDIR}") - set(shared_lib_paths "${${prefix}_PC_LINK_LIBRARIES}") - # Use the first shared library path as the IMPORTED_LOCATION - # for ${target_shared}. This assumes that the first shared library - # path is the shared library path for this module. - list(GET shared_lib_paths 0 first_shared_lib_path) - # Use the rest shared library paths as the INTERFACE_LINK_LIBRARIES - # for ${target_shared}. This assumes that the rest shared library - # paths are dependency library paths for this module. - list(LENGTH shared_lib_paths n_shared_lib_paths) - if(n_shared_lib_paths LESS_EQUAL 1) - set(rest_shared_lib_paths) - else() - list(SUBLIST - shared_lib_paths - 1 - -1 - rest_shared_lib_paths) - endif() - - set(${prefix}_VERSION - "${${prefix}_PC_VERSION}" - PARENT_SCOPE) - set(${prefix}_INCLUDE_DIR - "${include_dir}" - PARENT_SCOPE) - set(${prefix}_SHARED_LIB - "${first_shared_lib_path}" - PARENT_SCOPE) - - add_library(${target_shared} SHARED IMPORTED) - set_target_properties(${target_shared} - PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${include_dir}" - INTERFACE_LINK_LIBRARIES "${rest_shared_lib_paths}" - IMPORTED_LOCATION "${first_shared_lib_path}") - get_target_property(shared_lib ${target_shared} IMPORTED_LOCATION) - - find_library(${prefix}_static_lib - NAMES "${static_lib_name}" - PATHS "${lib_dir}" - NO_DEFAULT_PATH) - set(static_lib "${${prefix}_static_lib}") - set(${prefix}_STATIC_LIB - "${static_lib}" - PARENT_SCOPE) - if(static_lib) - add_library(${target_static} STATIC IMPORTED) - set_target_properties(${target_static} - PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${include_dir}" - IMPORTED_LOCATION "${static_lib}") - endif() - endif() -endmacro() - -function(arrow_find_package - prefix - home - base_name - header_path - cmake_package_name - pkg_config_name) - arrow_build_shared_library_name(shared_lib_name ${base_name}) - arrow_build_import_library_name(import_lib_name ${base_name}) - arrow_build_static_library_name(static_lib_name ${base_name}) - - set(target_shared ${base_name}_shared) - set(target_static ${base_name}_static) - - if(home) - arrow_find_package_home() - set(${prefix}_FIND_APPROACH - "HOME: ${home}" - PARENT_SCOPE) - else() - arrow_find_package_cmake_package_configuration() - if(${cmake_package_name}_FOUND) - set(${prefix}_FIND_APPROACH - "CMake package configuration: ${cmake_package_name}" - PARENT_SCOPE) - else() - arrow_find_package_pkg_config() - set(${prefix}_FIND_APPROACH - "pkg-config: ${pkg_config_name}" - PARENT_SCOPE) - endif() - endif() - - if(NOT include_dir) - if(TARGET ${target_shared}) - get_target_property(include_dir ${target_shared} INTERFACE_INCLUDE_DIRECTORIES) - elseif(TARGET ${target_static}) - get_target_property(include_dir ${target_static} INTERFACE_INCLUDE_DIRECTORIES) - endif() - endif() - if(include_dir) - set(${prefix}_INCLUDE_DIR - "${include_dir}" - PARENT_SCOPE) - endif() - - if(shared_lib) - get_filename_component(lib_dir "${shared_lib}" DIRECTORY) - elseif(static_lib) - get_filename_component(lib_dir "${static_lib}" DIRECTORY) - else() - set(lib_dir NOTFOUND) - endif() - set(${prefix}_LIB_DIR - "${lib_dir}" - PARENT_SCOPE) - # For backward compatibility - set(${prefix}_LIBS - "${lib_dir}" - PARENT_SCOPE) -endfunction() - -if(NOT "$ENV{ARROW_HOME}" STREQUAL "") - file(TO_CMAKE_PATH 
"$ENV{ARROW_HOME}" ARROW_HOME) -endif() -arrow_find_package(ARROW - "${ARROW_HOME}" - arrow - arrow/api.h - Arrow - arrow) - -if(ARROW_HOME) - if(ARROW_INCLUDE_DIR) - file(READ "${ARROW_INCLUDE_DIR}/arrow/util/config.h" ARROW_CONFIG_H_CONTENT) - arrow_extract_macro_value(ARROW_VERSION_MAJOR "ARROW_VERSION_MAJOR" - "${ARROW_CONFIG_H_CONTENT}") - arrow_extract_macro_value(ARROW_VERSION_MINOR "ARROW_VERSION_MINOR" - "${ARROW_CONFIG_H_CONTENT}") - arrow_extract_macro_value(ARROW_VERSION_PATCH "ARROW_VERSION_PATCH" - "${ARROW_CONFIG_H_CONTENT}") - if("${ARROW_VERSION_MAJOR}" STREQUAL "" - OR "${ARROW_VERSION_MINOR}" STREQUAL "" - OR "${ARROW_VERSION_PATCH}" STREQUAL "") - set(ARROW_VERSION "0.0.0") - else() - set(ARROW_VERSION - "${ARROW_VERSION_MAJOR}.${ARROW_VERSION_MINOR}.${ARROW_VERSION_PATCH}") - endif() - - arrow_extract_macro_value(ARROW_SO_VERSION_QUOTED "ARROW_SO_VERSION" - "${ARROW_CONFIG_H_CONTENT}") - string(REGEX REPLACE "^\"(.+)\"$" "\\1" ARROW_SO_VERSION "${ARROW_SO_VERSION_QUOTED}") - arrow_extract_macro_value(ARROW_FULL_SO_VERSION_QUOTED "ARROW_FULL_SO_VERSION" - "${ARROW_CONFIG_H_CONTENT}") - string(REGEX REPLACE "^\"(.+)\"$" "\\1" ARROW_FULL_SO_VERSION - "${ARROW_FULL_SO_VERSION_QUOTED}") - endif() -else() - if(ARROW_USE_CMAKE_PACKAGE_CONFIG) - find_package(Arrow CONFIG) - elseif(ARROW_USE_PKG_CONFIG) - pkg_get_variable(ARROW_SO_VERSION arrow so_version) - pkg_get_variable(ARROW_FULL_SO_VERSION arrow full_so_version) - endif() -endif() - -set(ARROW_ABI_VERSION ${ARROW_SO_VERSION}) - -mark_as_advanced(ARROW_ABI_VERSION - ARROW_CONFIG_SUFFIXES - ARROW_FULL_SO_VERSION - ARROW_IMPORT_LIB - ARROW_INCLUDE_DIR - ARROW_LIBS - ARROW_LIB_DIR - ARROW_SEARCH_LIB_PATH_SUFFIXES - ARROW_SHARED_IMP_LIB - ARROW_SHARED_LIB - ARROW_SO_VERSION - ARROW_STATIC_LIB - ARROW_VERSION - ARROW_VERSION_MAJOR - ARROW_VERSION_MINOR - ARROW_VERSION_PATCH) - -find_package_handle_standard_args( - Arrow - REQUIRED_VARS # The first required variable is shown - # in the found message. So this list is - # not sorted alphabetically. - ARROW_INCLUDE_DIR ARROW_LIB_DIR ARROW_FULL_SO_VERSION ARROW_SO_VERSION - VERSION_VAR ARROW_VERSION) -set(ARROW_FOUND ${Arrow_FOUND}) - -if(Arrow_FOUND AND NOT Arrow_FIND_QUIETLY) - message(STATUS "Arrow version: ${ARROW_VERSION} (${ARROW_FIND_APPROACH})") - message(STATUS "Arrow SO and ABI version: ${ARROW_SO_VERSION}") - message(STATUS "Arrow full SO version: ${ARROW_FULL_SO_VERSION}") - message(STATUS "Found the Arrow core shared library: ${ARROW_SHARED_LIB}") - message(STATUS "Found the Arrow core import library: ${ARROW_IMPORT_LIB}") - message(STATUS "Found the Arrow core static library: ${ARROW_STATIC_LIB}") -endif() diff --git a/cpp/cmake_modules/FindArrowCUDA.cmake b/cpp/cmake_modules/FindArrowCUDA.cmake deleted file mode 100644 index 014386f3012..00000000000 --- a/cpp/cmake_modules/FindArrowCUDA.cmake +++ /dev/null @@ -1,88 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# - Find Arrow CUDA (arrow/gpu/cuda_api.h, libarrow_cuda.a, libarrow_cuda.so) -# -# This module requires Arrow from which it uses -# arrow_find_package() -# -# This module defines -# ARROW_CUDA_FOUND, whether Arrow CUDA has been found -# ARROW_CUDA_IMPORT_LIB, path to libarrow_cuda's import library (Windows only) -# ARROW_CUDA_INCLUDE_DIR, directory containing headers -# ARROW_CUDA_LIBS, deprecated. Use ARROW_CUDA_LIB_DIR instead -# ARROW_CUDA_LIB_DIR, directory containing Arrow CUDA libraries -# ARROW_CUDA_SHARED_IMP_LIB, deprecated. Use ARROW_CUDA_IMPORT_LIB instead -# ARROW_CUDA_SHARED_LIB, path to libarrow_cuda's shared library -# ARROW_CUDA_STATIC_LIB, path to libarrow_cuda.a - -if(DEFINED ARROW_CUDA_FOUND) - return() -endif() - -set(find_package_arguments) -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION) - list(APPEND find_package_arguments "${${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION}") -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED) - list(APPEND find_package_arguments REQUIRED) -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) - list(APPEND find_package_arguments QUIET) -endif() -find_package(Arrow ${find_package_arguments}) - -if(ARROW_FOUND) - arrow_find_package(ARROW_CUDA - "${ARROW_HOME}" - arrow_cuda - arrow/gpu/cuda_api.h - ArrowCUDA - arrow-cuda) - if(NOT ARROW_CUDA_VERSION) - set(ARROW_CUDA_VERSION "${ARROW_VERSION}") - endif() -endif() - -if("${ARROW_CUDA_VERSION}" VERSION_EQUAL "${ARROW_VERSION}") - set(ARROW_CUDA_VERSION_MATCH TRUE) -else() - set(ARROW_CUDA_VERSION_MATCH FALSE) -endif() - -mark_as_advanced(ARROW_CUDA_IMPORT_LIB - ARROW_CUDA_INCLUDE_DIR - ARROW_CUDA_LIBS - ARROW_CUDA_LIB_DIR - ARROW_CUDA_SHARED_IMP_LIB - ARROW_CUDA_SHARED_LIB - ARROW_CUDA_STATIC_LIB - ARROW_CUDA_VERSION - ARROW_CUDA_VERSION_MATCH) - -find_package_handle_standard_args( - ArrowCUDA - REQUIRED_VARS ARROW_CUDA_INCLUDE_DIR ARROW_CUDA_LIB_DIR ARROW_CUDA_VERSION_MATCH - VERSION_VAR ARROW_CUDA_VERSION) -set(ARROW_CUDA_FOUND ${ArrowCUDA_FOUND}) - -if(ArrowCUDA_FOUND AND NOT ArrowCUDA_FIND_QUIETLY) - message(STATUS "Found the Arrow CUDA by ${ARROW_CUDA_FIND_APPROACH}") - message(STATUS "Found the Arrow CUDA shared library: ${ARROW_CUDA_SHARED_LIB}") - message(STATUS "Found the Arrow CUDA import library: ${ARROW_CUDA_IMPORT_LIB}") - message(STATUS "Found the Arrow CUDA static library: ${ARROW_CUDA_STATIC_LIB}") -endif() diff --git a/cpp/cmake_modules/FindArrowDataset.cmake b/cpp/cmake_modules/FindArrowDataset.cmake deleted file mode 100644 index e6ecc1b43ba..00000000000 --- a/cpp/cmake_modules/FindArrowDataset.cmake +++ /dev/null @@ -1,88 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# - Find Arrow Dataset (arrow/dataset/api.h, libarrow_dataset.a, libarrow_dataset.so) -# -# This module requires Arrow from which it uses -# arrow_find_package() -# -# This module defines -# ARROW_DATASET_FOUND, whether Arrow Dataset has been found -# ARROW_DATASET_IMPORT_LIB, -# path to libarrow_dataset's import library (Windows only) -# ARROW_DATASET_INCLUDE_DIR, directory containing headers -# ARROW_DATASET_LIB_DIR, directory containing Arrow Dataset libraries -# ARROW_DATASET_SHARED_LIB, path to libarrow_dataset's shared library -# ARROW_DATASET_STATIC_LIB, path to libarrow_dataset.a - -if(DEFINED ARROW_DATASET_FOUND) - return() -endif() - -set(find_package_arguments) -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION) - list(APPEND find_package_arguments "${${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION}") -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED) - list(APPEND find_package_arguments REQUIRED) -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) - list(APPEND find_package_arguments QUIET) -endif() -find_package(Arrow ${find_package_arguments}) - -if(ARROW_FOUND) - arrow_find_package(ARROW_DATASET - "${ARROW_HOME}" - arrow_dataset - arrow/dataset/api.h - ArrowDataset - arrow-dataset) - if(NOT ARROW_DATASET_VERSION) - set(ARROW_DATASET_VERSION "${ARROW_VERSION}") - endif() -endif() - -if("${ARROW_DATASET_VERSION}" VERSION_EQUAL "${ARROW_VERSION}") - set(ARROW_DATASET_VERSION_MATCH TRUE) -else() - set(ARROW_DATASET_VERSION_MATCH FALSE) -endif() - -mark_as_advanced(ARROW_DATASET_IMPORT_LIB - ARROW_DATASET_INCLUDE_DIR - ARROW_DATASET_LIBS - ARROW_DATASET_LIB_DIR - ARROW_DATASET_SHARED_IMP_LIB - ARROW_DATASET_SHARED_LIB - ARROW_DATASET_STATIC_LIB - ARROW_DATASET_VERSION - ARROW_DATASET_VERSION_MATCH) - -find_package_handle_standard_args( - ArrowDataset - REQUIRED_VARS ARROW_DATASET_INCLUDE_DIR ARROW_DATASET_LIB_DIR - ARROW_DATASET_VERSION_MATCH - VERSION_VAR ARROW_DATASET_VERSION) -set(ARROW_DATASET_FOUND ${ArrowDataset_FOUND}) - -if(ArrowDataset_FOUND AND NOT ArrowDataset_FIND_QUIETLY) - message(STATUS "Found the Arrow Dataset by ${ARROW_DATASET_FIND_APPROACH}") - message(STATUS "Found the Arrow Dataset shared library: ${ARROW_DATASET_SHARED_LIB}") - message(STATUS "Found the Arrow Dataset import library: ${ARROW_DATASET_IMPORT_LIB}") - message(STATUS "Found the Arrow Dataset static library: ${ARROW_DATASET_STATIC_LIB}") -endif() diff --git a/cpp/cmake_modules/FindArrowFlight.cmake b/cpp/cmake_modules/FindArrowFlight.cmake deleted file mode 100644 index 805a4ff3803..00000000000 --- a/cpp/cmake_modules/FindArrowFlight.cmake +++ /dev/null @@ -1,89 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# - Find Arrow Flight (arrow/flight/api.h, libarrow_flight.a, libarrow_flight.so) -# -# This module requires Arrow from which it uses -# arrow_find_package() -# -# This module defines -# ARROW_FLIGHT_FOUND, whether Flight has been found -# ARROW_FLIGHT_IMPORT_LIB, -# path to libarrow_flight's import library (Windows only) -# ARROW_FLIGHT_INCLUDE_DIR, directory containing headers -# ARROW_FLIGHT_LIBS, deprecated. Use ARROW_FLIGHT_LIB_DIR instead -# ARROW_FLIGHT_LIB_DIR, directory containing Flight libraries -# ARROW_FLIGHT_SHARED_IMP_LIB, deprecated. Use ARROW_FLIGHT_IMPORT_LIB instead -# ARROW_FLIGHT_SHARED_LIB, path to libarrow_flight's shared library -# ARROW_FLIGHT_STATIC_LIB, path to libarrow_flight.a - -if(DEFINED ARROW_FLIGHT_FOUND) - return() -endif() - -set(find_package_arguments) -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION) - list(APPEND find_package_arguments "${${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION}") -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED) - list(APPEND find_package_arguments REQUIRED) -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) - list(APPEND find_package_arguments QUIET) -endif() -find_package(Arrow ${find_package_arguments}) - -if(ARROW_FOUND) - arrow_find_package(ARROW_FLIGHT - "${ARROW_HOME}" - arrow_flight - arrow/flight/api.h - ArrowFlight - arrow-flight) - if(NOT ARROW_FLIGHT_VERSION) - set(ARROW_FLIGHT_VERSION "${ARROW_VERSION}") - endif() -endif() - -if("${ARROW_FLIGHT_VERSION}" VERSION_EQUAL "${ARROW_VERSION}") - set(ARROW_FLIGHT_VERSION_MATCH TRUE) -else() - set(ARROW_FLIGHT_VERSION_MATCH FALSE) -endif() - -mark_as_advanced(ARROW_FLIGHT_IMPORT_LIB - ARROW_FLIGHT_INCLUDE_DIR - ARROW_FLIGHT_LIBS - ARROW_FLIGHT_LIB_DIR - ARROW_FLIGHT_SHARED_IMP_LIB - ARROW_FLIGHT_SHARED_LIB - ARROW_FLIGHT_STATIC_LIB - ARROW_FLIGHT_VERSION - ARROW_FLIGHT_VERSION_MATCH) - -find_package_handle_standard_args( - ArrowFlight - REQUIRED_VARS ARROW_FLIGHT_INCLUDE_DIR ARROW_FLIGHT_LIB_DIR ARROW_FLIGHT_VERSION_MATCH - VERSION_VAR ARROW_FLIGHT_VERSION) -set(ARROW_FLIGHT_FOUND ${ArrowFlight_FOUND}) - -if(ArrowFlight_FOUND AND NOT ArrowFlight_FIND_QUIETLY) - message(STATUS "Found the Arrow Flight by ${ARROW_FLIGHT_FIND_APPROACH}") - message(STATUS "Found the Arrow Flight shared library: ${ARROW_FLIGHT_SHARED_LIB}") - message(STATUS "Found the Arrow Flight import library: ${ARROW_FLIGHT_IMPORT_LIB}") - message(STATUS "Found the Arrow Flight static library: ${ARROW_FLIGHT_STATIC_LIB}") -endif() diff --git a/cpp/cmake_modules/FindArrowFlightSql.cmake b/cpp/cmake_modules/FindArrowFlightSql.cmake deleted file mode 100644 index cbca81cac44..00000000000 --- a/cpp/cmake_modules/FindArrowFlightSql.cmake +++ /dev/null @@ -1,93 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# - Find Arrow Flight SQL -# -# This module requires Arrow from which it uses -# arrow_find_package() -# -# This module defines -# ARROW_FLIGHT_SQL_FOUND, whether Flight has been found -# ARROW_FLIGHT_SQL_IMPORT_LIB, -# path to libarrow_flight's import library (Windows only) -# ARROW_FLIGHT_SQL_INCLUDE_DIR, directory containing headers -# ARROW_FLIGHT_SQL_LIBS, deprecated. Use ARROW_FLIGHT_SQL_LIB_DIR instead -# ARROW_FLIGHT_SQL_LIB_DIR, directory containing Flight libraries -# ARROW_FLIGHT_SQL_SHARED_IMP_LIB, deprecated. Use ARROW_FLIGHT_SQL_IMPORT_LIB instead -# ARROW_FLIGHT_SQL_SHARED_LIB, path to libarrow_flight's shared library -# ARROW_FLIGHT_SQL_STATIC_LIB, path to libarrow_flight.a - -if(DEFINED ARROW_FLIGHT_SQL_FOUND) - return() -endif() - -set(find_package_arguments) -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION) - list(APPEND find_package_arguments "${${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION}") -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED) - list(APPEND find_package_arguments REQUIRED) -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) - list(APPEND find_package_arguments QUIET) -endif() -find_package(Arrow ${find_package_arguments}) - -if(ARROW_FOUND) - arrow_find_package(ARROW_FLIGHT_SQL - "${ARROW_HOME}" - arrow_flight_sql - arrow/flight/sql/api.h - ArrowFlightSql - arrow-flight-sql) - if(NOT ARROW_FLIGHT_SQL_VERSION) - set(ARROW_FLIGHT_SQL_VERSION "${ARROW_VERSION}") - endif() -endif() - -if("${ARROW_FLIGHT_SQL_VERSION}" VERSION_EQUAL "${ARROW_VERSION}") - set(ARROW_FLIGHT_SQL_VERSION_MATCH TRUE) -else() - set(ARROW_FLIGHT_SQL_VERSION_MATCH FALSE) -endif() - -mark_as_advanced(ARROW_FLIGHT_SQL_IMPORT_LIB - ARROW_FLIGHT_SQL_INCLUDE_DIR - ARROW_FLIGHT_SQL_LIBS - ARROW_FLIGHT_SQL_LIB_DIR - ARROW_FLIGHT_SQL_SHARED_IMP_LIB - ARROW_FLIGHT_SQL_SHARED_LIB - ARROW_FLIGHT_SQL_STATIC_LIB - ARROW_FLIGHT_SQL_VERSION - ARROW_FLIGHT_SQL_VERSION_MATCH) - -find_package_handle_standard_args( - ArrowFlightSql - REQUIRED_VARS ARROW_FLIGHT_SQL_INCLUDE_DIR ARROW_FLIGHT_SQL_LIB_DIR - ARROW_FLIGHT_SQL_VERSION_MATCH - VERSION_VAR ARROW_FLIGHT_SQL_VERSION) -set(ARROW_FLIGHT_SQL_FOUND ${ArrowFlightSql_FOUND}) - -if(ArrowFlightSql_FOUND AND NOT ArrowFlightSql_FIND_QUIETLY) - message(STATUS "Found the Arrow Flight SQL by ${ARROW_FLIGHT_SQL_FIND_APPROACH}") - message(STATUS "Found the Arrow Flight SQL shared library: ${ARROW_FLIGHT_SQL_SHARED_LIB}" - ) - message(STATUS "Found the Arrow Flight SQL import library: ${ARROW_FLIGHT_SQL_IMPORT_LIB}" - ) - message(STATUS "Found the Arrow Flight SQL static library: ${ARROW_FLIGHT_SQL_STATIC_LIB}" - ) -endif() diff --git a/cpp/cmake_modules/FindArrowFlightTesting.cmake b/cpp/cmake_modules/FindArrowFlightTesting.cmake deleted file mode 100644 index c0756cf637c..00000000000 --- a/cpp/cmake_modules/FindArrowFlightTesting.cmake +++ /dev/null @@ -1,98 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# - Find Arrow Flight testing library -# (arrow/flight/test_util.h, -# libarrow_flight_testing.a, -# libarrow_flight_testing.so) -# -# This module requires Arrow from which it uses -# arrow_find_package() -# -# This module defines -# ARROW_FLIGHT_TESTING_FOUND, -# whether Arrow Flight testing library has been found -# ARROW_FLIGHT_TESTING_IMPORT_LIB, -# path to libarrow_flight_testing's import library (Windows only) -# ARROW_FLIGHT_TESTING_INCLUDE_DIR, directory containing headers -# ARROW_FLIGHT_TESTING_LIB_DIR, directory containing Arrow testing libraries -# ARROW_FLIGHT_TESTING_SHARED_LIB, -# path to libarrow_flight_testing's shared library -# ARROW_FLIGHT_TESTING_STATIC_LIB, path to libarrow_flight_testing.a - -if(DEFINED ARROW_FLIGHT_TESTING_FOUND) - return() -endif() - -set(find_package_arguments) -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION) - list(APPEND find_package_arguments "${${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION}") -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED) - list(APPEND find_package_arguments REQUIRED) -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) - list(APPEND find_package_arguments QUIET) -endif() -find_package(ArrowFlight ${find_package_arguments}) -find_package(ArrowTesting ${find_package_arguments}) - -if(ARROW_TESTING_FOUND AND ARROW_FLIGHT_FOUND) - arrow_find_package(ARROW_FLIGHT_TESTING - "${ARROW_HOME}" - arrow_flight_testing - arrow/flight/test_util.h - ArrowFlightTesting - arrow-flight-testing) - if(NOT ARROW_FLIGHT_TESTING_VERSION) - set(ARROW_FLIGHT_TESTING_VERSION "${ARROW_VERSION}") - endif() -endif() - -if("${ARROW_FLIGHT_TESTING_VERSION}" VERSION_EQUAL "${ARROW_VERSION}") - set(ARROW_FLIGHT_TESTING_VERSION_MATCH TRUE) -else() - set(ARROW_FLIGHT_TESTING_VERSION_MATCH FALSE) -endif() - -mark_as_advanced(ARROW_FLIGHT_TESTING_IMPORT_LIB - ARROW_FLIGHT_TESTING_INCLUDE_DIR - ARROW_FLIGHT_TESTING_LIBS - ARROW_FLIGHT_TESTING_LIB_DIR - ARROW_FLIGHT_TESTING_SHARED_IMP_LIB - ARROW_FLIGHT_TESTING_SHARED_LIB - ARROW_FLIGHT_TESTING_STATIC_LIB - ARROW_FLIGHT_TESTING_VERSION - ARROW_FLIGHT_TESTING_VERSION_MATCH) - -find_package_handle_standard_args( - ArrowFlightTesting - REQUIRED_VARS ARROW_FLIGHT_TESTING_INCLUDE_DIR ARROW_FLIGHT_TESTING_LIB_DIR - ARROW_FLIGHT_TESTING_VERSION_MATCH - VERSION_VAR ARROW_FLIGHT_TESTING_VERSION) -set(ARROW_FLIGHT_TESTING_FOUND ${ArrowFlightTesting_FOUND}) - -if(ArrowFlightTesting_FOUND AND NOT ArrowFlightTesting_FIND_QUIETLY) - message(STATUS "Found the Arrow Flight testing by ${ARROW_FLIGHT_TESTING_FIND_APPROACH}" - ) - message(STATUS "Found the Arrow Flight testing shared library: ${ARROW_FLIGHT_TESTING_SHARED_LIB}" - ) - message(STATUS "Found the Arrow Flight testing import library: ${ARROW_FLIGHT_TESTING_IMPORT_LIB}" - ) - message(STATUS "Found the Arrow Flight testing static library: ${ARROW_FLIGHT_TESTING_STATIC_LIB}" - ) -endif() diff --git a/cpp/cmake_modules/FindArrowPython.cmake b/cpp/cmake_modules/FindArrowPython.cmake deleted file mode 
100644 index b503e6a9e02..00000000000 --- a/cpp/cmake_modules/FindArrowPython.cmake +++ /dev/null @@ -1,87 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# - Find Arrow Python (arrow/python/api.h, libarrow_python.a, libarrow_python.so) -# -# This module requires Arrow from which it uses -# arrow_find_package() -# -# This module defines -# ARROW_PYTHON_FOUND, whether Arrow Python has been found -# ARROW_PYTHON_IMPORT_LIB, -# path to libarrow_python's import library (Windows only) -# ARROW_PYTHON_INCLUDE_DIR, directory containing headers -# ARROW_PYTHON_LIB_DIR, directory containing Arrow Python libraries -# ARROW_PYTHON_SHARED_LIB, path to libarrow_python's shared library -# ARROW_PYTHON_STATIC_LIB, path to libarrow_python.a - -if(DEFINED ARROW_PYTHON_FOUND) - return() -endif() - -set(find_package_arguments) -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION) - list(APPEND find_package_arguments "${${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION}") -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED) - list(APPEND find_package_arguments REQUIRED) -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) - list(APPEND find_package_arguments QUIET) -endif() -find_package(Arrow ${find_package_arguments}) - -if(ARROW_FOUND) - arrow_find_package(ARROW_PYTHON - "${ARROW_HOME}" - arrow_python - arrow/python/api.h - ArrowPython - arrow-python) - if(NOT ARROW_PYTHON_VERSION) - set(ARROW_PYTHON_VERSION "${ARROW_VERSION}") - endif() -endif() - -if("${ARROW_PYTHON_VERSION}" VERSION_EQUAL "${ARROW_VERSION}") - set(ARROW_PYTHON_VERSION_MATCH TRUE) -else() - set(ARROW_PYTHON_VERSION_MATCH FALSE) -endif() - -mark_as_advanced(ARROW_PYTHON_IMPORT_LIB - ARROW_PYTHON_INCLUDE_DIR - ARROW_PYTHON_LIBS - ARROW_PYTHON_LIB_DIR - ARROW_PYTHON_SHARED_IMP_LIB - ARROW_PYTHON_SHARED_LIB - ARROW_PYTHON_STATIC_LIB - ARROW_PYTHON_VERSION - ARROW_PYTHON_VERSION_MATCH) - -find_package_handle_standard_args( - ArrowPython - REQUIRED_VARS ARROW_PYTHON_INCLUDE_DIR ARROW_PYTHON_LIB_DIR ARROW_PYTHON_VERSION_MATCH - VERSION_VAR ARROW_PYTHON_VERSION) -set(ARROW_PYTHON_FOUND ${ArrowPython_FOUND}) - -if(ArrowPython_FOUND AND NOT ArrowPython_FIND_QUIETLY) - message(STATUS "Found the Arrow Python by ${ARROW_PYTHON_FIND_APPROACH}") - message(STATUS "Found the Arrow Python shared library: ${ARROW_PYTHON_SHARED_LIB}") - message(STATUS "Found the Arrow Python import library: ${ARROW_PYTHON_IMPORT_LIB}") - message(STATUS "Found the Arrow Python static library: ${ARROW_PYTHON_STATIC_LIB}") -endif() diff --git a/cpp/cmake_modules/FindArrowPythonFlight.cmake b/cpp/cmake_modules/FindArrowPythonFlight.cmake deleted file mode 100644 index 3a639928ce5..00000000000 --- a/cpp/cmake_modules/FindArrowPythonFlight.cmake +++ /dev/null @@ -1,94 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more 
contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# - Find Arrow Python Flight -# (arrow/python/flight.h, libarrow_python_flight.a, libarrow_python_flight.so) -# -# This module requires Arrow from which it uses -# arrow_find_package() -# -# This module defines -# ARROW_PYTHON_FLIGHT_FOUND, whether Arrow Python Flight has been found -# ARROW_PYTHON_FLIGHT_IMPORT_LIB, -# path to libarrow_python_flight's import library (Windows only) -# ARROW_PYTHON_FLIGHT_INCLUDE_DIR, directory containing headers -# ARROW_PYTHON_FLIGHT_LIB_DIR, -# directory containing Arrow Python Flight libraries -# ARROW_PYTHON_FLIGHT_SHARED_LIB, path to libarrow_python_flight's shared library -# ARROW_PYTHON_FLIGHT_STATIC_LIB, path to libarrow_python_flight.a - -if(DEFINED ARROW_PYTHON_FLIGHT_FOUND) - return() -endif() - -set(find_package_arguments) -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION) - list(APPEND find_package_arguments "${${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION}") -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED) - list(APPEND find_package_arguments REQUIRED) -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) - list(APPEND find_package_arguments QUIET) -endif() -find_package(ArrowFlight ${find_package_arguments}) -find_package(ArrowPython ${find_package_arguments}) - -if(ARROW_PYTHON_FOUND AND ARROW_FLIGHT_FOUND) - arrow_find_package(ARROW_PYTHON_FLIGHT - "${ARROW_HOME}" - arrow_python_flight - arrow/python/flight.h - ArrowPythonFlight - arrow-python-flight) - if(NOT ARROW_PYTHON_FLIGHT_VERSION) - set(ARROW_PYTHON_FLIGHT_VERSION "${ARROW_VERSION}") - endif() -endif() - -if("${ARROW_PYTHON_FLIGHT_VERSION}" VERSION_EQUAL "${ARROW_VERSION}") - set(ARROW_PYTHON_FLIGHT_VERSION_MATCH TRUE) -else() - set(ARROW_PYTHON_FLIGHT_VERSION_MATCH FALSE) -endif() - -mark_as_advanced(ARROW_PYTHON_FLIGHT_IMPORT_LIB - ARROW_PYTHON_FLIGHT_INCLUDE_DIR - ARROW_PYTHON_FLIGHT_LIBS - ARROW_PYTHON_FLIGHT_LIB_DIR - ARROW_PYTHON_FLIGHT_SHARED_IMP_LIB - ARROW_PYTHON_FLIGHT_SHARED_LIB - ARROW_PYTHON_FLIGHT_STATIC_LIB - ARROW_PYTHON_FLIGHT_VERSION - ARROW_PYTHON_FLIGHT_VERSION_MATCH) - -find_package_handle_standard_args( - ArrowPythonFlight - REQUIRED_VARS ARROW_PYTHON_FLIGHT_INCLUDE_DIR ARROW_PYTHON_FLIGHT_LIB_DIR - ARROW_PYTHON_FLIGHT_VERSION_MATCH - VERSION_VAR ARROW_PYTHON_FLIGHT_VERSION) -set(ARROW_PYTHON_FLIGHT_FOUND ${ArrowPythonFlight_FOUND}) - -if(ArrowPythonFlight_FOUND AND NOT ArrowPythonFlight_FIND_QUIETLY) - message(STATUS "Found the Arrow Python Flight by ${ARROW_PYTHON_FLIGHT_FIND_APPROACH}") - message(STATUS "Found the Arrow Python Flight shared library: ${ARROW_PYTHON_FLIGHT_SHARED_LIB}" - ) - message(STATUS "Found the Arrow Python Flight import library: ${ARROW_PYTHON_FLIGHT_IMPORT_LIB}" - ) - message(STATUS "Found the Arrow Python Flight static library: ${ARROW_PYTHON_FLIGHT_STATIC_LIB}" - ) -endif() diff --git 
a/cpp/cmake_modules/FindArrowSubstrait.cmake b/cpp/cmake_modules/FindArrowSubstrait.cmake deleted file mode 100644 index 165a05a0cb8..00000000000 --- a/cpp/cmake_modules/FindArrowSubstrait.cmake +++ /dev/null @@ -1,92 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# - Find Arrow Substrait (libarrow_substrait.so) -# -# This module requires Arrow from which it uses -# arrow_find_package() -# -# This module defines -# ARROW_SUBSTRAIT_FOUND, whether Arrow Substrait has been found -# ARROW_SUBSTRAIT_IMPORT_LIB, -# path to libarrow_substrait's import library (Windows only) -# ARROW_SUBSTRAIT_INCLUDE_DIR, directory containing headers -# ARROW_SUBSTRAIT_LIB_DIR, directory containing Arrow Substrait libraries -# ARROW_SUBSTRAIT_SHARED_LIB, path to libarrow_substrait's shared library -# ARROW_SUBSTRAIT_STATIC_LIB, path to libarrow_substrait.a - -if(DEFINED ARROW_SUBSTRAIT_FOUND) - return() -endif() - -set(find_package_arguments) -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION) - list(APPEND find_package_arguments "${${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION}") -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED) - list(APPEND find_package_arguments REQUIRED) -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) - list(APPEND find_package_arguments QUIET) -endif() -find_package(Arrow ${find_package_arguments}) -find_package(Parquet ${find_package_arguments}) - -if(ARROW_FOUND AND PARQUET_FOUND) - arrow_find_package(ARROW_SUBSTRAIT - "${ARROW_HOME}" - arrow_substrait - arrow/engine/substrait/api.h - ArrowSubstrait - arrow-substrait) - if(NOT ARROW_SUBSTRAIT_VERSION) - set(ARROW_SUBSTRAIT_VERSION "${ARROW_VERSION}") - endif() -endif() - -if("${ARROW_SUBSTRAIT_VERSION}" VERSION_EQUAL "${ARROW_VERSION}") - set(ARROW_SUBSTRAIT_VERSION_MATCH TRUE) -else() - set(ARROW_SUBSTRAIT_VERSION_MATCH FALSE) -endif() - -mark_as_advanced(ARROW_SUBSTRAIT_IMPORT_LIB - ARROW_SUBSTRAIT_INCLUDE_DIR - ARROW_SUBSTRAIT_LIBS - ARROW_SUBSTRAIT_LIB_DIR - ARROW_SUBSTRAIT_SHARED_IMP_LIB - ARROW_SUBSTRAIT_SHARED_LIB - ARROW_SUBSTRAIT_STATIC_LIB - ARROW_SUBSTRAIT_VERSION - ARROW_SUBSTRAIT_VERSION_MATCH) - -find_package_handle_standard_args( - ArrowSubstrait - REQUIRED_VARS ARROW_SUBSTRAIT_INCLUDE_DIR ARROW_SUBSTRAIT_LIB_DIR - ARROW_SUBSTRAIT_VERSION_MATCH - VERSION_VAR ARROW_SUBSTRAIT_VERSION) -set(ARROW_SUBSTRAIT_FOUND ${ArrowSubstrait_FOUND}) - -if(ArrowSubstrait_FOUND AND NOT ArrowSubstrait_FIND_QUIETLY) - message(STATUS "Found the Arrow Substrait by ${ARROW_SUBSTRAIT_FIND_APPROACH}") - message(STATUS "Found the Arrow Substrait shared library: ${ARROW_SUBSTRAIT_SHARED_LIB}" - ) - message(STATUS "Found the Arrow Substrait import library: ${ARROW_SUBSTRAIT_IMPORT_LIB}" - ) - message(STATUS "Found the Arrow Substrait static library: ${ARROW_SUBSTRAIT_STATIC_LIB}" - ) -endif() diff --git 
a/cpp/cmake_modules/FindArrowTesting.cmake b/cpp/cmake_modules/FindArrowTesting.cmake deleted file mode 100644 index c405003ad70..00000000000 --- a/cpp/cmake_modules/FindArrowTesting.cmake +++ /dev/null @@ -1,89 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# - Find Arrow testing library -# (arrow/testing/util.h, libarrow_testing.a, libarrow_testing.so) -# -# This module requires Arrow from which it uses -# arrow_find_package() -# -# This module defines -# ARROW_TESTING_FOUND, whether Arrow testing library has been found -# ARROW_TESTING_IMPORT_LIB, -# path to libarrow_testing's import library (Windows only) -# ARROW_TESTING_INCLUDE_DIR, directory containing headers -# ARROW_TESTING_LIB_DIR, directory containing Arrow testing libraries -# ARROW_TESTING_SHARED_LIB, path to libarrow_testing's shared library -# ARROW_TESTING_STATIC_LIB, path to libarrow_testing.a - -if(DEFINED ARROW_TESTING_FOUND) - return() -endif() - -set(find_package_arguments) -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION) - list(APPEND find_package_arguments "${${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION}") -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED) - list(APPEND find_package_arguments REQUIRED) -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) - list(APPEND find_package_arguments QUIET) -endif() -find_package(Arrow ${find_package_arguments}) - -if(ARROW_FOUND) - arrow_find_package(ARROW_TESTING - "${ARROW_HOME}" - arrow_testing - arrow/testing/util.h - ArrowTesting - arrow-testing) - if(NOT ARROW_TESTING_VERSION) - set(ARROW_TESTING_VERSION "${ARROW_VERSION}") - endif() -endif() - -if("${ARROW_TESTING_VERSION}" VERSION_EQUAL "${ARROW_VERSION}") - set(ARROW_TESTING_VERSION_MATCH TRUE) -else() - set(ARROW_TESTING_VERSION_MATCH FALSE) -endif() - -mark_as_advanced(ARROW_TESTING_IMPORT_LIB - ARROW_TESTING_INCLUDE_DIR - ARROW_TESTING_LIBS - ARROW_TESTING_LIB_DIR - ARROW_TESTING_SHARED_IMP_LIB - ARROW_TESTING_SHARED_LIB - ARROW_TESTING_STATIC_LIB - ARROW_TESTING_VERSION - ARROW_TESTING_VERSION_MATCH) - -find_package_handle_standard_args( - ArrowTesting - REQUIRED_VARS ARROW_TESTING_INCLUDE_DIR ARROW_TESTING_LIB_DIR - ARROW_TESTING_VERSION_MATCH - VERSION_VAR ARROW_TESTING_VERSION) -set(ARROW_TESTING_FOUND ${ArrowTesting_FOUND}) - -if(ArrowTesting_FOUND AND NOT ArrowTesting_FIND_QUIETLY) - message(STATUS "Found the Arrow testing by ${ARROW_TESTING_FIND_APPROACH}") - message(STATUS "Found the Arrow testing shared library: ${ARROW_TESTING_SHARED_LIB}") - message(STATUS "Found the Arrow testing import library: ${ARROW_TESTING_IMPORT_LIB}") - message(STATUS "Found the Arrow testing static library: ${ARROW_TESTING_STATIC_LIB}") -endif() diff --git a/cpp/cmake_modules/FindBrotli.cmake b/cpp/cmake_modules/FindBrotliAlt.cmake similarity index 79% rename from 
cpp/cmake_modules/FindBrotli.cmake rename to cpp/cmake_modules/FindBrotliAlt.cmake index e2670b51a9e..3c90329be96 100644 --- a/cpp/cmake_modules/FindBrotli.cmake +++ b/cpp/cmake_modules/FindBrotliAlt.cmake @@ -15,7 +15,42 @@ # # Usage of this module as follows: # -# find_package(Brotli) +# find_package(BrotliAlt) + +if(BrotliAlt_FOUND) + return() +endif() + +if(ARROW_PACKAGE_KIND STREQUAL "vcpkg" OR ARROW_PACKAGE_KIND STREQUAL "conan") + set(find_package_args "") + if(BrotliAlt_FIND_VERSION) + list(APPEND find_package_args ${BrotliAlt_FIND_VERSION}) + endif() + if(BrotliAlt_FIND_QUIETLY) + list(APPEND find_package_args QUIET) + endif() + if(BrotliAlt_FIND_REQUIRED) + list(APPEND find_package_args REQUIRED) + endif() + if(ARROW_PACKAGE_KIND STREQUAL "vcpkg") + find_package(BrotliAlt NAMES unofficial-brotli ${find_package_args}) + else() + find_package(BrotliAlt NAMES brotli ${find_package_args}) + endif() + set(Brotli_FOUND ${BrotliAlt_FOUND}) + if(BrotliAlt_FOUND) + if(ARROW_PACKAGE_KIND STREQUAL "vcpkg") + add_library(Brotli::brotlicommon ALIAS unofficial::brotli::brotlicommon) + add_library(Brotli::brotlienc ALIAS unofficial::brotli::brotlienc) + add_library(Brotli::brotlidec ALIAS unofficial::brotli::brotlidec) + else() + add_library(Brotli::brotlicommon ALIAS brotli::brotlicommon) + add_library(Brotli::brotlienc ALIAS brotli::brotlienc) + add_library(Brotli::brotlidec ALIAS brotli::brotlidec) + endif() + return() + endif() +endif() if(ARROW_BROTLI_USE_SHARED) set(BROTLI_COMMON_LIB_NAMES @@ -111,10 +146,10 @@ else() endif() find_package_handle_standard_args( - Brotli REQUIRED_VARS BROTLI_COMMON_LIBRARY BROTLI_ENC_LIBRARY BROTLI_DEC_LIBRARY - BROTLI_INCLUDE_DIR) -if(Brotli_FOUND OR BROTLI_FOUND) - set(Brotli_FOUND TRUE) + BrotliAlt REQUIRED_VARS BROTLI_COMMON_LIBRARY BROTLI_ENC_LIBRARY BROTLI_DEC_LIBRARY + BROTLI_INCLUDE_DIR) +set(Brotli_FOUND ${BrotliAlt_FOUND}) +if(BrotliAlt_FOUND) add_library(Brotli::brotlicommon UNKNOWN IMPORTED) set_target_properties(Brotli::brotlicommon PROPERTIES IMPORTED_LOCATION "${BROTLI_COMMON_LIBRARY}" diff --git a/cpp/cmake_modules/FindClangTools.cmake b/cpp/cmake_modules/FindClangTools.cmake index da27d2afcdb..a00ff2c939d 100644 --- a/cpp/cmake_modules/FindClangTools.cmake +++ b/cpp/cmake_modules/FindClangTools.cmake @@ -41,8 +41,21 @@ set(CLANG_TOOLS_SEARCH_PATHS /usr/bin "C:/Program Files/LLVM/bin" # Windows, non-conda "$ENV{CONDA_PREFIX}/Library/bin") # Windows, conda -if(CLANG_TOOLS_BREW_PREFIX) - list(APPEND CLANG_TOOLS_SEARCH_PATHS "${CLANG_TOOLS_BREW}/bin") +if(APPLE) + find_program(BREW brew) + if(BREW) + execute_process(COMMAND ${BREW} --prefix "llvm@${ARROW_CLANG_TOOLS_VERSION_MAJOR}" + OUTPUT_VARIABLE CLANG_TOOLS_BREW_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT CLANG_TOOLS_BREW_PREFIX) + execute_process(COMMAND ${BREW} --prefix llvm + OUTPUT_VARIABLE CLANG_TOOLS_BREW_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE) + endif() + if(CLANG_TOOLS_BREW_PREFIX) + list(APPEND CLANG_TOOLS_SEARCH_PATHS "${CLANG_TOOLS_BREW_PREFIX}/bin") + endif() + endif() endif() function(FIND_CLANG_TOOL NAME OUTPUT VERSION_CHECK_PATTERN) diff --git a/cpp/cmake_modules/FindGLOG.cmake b/cpp/cmake_modules/FindGLOG.cmake index d67eb005621..61b7d0694ef 100644 --- a/cpp/cmake_modules/FindGLOG.cmake +++ b/cpp/cmake_modules/FindGLOG.cmake @@ -17,6 +17,10 @@ # # find_package(GLOG) +if(GLOG_FOUND) + return() +endif() + find_package(PkgConfig QUIET) pkg_check_modules(GLOG_PC libglog) if(GLOG_PC_FOUND) diff --git a/cpp/cmake_modules/FindGandiva.cmake 
b/cpp/cmake_modules/FindGandiva.cmake deleted file mode 100644 index c533abed733..00000000000 --- a/cpp/cmake_modules/FindGandiva.cmake +++ /dev/null @@ -1,94 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# - Find Gandiva (gandiva/arrow.h, libgandiva.a, libgandiva.so) -# -# This module requires Arrow from which it uses -# arrow_find_package() -# -# This module defines -# GANDIVA_FOUND, whether Gandiva has been found -# GANDIVA_IMPORT_LIB, path to libgandiva's import library (Windows only) -# GANDIVA_INCLUDE_DIR, directory containing headers -# GANDIVA_LIBS, deprecated. Use GANDIVA_LIB_DIR instead -# GANDIVA_LIB_DIR, directory containing Gandiva libraries -# GANDIVA_SHARED_IMP_LIB, deprecated. Use GANDIVA_IMPORT_LIB instead -# GANDIVA_SHARED_LIB, path to libgandiva's shared library -# GANDIVA_SO_VERSION, shared object version of found Gandiva such as "100" -# GANDIVA_STATIC_LIB, path to libgandiva.a - -if(DEFINED GANDIVA_FOUND) - return() -endif() - -set(find_package_arguments) -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION) - list(APPEND find_package_arguments "${${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION}") -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED) - list(APPEND find_package_arguments REQUIRED) -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) - list(APPEND find_package_arguments QUIET) -endif() -find_package(Arrow ${find_package_arguments}) - -if(ARROW_FOUND) - arrow_find_package(GANDIVA - "${ARROW_HOME}" - gandiva - gandiva/arrow.h - Gandiva - gandiva) - if(NOT GANDIVA_VERSION) - set(GANDIVA_VERSION "${ARROW_VERSION}") - endif() - set(GANDIVA_ABI_VERSION "${ARROW_ABI_VERSION}") - set(GANDIVA_SO_VERSION "${ARROW_SO_VERSION}") -endif() - -if("${GANDIVA_VERSION}" VERSION_EQUAL "${ARROW_VERSION}") - set(GANDIVA_VERSION_MATCH TRUE) -else() - set(GANDIVA_VERSION_MATCH FALSE) -endif() - -mark_as_advanced(GANDIVA_ABI_VERSION - GANDIVA_IMPORT_LIB - GANDIVA_INCLUDE_DIR - GANDIVA_LIBS - GANDIVA_LIB_DIR - GANDIVA_SHARED_IMP_LIB - GANDIVA_SHARED_LIB - GANDIVA_SO_VERSION - GANDIVA_STATIC_LIB - GANDIVA_VERSION - GANDIVA_VERSION_MATCH) - -find_package_handle_standard_args( - Gandiva - REQUIRED_VARS GANDIVA_INCLUDE_DIR GANDIVA_LIB_DIR GANDIVA_SO_VERSION - GANDIVA_VERSION_MATCH - VERSION_VAR GANDIVA_VERSION) -set(GANDIVA_FOUND ${Gandiva_FOUND}) - -if(Gandiva_FOUND AND NOT Gandiva_FIND_QUIETLY) - message(STATUS "Found the Gandiva by ${GANDIVA_FIND_APPROACH}") - message(STATUS "Found the Gandiva shared library: ${GANDIVA_SHARED_LIB}") - message(STATUS "Found the Gandiva import library: ${GANDIVA_IMPORT_LIB}") - message(STATUS "Found the Gandiva static library: ${GANDIVA_STATIC_LIB}") -endif() diff --git a/cpp/cmake_modules/FindLLVMAlt.cmake b/cpp/cmake_modules/FindLLVMAlt.cmake index e96e89850ae..c44c4802284 100644 --- 
a/cpp/cmake_modules/FindLLVMAlt.cmake +++ b/cpp/cmake_modules/FindLLVMAlt.cmake @@ -19,6 +19,10 @@ # # find_package(LLVMAlt) +if(LLVMAlt_FOUND) + return() +endif() + if(DEFINED LLVM_ROOT) # if llvm source is set to conda then prefer conda llvm over system llvm even # if the system one is newer @@ -36,22 +40,30 @@ if(DEFINED LLVM_ROOT) endif() if(NOT LLVM_FOUND) - set(LLVM_HINTS ${LLVM_ROOT} ${LLVM_DIR} /usr/lib /usr/share) - if(LLVM_BREW_PREFIX) - list(APPEND LLVM_HINTS ${LLVM_BREW_PREFIX}) - endif() + foreach(ARROW_LLVM_VERSION ${ARROW_LLVM_VERSIONS}) + set(LLVM_HINTS ${LLVM_ROOT} ${LLVM_DIR} /usr/lib /usr/share) - foreach(HINT ${LLVM_HINTS}) - foreach(ARROW_LLVM_VERSION ${ARROW_LLVM_VERSIONS}) - find_package(LLVM - ${ARROW_LLVM_VERSION} - CONFIG - HINTS - ${HINT}) - if(LLVM_FOUND) - break() + if(APPLE) + find_program(BREW brew) + if(BREW) + string(REGEX REPLACE "^([0-9]+)(\\..+)?" "\\1" ARROW_LLVM_VERSION_MAJOR + "${ARROW_LLVM_VERSION}") + execute_process(COMMAND ${BREW} --prefix "llvm@${ARROW_LLVM_VERSION_MAJOR}" + OUTPUT_VARIABLE LLVM_BREW_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE) + list(APPEND LLVM_HINTS ${LLVM_BREW_PREFIX}) endif() - endforeach() + endif() + + find_package(LLVM + ${ARROW_LLVM_VERSION} + CONFIG + HINTS + ${LLVM_HINTS}) + + if(LLVM_FOUND) + break() + endif() endforeach() endif() @@ -76,12 +88,14 @@ if(LLVM_FOUND) clang-${LLVM_VERSION_MAJOR} clang HINTS ${LLVM_TOOLS_BINARY_DIR}) - add_library(LLVM::LLVM_INTERFACE INTERFACE IMPORTED) - - set_target_properties(LLVM::LLVM_INTERFACE + add_library(LLVM::LLVM_HEADERS INTERFACE IMPORTED) + set_target_properties(LLVM::LLVM_HEADERS PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LLVM_INCLUDE_DIRS}" - INTERFACE_COMPILE_FLAGS "${LLVM_DEFINITIONS}" - INTERFACE_LINK_LIBRARIES "${LLVM_LIBS}") + INTERFACE_COMPILE_FLAGS "${LLVM_DEFINITIONS}") + + add_library(LLVM::LLVM_LIBS INTERFACE IMPORTED) + set_target_properties(LLVM::LLVM_LIBS PROPERTIES INTERFACE_LINK_LIBRARIES + "${LLVM_LIBS}") endif() mark_as_advanced(CLANG_EXECUTABLE LLVM_LINK_EXECUTABLE) diff --git a/cpp/cmake_modules/FindNumPy.cmake b/cpp/cmake_modules/FindNumPy.cmake index c3daba149fd..cdca68a5f24 100644 --- a/cpp/cmake_modules/FindNumPy.cmake +++ b/cpp/cmake_modules/FindNumPy.cmake @@ -94,3 +94,13 @@ find_package_message(NUMPY "${NUMPY_INCLUDE_DIRS}${NUMPY_VERSION}") set(NUMPY_FOUND TRUE) + +add_library(Python3::NumPy INTERFACE IMPORTED) +if(CMAKE_VERSION VERSION_LESS 3.11) + set_target_properties(Python3::NumPy PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${NUMPY_INCLUDE_DIRS}" + INTERFACE_LINK_LIBRARIES Python3::Module) +else() + target_include_directories(Python3::NumPy INTERFACE ${NUMPY_INCLUDE_DIRS}) + target_link_libraries(Python3::NumPy INTERFACE Python3::Module) +endif() diff --git a/cpp/cmake_modules/FindORC.cmake b/cpp/cmake_modules/FindORC.cmake index d45b1607833..aca915acc13 100644 --- a/cpp/cmake_modules/FindORC.cmake +++ b/cpp/cmake_modules/FindORC.cmake @@ -21,6 +21,10 @@ # ORC_STATIC_LIB, path to liborc.a # ORC_FOUND, whether orc has been found +if(ORC_FOUND) + return() +endif() + if(ORC_ROOT) find_library(ORC_STATIC_LIB NAMES orc diff --git a/cpp/cmake_modules/FindOpenSSLAlt.cmake b/cpp/cmake_modules/FindOpenSSLAlt.cmake index 603e7d066ed..f027eb1026d 100644 --- a/cpp/cmake_modules/FindOpenSSLAlt.cmake +++ b/cpp/cmake_modules/FindOpenSSLAlt.cmake @@ -15,40 +15,41 @@ # specific language governing permissions and limitations # under the License. -if(ARROW_OPENSSL_USE_SHARED) - # Find shared OpenSSL libraries. 
- set(OpenSSL_USE_STATIC_LIBS OFF) - set(OPENSSL_USE_STATIC_LIBS OFF) - find_package(OpenSSL) -else() - # Find static OpenSSL headers and libs - set(OpenSSL_USE_STATIC_LIBS ON) - set(OPENSSL_USE_STATIC_LIBS ON) - find_package(OpenSSL) -endif() - -if(OPENSSL_FOUND) - message(STATUS "OpenSSL found with ${OPENSSL_VERSION} version") - if(OPENSSL_VERSION LESS "1.1.0") - message(SEND_ERROR "The OpenSSL must be greater than or equal to 1.1.0") - endif() -else() - message(SEND_ERROR "Not found the OpenSSL library") +if(OpenSSLAlt_FOUND) + return() endif() -if(NOT GANDIVA_OPENSSL_LIBS) - if(WIN32) - if(CMAKE_VERSION VERSION_LESS 3.18) - set(GANDIVA_OPENSSL_LIBS OpenSSL::Crypto OpenSSL::SSL) +if(APPLE AND NOT OPENSSL_ROOT_DIR) + find_program(BREW brew) + if(BREW) + execute_process(COMMAND ${BREW} --prefix "openssl@1.1" + OUTPUT_VARIABLE OPENSSL11_BREW_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(OPENSSL11_BREW_PREFIX) + set(OPENSSL_ROOT_DIR ${OPENSSL11_BREW_PREFIX}) else() - set(GANDIVA_OPENSSL_LIBS OpenSSL::Crypto OpenSSL::SSL OpenSSL::applink) + execute_process(COMMAND ${BREW} --prefix "openssl" + OUTPUT_VARIABLE OPENSSL_BREW_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(OPENSSL_BREW_PREFIX) + set(OPENSSL_ROOT_DIR ${OPENSSL_BREW_PREFIX}) + endif() endif() - else() - set(GANDIVA_OPENSSL_LIBS OpenSSL::Crypto OpenSSL::SSL) endif() endif() -if(NOT GANDIVA_OPENSSL_INCLUDE_DIR) - set(GANDIVA_OPENSSL_INCLUDE_DIR ${OPENSSL_INCLUDE_DIR}) - message(STATUS "OpenSSL include dir: ${GANDIVA_OPENSSL_INCLUDE_DIR}") +set(find_package_args) +if(OpenSSLAlt_FIND_VERSION) + list(APPEND find_package_args ${OpenSSLAlt_FIND_VERSION}) endif() +if(OpenSSLAlt_FIND_QUIETLY) + list(APPEND find_package_args QUIET) +endif() +if(ARROW_OPENSSL_USE_SHARED) + set(OPENSSL_USE_STATIC_LIBS OFF) +else() + set(OPENSSL_USE_STATIC_LIBS ON) +endif() +find_package(OpenSSL ${find_package_args}) + +set(OpenSSLAlt_FOUND ${OPENSSL_FOUND}) diff --git a/cpp/cmake_modules/FindParquet.cmake b/cpp/cmake_modules/FindParquet.cmake deleted file mode 100644 index e071fc822b6..00000000000 --- a/cpp/cmake_modules/FindParquet.cmake +++ /dev/null @@ -1,126 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# - Find Parquet (parquet/api/reader.h, libparquet.a, libparquet.so) -# -# This module requires Arrow from which it uses -# arrow_find_package() -# -# This module defines -# PARQUET_FOUND, whether Parquet has been found -# PARQUET_IMPORT_LIB, path to libparquet's import library (Windows only) -# PARQUET_INCLUDE_DIR, directory containing headers -# PARQUET_LIBS, deprecated. Use PARQUET_LIB_DIR instead -# PARQUET_LIB_DIR, directory containing Parquet libraries -# PARQUET_SHARED_IMP_LIB, deprecated. 
Use PARQUET_IMPORT_LIB instead -# PARQUET_SHARED_LIB, path to libparquet's shared library -# PARQUET_SO_VERSION, shared object version of found Parquet such as "100" -# PARQUET_STATIC_LIB, path to libparquet.a - -if(DEFINED PARQUET_FOUND) - return() -endif() - -set(find_package_arguments) -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION) - list(APPEND find_package_arguments "${${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION}") -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED) - list(APPEND find_package_arguments REQUIRED) -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) - list(APPEND find_package_arguments QUIET) -endif() -find_package(Arrow ${find_package_arguments}) - -if(NOT "$ENV{PARQUET_HOME}" STREQUAL "") - file(TO_CMAKE_PATH "$ENV{PARQUET_HOME}" PARQUET_HOME) -endif() - -if((NOT PARQUET_HOME) AND ARROW_HOME) - set(PARQUET_HOME ${ARROW_HOME}) -endif() - -if(ARROW_FOUND) - arrow_find_package(PARQUET - "${PARQUET_HOME}" - parquet - parquet/api/reader.h - Parquet - parquet) - if(PARQUET_HOME) - if(PARQUET_INCLUDE_DIR) - file(READ "${PARQUET_INCLUDE_DIR}/parquet/parquet_version.h" - PARQUET_VERSION_H_CONTENT) - arrow_extract_macro_value(PARQUET_VERSION_MAJOR "PARQUET_VERSION_MAJOR" - "${PARQUET_VERSION_H_CONTENT}") - arrow_extract_macro_value(PARQUET_VERSION_MINOR "PARQUET_VERSION_MINOR" - "${PARQUET_VERSION_H_CONTENT}") - arrow_extract_macro_value(PARQUET_VERSION_PATCH "PARQUET_VERSION_PATCH" - "${PARQUET_VERSION_H_CONTENT}") - if("${PARQUET_VERSION_MAJOR}" STREQUAL "" - OR "${PARQUET_VERSION_MINOR}" STREQUAL "" - OR "${PARQUET_VERSION_PATCH}" STREQUAL "") - set(PARQUET_VERSION "0.0.0") - else() - set(PARQUET_VERSION - "${PARQUET_VERSION_MAJOR}.${PARQUET_VERSION_MINOR}.${PARQUET_VERSION_PATCH}") - endif() - - arrow_extract_macro_value(PARQUET_SO_VERSION_QUOTED "PARQUET_SO_VERSION" - "${PARQUET_VERSION_H_CONTENT}") - string(REGEX REPLACE "^\"(.+)\"$" "\\1" PARQUET_SO_VERSION - "${PARQUET_SO_VERSION_QUOTED}") - arrow_extract_macro_value(PARQUET_FULL_SO_VERSION_QUOTED "PARQUET_FULL_SO_VERSION" - "${PARQUET_VERSION_H_CONTENT}") - string(REGEX REPLACE "^\"(.+)\"$" "\\1" PARQUET_FULL_SO_VERSION - "${PARQUET_FULL_SO_VERSION_QUOTED}") - endif() - else() - if(PARQUET_USE_CMAKE_PACKAGE_CONFIG) - find_package(Parquet CONFIG) - elseif(PARQUET_USE_PKG_CONFIG) - pkg_get_variable(PARQUET_SO_VERSION parquet so_version) - pkg_get_variable(PARQUET_FULL_SO_VERSION parquet full_so_version) - endif() - endif() - set(PARQUET_ABI_VERSION "${PARQUET_SO_VERSION}") -endif() - -mark_as_advanced(PARQUET_ABI_VERSION - PARQUET_IMPORT_LIB - PARQUET_INCLUDE_DIR - PARQUET_LIBS - PARQUET_LIB_DIR - PARQUET_SHARED_IMP_LIB - PARQUET_SHARED_LIB - PARQUET_SO_VERSION - PARQUET_STATIC_LIB - PARQUET_VERSION) - -find_package_handle_standard_args( - Parquet - REQUIRED_VARS PARQUET_INCLUDE_DIR PARQUET_LIB_DIR PARQUET_SO_VERSION - VERSION_VAR PARQUET_VERSION) -set(PARQUET_FOUND ${Parquet_FOUND}) - -if(Parquet_FOUND AND NOT Parquet_FIND_QUIETLY) - message(STATUS "Parquet version: ${PARQUET_VERSION} (${PARQUET_FIND_APPROACH})") - message(STATUS "Found the Parquet shared library: ${PARQUET_SHARED_LIB}") - message(STATUS "Found the Parquet import library: ${PARQUET_IMPORT_LIB}") - message(STATUS "Found the Parquet static library: ${PARQUET_STATIC_LIB}") -endif() diff --git a/cpp/cmake_modules/FindPlasma.cmake b/cpp/cmake_modules/FindPlasma.cmake deleted file mode 100644 index 2e634844c59..00000000000 --- a/cpp/cmake_modules/FindPlasma.cmake +++ /dev/null @@ -1,102 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or 
more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# - Find Plasma (plasma/client.h, libplasma.a, libplasma.so) -# -# This module requires Arrow from which it uses -# arrow_find_package() -# -# This module defines -# PLASMA_EXECUTABLE, deprecated. Use PLASMA_STORE_SERVER instead -# PLASMA_FOUND, whether Plasma has been found -# PLASMA_IMPORT_LIB, path to libplasma's import library (Windows only) -# PLASMA_INCLUDE_DIR, directory containing headers -# PLASMA_LIBS, deprecated. Use PLASMA_LIB_DIR instead -# PLASMA_LIB_DIR, directory containing Plasma libraries -# PLASMA_SHARED_IMP_LIB, deprecated. Use PLASMA_IMPORT_LIB instead -# PLASMA_SHARED_LIB, path to libplasma's shared library -# PLASMA_SO_VERSION, shared object version of found Plasma such as "100" -# PLASMA_STATIC_LIB, path to libplasma.a -# PLASMA_STORE_SERVER, path to plasma-store-server - -if(DEFINED PLASMA_FOUND) - return() -endif() - -set(find_package_arguments) -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION) - list(APPEND find_package_arguments "${${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION}") -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED) - list(APPEND find_package_arguments REQUIRED) -endif() -if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) - list(APPEND find_package_arguments QUIET) -endif() -find_package(Arrow ${find_package_arguments}) - -if(ARROW_FOUND) - arrow_find_package(PLASMA - "${ARROW_HOME}" - plasma - plasma/client.h - Plasma - plasma) - if(ARROW_HOME) - set(PLASMA_STORE_SERVER - ${ARROW_HOME}/bin/plasma-store-server${CMAKE_EXECUTABLE_SUFFIX}) - else() - if(PLASMA_USE_CMAKE_PACKAGE_CONFIG) - find_package(Plasma CONFIG) - elseif(PLASMA_USE_PKG_CONFIG) - pkg_get_variable(PLASMA_STORE_SERVER plasma plasma_store_server) - endif() - endif() - set(PLASMA_VERSION "${ARROW_VERSION}") - set(PLASMA_SO_VERSION "${ARROW_SO_VERSION}") - set(PLASMA_ABI_VERSION "${PLASMA_SO_VERSION}") - # For backward compatibility - set(PLASMA_EXECUTABLE "${PLASMA_STORE_SERVER}") - set(PLASMA_LIBS "${PLASMA_LIB_DIR}") -endif() - -mark_as_advanced(PLASMA_ABI_VERSION - PLASMA_EXECUTABLE - PLASMA_IMPORT_LIB - PLASMA_INCLUDE_DIR - PLASMA_LIBS - PLASMA_LIB_DIR - PLASMA_SHARED_IMP_LIB - PLASMA_SHARED_LIB - PLASMA_SO_VERSION - PLASMA_STATIC_LIB - PLASMA_STORE_SERVER - PLASMA_VERSION) - -find_package_handle_standard_args( - Plasma - REQUIRED_VARS PLASMA_INCLUDE_DIR PLASMA_LIB_DIR PLASMA_SO_VERSION PLASMA_STORE_SERVER - VERSION_VAR PLASMA_VERSION) -set(PLASMA_FOUND ${Plasma_FOUND}) - -if(Plasma_FOUND AND NOT Plasma_FIND_QUIETLY) - message(STATUS "Found the Plasma by ${PLASMA_FIND_APPROACH}") - message(STATUS "Found the plasma-store-server: ${PLASMA_STORE_SERVER}") - message(STATUS "Found the Plasma shared library: ${PLASMA_SHARED_LIB}") - message(STATUS "Found the Plasma import library: ${PLASMA_IMPORT_LIB}") - message(STATUS "Found the Plasma static library: ${PLASMA_STATIC_LIB}") -endif() 
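The Find*Alt.cmake modules introduced and reworked below (FindProtobufAlt.cmake just after this, FindSnappyAlt.cmake, FindThriftAlt.cmake, FindzstdAlt.cmake, and others) all share one thin wrapper shape. A minimal sketch of that shape follows, assuming a hypothetical dependency named Foo; FooAlt and Foo are placeholder names, not real packages or modules in this patch.

# Sketch of the shared Find<Pkg>Alt.cmake wrapper pattern (illustrative only).
# Re-entry guard: the module may be probed more than once by resolve_dependency().
if(FooAlt_FOUND)
  return()
endif()

# Forward the version and QUIET arguments that were given to find_package(FooAlt ...).
set(find_package_args)
if(FooAlt_FIND_VERSION)
  list(APPEND find_package_args ${FooAlt_FIND_VERSION})
endif()
if(FooAlt_FIND_QUIETLY)
  list(APPEND find_package_args QUIET)
endif()

# Delegate to the upstream package's own CMake config or find module,
# then mirror the result into the *Alt variable that callers check.
find_package(Foo ${find_package_args})
set(FooAlt_FOUND ${Foo_FOUND})

The guard at the top makes the module safe to re-enter, and forwarding the arguments keeps version and QUIET behaviour consistent with the caller's original find_package() invocation; the real modules below add package-specific fallbacks (pkg-config lookups, imported targets) after this common prologue.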
diff --git a/cpp/src/arrow/python/util/CMakeLists.txt b/cpp/cmake_modules/FindProtobufAlt.cmake similarity index 64% rename from cpp/src/arrow/python/util/CMakeLists.txt rename to cpp/cmake_modules/FindProtobufAlt.cmake index 74141bebc8b..d29f757aeb6 100644 --- a/cpp/src/arrow/python/util/CMakeLists.txt +++ b/cpp/cmake_modules/FindProtobufAlt.cmake @@ -15,18 +15,18 @@ # specific language governing permissions and limitations # under the License. -# -# arrow/python_test_main -# - -if(PYARROW_BUILD_TESTS) - add_library(arrow/python_test_main STATIC test_main.cc) +if(ARROW_PROTOBUF_USE_SHARED) + set(Protobuf_USE_STATIC_LIBS OFF) +else() + set(Protobuf_USE_STATIC_LIBS ON) +endif() - if(APPLE) - target_link_libraries(arrow/python_test_main GTest::gtest dl) - set_target_properties(arrow/python_test_main PROPERTIES LINK_FLAGS - "-undefined dynamic_lookup") - else() - target_link_libraries(arrow/python_test_main GTest::gtest pthread dl) - endif() +set(find_package_args) +if(ProtobufAlt_FIND_VERSION) + list(APPEND find_package_args ${ProtobufAlt_FIND_VERSION}) +endif() +if(ProtobufAlt_FIND_QUIETLY) + list(APPEND find_package_args QUIET) endif() +find_package(Protobuf ${find_package_args}) +set(ProtobufAlt_FOUND ${Protobuf_FOUND}) diff --git a/cpp/cmake_modules/FindPython3Alt.cmake b/cpp/cmake_modules/FindPython3Alt.cmake index b003bb6a46f..0cc7fba3997 100644 --- a/cpp/cmake_modules/FindPython3Alt.cmake +++ b/cpp/cmake_modules/FindPython3Alt.cmake @@ -23,6 +23,10 @@ # - PYTHON_OTHER_LIBS # - NUMPY_INCLUDE_DIRS +if(Python3Alt_FOUND) + return() +endif() + set(Python3Alt_FIND_PACKAGE_OPTIONS) set(Python3Alt_NumPy_FIND_PACKAGE_OPTIONS) if(Python3Alt_FIND_VERSION) diff --git a/cpp/cmake_modules/FindPythonLibsNew.cmake b/cpp/cmake_modules/FindPythonLibsNew.cmake index 581bba9d4ca..b13cb35c9c4 100644 --- a/cpp/cmake_modules/FindPythonLibsNew.cmake +++ b/cpp/cmake_modules/FindPythonLibsNew.cmake @@ -217,6 +217,16 @@ find_package_message(PYTHON "Found PythonLibs: ${PYTHON_LIBRARY}" "${PYTHON_EXECUTABLE}${PYTHON_VERSION}") +add_library(Python3::Module SHARED IMPORTED) +if(CMAKE_VERSION VERSION_LESS 3.11) + set_target_properties(Python3::Module PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + ${PYTHON_INCLUDE_DIRS}) +else() + target_include_directories(Python3::Module INTERFACE ${PYTHON_INCLUDE_DIRS}) +endif() +set_target_properties(Python3::Module PROPERTIES + IMPORTED_LOCATION "${PYTHON_LIBRARIES}" + IMPORTED_IMPLIB "${PYTHON_LIBRARIES}") # PYTHON_ADD_MODULE( src1 src2 ... srcN) is used to build modules for python. FUNCTION(PYTHON_ADD_MODULE _NAME ) diff --git a/cpp/cmake_modules/FindRapidJSONAlt.cmake b/cpp/cmake_modules/FindRapidJSONAlt.cmake index 9a449a5280e..ef5acf18b82 100644 --- a/cpp/cmake_modules/FindRapidJSONAlt.cmake +++ b/cpp/cmake_modules/FindRapidJSONAlt.cmake @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. 
+if(RapidJSONAlt_FOUND) + return() +endif() + set(find_package_args) if(RapidJSONAlt_FIND_VERSION) list(APPEND find_package_args ${RapidJSONAlt_FIND_VERSION}) diff --git a/cpp/cmake_modules/FindSQLite3Alt.cmake b/cpp/cmake_modules/FindSQLite3Alt.cmake index 73a45f098c6..b60939841ef 100644 --- a/cpp/cmake_modules/FindSQLite3Alt.cmake +++ b/cpp/cmake_modules/FindSQLite3Alt.cmake @@ -26,6 +26,10 @@ # Usage of this module as follows: # find_package(SQLite3Alt) +if(FindSQLite3Alt_FOUND) + return() +endif() + find_path(SQLite3_INCLUDE_DIR sqlite3.h) find_library(SQLite3_LIBRARY NAMES sqlite3) diff --git a/cpp/cmake_modules/FindSnappyAlt.cmake b/cpp/cmake_modules/FindSnappyAlt.cmake index aee5eac4bc7..4d313400647 100644 --- a/cpp/cmake_modules/FindSnappyAlt.cmake +++ b/cpp/cmake_modules/FindSnappyAlt.cmake @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. +if(SnappyAlt_FOUND) + return() +endif() + set(find_package_args) if(SnappyAlt_FIND_VERSION) list(APPEND find_package_args ${SnappyAlt_FIND_VERSION}) diff --git a/cpp/cmake_modules/FindThrift.cmake b/cpp/cmake_modules/FindThriftAlt.cmake similarity index 55% rename from cpp/cmake_modules/FindThrift.cmake rename to cpp/cmake_modules/FindThriftAlt.cmake index 07028971d9f..f3e49021d57 100644 --- a/cpp/cmake_modules/FindThrift.cmake +++ b/cpp/cmake_modules/FindThriftAlt.cmake @@ -28,21 +28,56 @@ # thrift::thrift, a library target to use Thrift # thrift::compiler, a executable target to use Thrift compiler -function(EXTRACT_THRIFT_VERSION) - if(THRIFT_INCLUDE_DIR) - file(READ "${THRIFT_INCLUDE_DIR}/thrift/config.h" THRIFT_CONFIG_H_CONTENT) +if(ThriftAlt_FOUND) + return() +endif() + +# There are some problems in ThriftConfig.cmake provided by MSYS2 and +# conda on Windows: +# +# * https://github.com/conda-forge/thrift-cpp-feedstock/issues/68 +# * https://github.com/msys2/MINGW-packages/issues/6619#issuecomment-649728718 +# +# We can remove the following "if(NOT WIN32)" condition once the +# followings are fixed and a new version that includes these fixes is +# published by MSYS2 and conda: +# +# * https://github.com/apache/thrift/pull/2725 +# * https://github.com/apache/thrift/pull/2726 +# * https://github.com/conda-forge/thrift-cpp-feedstock/issues/68 +if(NOT WIN32) + set(find_package_args "") + if(ThriftAlt_FIND_VERSION) + list(APPEND find_package_args ${ThriftAlt_FIND_VERSION}) + endif() + if(ThriftAlt_FIND_QUIETLY) + list(APPEND find_package_args QUIET) + endif() + find_package(Thrift ${find_package_args}) + if(Thrift_FOUND) + set(ThriftAlt_FOUND TRUE) + add_executable(thrift::compiler IMPORTED) + set_target_properties(thrift::compiler PROPERTIES IMPORTED_LOCATION + "${THRIFT_COMPILER}") + return() + endif() +endif() + +function(extract_thrift_version) + if(ThriftAlt_INCLUDE_DIR) + file(READ "${ThriftAlt_INCLUDE_DIR}/thrift/config.h" THRIFT_CONFIG_H_CONTENT) string(REGEX MATCH "#define PACKAGE_VERSION \"[0-9.]+\"" THRIFT_VERSION_DEFINITION "${THRIFT_CONFIG_H_CONTENT}") - string(REGEX MATCH "[0-9.]+" Thrift_VERSION "${THRIFT_VERSION_DEFINITION}") - set(Thrift_VERSION - "${Thrift_VERSION}" + string(REGEX MATCH "[0-9.]+" ThriftAlt_VERSION "${THRIFT_VERSION_DEFINITION}") + set(ThriftAlt_VERSION + "${ThriftAlt_VERSION}" PARENT_SCOPE) else() - set(Thrift_VERSION + set(ThriftAlt_VERSION "" PARENT_SCOPE) endif() -endfunction(EXTRACT_THRIFT_VERSION) +endfunction() if(MSVC_TOOLCHAIN AND NOT DEFINED THRIFT_MSVC_LIB_SUFFIX) if(NOT ARROW_THRIFT_USE_SHARED) @@ -61,32 +96,30 @@ if(MSVC_TOOLCHAIN AND NOT DEFINED 
THRIFT_MSVC_LIB_SUFFIX) endif() endif() endif() -set(THRIFT_LIB_NAME_BASE "thrift${THRIFT_MSVC_LIB_SUFFIX}") +set(ThriftAlt_LIB_NAME_BASE "thrift${THRIFT_MSVC_LIB_SUFFIX}") if(ARROW_THRIFT_USE_SHARED) - set(THRIFT_LIB_NAMES thrift) if(CMAKE_IMPORT_LIBRARY_SUFFIX) - list(APPEND - THRIFT_LIB_NAMES - "${CMAKE_IMPORT_LIBRARY_PREFIX}${THRIFT_LIB_NAME_BASE}${CMAKE_IMPORT_LIBRARY_SUFFIX}" + set(ThriftAlt_LIB_NAME + "${CMAKE_IMPORT_LIBRARY_PREFIX}${ThriftAlt_LIB_NAME_BASE}${CMAKE_IMPORT_LIBRARY_SUFFIX}" + ) + else() + set(ThriftAlt_LIB_NAME + "${CMAKE_SHARED_LIBRARY_PREFIX}${ThriftAlt_LIB_NAME_BASE}${CMAKE_SHARED_LIBRARY_SUFFIX}" ) endif() - list(APPEND - THRIFT_LIB_NAMES - "${CMAKE_SHARED_LIBRARY_PREFIX}${THRIFT_LIB_NAME_BASE}${CMAKE_SHARED_LIBRARY_SUFFIX}" - ) else() - set(THRIFT_LIB_NAMES - "${CMAKE_STATIC_LIBRARY_PREFIX}${THRIFT_LIB_NAME_BASE}${CMAKE_STATIC_LIBRARY_SUFFIX}" + set(ThriftAlt_LIB_NAME + "${CMAKE_STATIC_LIBRARY_PREFIX}${ThriftAlt_LIB_NAME_BASE}${CMAKE_STATIC_LIBRARY_SUFFIX}" ) endif() if(Thrift_ROOT) - find_library(THRIFT_LIB - NAMES ${THRIFT_LIB_NAMES} + find_library(ThriftAlt_LIB + NAMES ${ThriftAlt_LIB_NAME} PATHS ${Thrift_ROOT} PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib") - find_path(THRIFT_INCLUDE_DIR thrift/Thrift.h + find_path(ThriftAlt_INCLUDE_DIR thrift/Thrift.h PATHS ${Thrift_ROOT} PATH_SUFFIXES "include") find_program(THRIFT_COMPILER thrift @@ -99,24 +132,24 @@ else() find_package(PkgConfig QUIET) pkg_check_modules(THRIFT_PC thrift) if(THRIFT_PC_FOUND) - set(THRIFT_INCLUDE_DIR "${THRIFT_PC_INCLUDEDIR}") + set(ThriftAlt_INCLUDE_DIR "${THRIFT_PC_INCLUDEDIR}") list(APPEND THRIFT_PC_LIBRARY_DIRS "${THRIFT_PC_LIBDIR}") - find_library(THRIFT_LIB - NAMES ${THRIFT_LIB_NAMES} + find_library(ThriftAlt_LIB + NAMES ${ThriftAlt_LIB_NAME} PATHS ${THRIFT_PC_LIBRARY_DIRS} NO_DEFAULT_PATH) find_program(THRIFT_COMPILER thrift HINTS ${THRIFT_PC_PREFIX} NO_DEFAULT_PATH PATH_SUFFIXES "bin") - set(Thrift_VERSION ${THRIFT_PC_VERSION}) + set(ThriftAlt_VERSION ${THRIFT_PC_VERSION}) else() - find_library(THRIFT_LIB - NAMES ${THRIFT_LIB_NAMES} + find_library(ThriftAlt_LIB + NAMES ${ThriftAlt_LIB_NAME} PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib") - find_path(THRIFT_INCLUDE_DIR thrift/Thrift.h PATH_SUFFIXES "include") + find_path(ThriftAlt_INCLUDE_DIR thrift/Thrift.h PATH_SUFFIXES "include") find_program(THRIFT_COMPILER thrift PATH_SUFFIXES "bin") extract_thrift_version() endif() @@ -129,20 +162,29 @@ else() endif() find_package_handle_standard_args( - Thrift - REQUIRED_VARS THRIFT_LIB THRIFT_INCLUDE_DIR - VERSION_VAR Thrift_VERSION + ThriftAlt + REQUIRED_VARS ThriftAlt_LIB ThriftAlt_INCLUDE_DIR + VERSION_VAR ThriftAlt_VERSION HANDLE_COMPONENTS) -if(Thrift_FOUND) - if(ARROW_THRIFT_USE_SHARED) - add_library(thrift::thrift SHARED IMPORTED) - else() - add_library(thrift::thrift STATIC IMPORTED) +if(ThriftAlt_FOUND) + set(Thrift_VERSION ${ThriftAlt_VERSION}) + set(ThriftAlt_IMPORTED_PROPERTY_NAME IMPORTED_LOCATION) + # Reuse partially defined thrift::thrift by ThriftConfig.cmake. 
+ if(NOT TARGET thrift::thrift) + if(ARROW_THRIFT_USE_SHARED) + add_library(thrift::thrift SHARED IMPORTED) + if(CMAKE_IMPORT_LIBRARY_SUFFIX) + set(ThriftAlt_IMPORTED_PROPERTY_NAME IMPORTED_IMPLIB) + endif() + else() + add_library(thrift::thrift STATIC IMPORTED) + endif() endif() set_target_properties(thrift::thrift - PROPERTIES IMPORTED_LOCATION "${THRIFT_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${THRIFT_INCLUDE_DIR}") + PROPERTIES ${ThriftAlt_IMPORTED_PROPERTY_NAME} "${ThriftAlt_LIB}" + INTERFACE_INCLUDE_DIRECTORIES + "${ThriftAlt_INCLUDE_DIR}") if(WIN32 AND NOT MSVC_TOOLCHAIN) # We don't need this for Visual C++ because Thrift uses # "#pragma comment(lib, "Ws2_32.lib")" in diff --git a/cpp/cmake_modules/Findc-aresAlt.cmake b/cpp/cmake_modules/Findc-aresAlt.cmake index 5213e8d12a1..152c843e373 100644 --- a/cpp/cmake_modules/Findc-aresAlt.cmake +++ b/cpp/cmake_modules/Findc-aresAlt.cmake @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. +if(c-aresAlt_FOUND) + return() +endif() + set(find_package_args) if(c-aresAlt_FIND_VERSION) list(APPEND find_package_args ${c-aresAlt_FIND_VERSION}) diff --git a/cpp/cmake_modules/FindgRPCAlt.cmake b/cpp/cmake_modules/FindgRPCAlt.cmake index 9bef477c13d..4e38605235b 100644 --- a/cpp/cmake_modules/FindgRPCAlt.cmake +++ b/cpp/cmake_modules/FindgRPCAlt.cmake @@ -11,6 +11,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +if(gRPCAlt_FOUND) + return() +endif() + set(find_package_args) if(gRPCAlt_FIND_VERSION) list(APPEND find_package_args ${gRPCAlt_FIND_VERSION}) diff --git a/cpp/cmake_modules/FindgflagsAlt.cmake b/cpp/cmake_modules/FindgflagsAlt.cmake index e092ea3e9b9..40733ee9bc4 100644 --- a/cpp/cmake_modules/FindgflagsAlt.cmake +++ b/cpp/cmake_modules/FindgflagsAlt.cmake @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. 
+if(gflagsAlt_FOUND) + return() +endif() + set(find_package_args) if(gflagsAlt_FIND_VERSION) list(APPEND find_package_args ${gflagsAlt_FIND_VERSION}) diff --git a/cpp/cmake_modules/Findjemalloc.cmake b/cpp/cmake_modules/FindjemallocAlt.cmake similarity index 55% rename from cpp/cmake_modules/Findjemalloc.cmake rename to cpp/cmake_modules/FindjemallocAlt.cmake index db30f71d251..49616425db4 100644 --- a/cpp/cmake_modules/Findjemalloc.cmake +++ b/cpp/cmake_modules/FindjemallocAlt.cmake @@ -14,31 +14,53 @@ # # Usage of this module as follows: # -# find_package(jemalloc) +# find_package(jemallocAlt) # # This module defines # jemalloc::jemalloc, target to use jemalloc +if(jemallocAlt_FOUND) + return() +endif() + +if(ARROW_PACKAGE_KIND STREQUAL "conan") + set(find_package_args "") + if(jemallocAlt_FIND_VERSION) + list(APPEND find_package_args ${jemallocAlt_FIND_VERSION}) + endif() + if(jemallocAlt_FIND_QUIETLY) + list(APPEND find_package_args QUIET) + endif() + if(jemallocAlt_FIND_REQUIRED) + list(APPEND find_package_args REQUIRED) + endif() + find_package(jemallocAlt NAMES jemalloc ${find_package_args}) + set(jemalloc_FOUND ${jemallocAlt_FOUND}) + if(jemallocAlt_FOUND) + return() + endif() +endif() + if(ARROW_JEMALLOC_USE_SHARED) - set(jemalloc_LIB_NAMES) + set(jemallocAlt_LIB_NAMES) if(CMAKE_IMPORT_LIBRARY_SUFFIX) - list(APPEND jemalloc_LIB_NAMES + list(APPEND jemallocAlt_LIB_NAMES "${CMAKE_IMPORT_LIBRARY_PREFIX}jemalloc${CMAKE_IMPORT_LIBRARY_SUFFIX}") endif() - list(APPEND jemalloc_LIB_NAMES + list(APPEND jemallocAlt_LIB_NAMES "${CMAKE_SHARED_LIBRARY_PREFIX}jemalloc${CMAKE_SHARED_LIBRARY_SUFFIX}") else() - set(jemalloc_LIB_NAMES + set(jemallocAlt_LIB_NAMES "${CMAKE_STATIC_LIBRARY_PREFIX}jemalloc${CMAKE_STATIC_LIBRARY_SUFFIX}") endif() if(jemalloc_ROOT) - find_library(jemalloc_LIB - NAMES ${jemallc_LIB_NAMES} - PATHS ${jemallc_ROOT} + find_library(jemallocAlt_LIB + NAMES ${jemallocAlt_LIB_NAMES} + PATHS ${jemalloc_ROOT} PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} NO_DEFAULT_PATH) - find_path(jemalloc_INCLUDE_DIR + find_path(jemallocAlt_INCLUDE_DIR NAMES jemalloc/jemalloc.h PATHS ${jemalloc_ROOT} NO_DEFAULT_PATH @@ -46,29 +68,29 @@ if(jemalloc_ROOT) else() find_package(PkgConfig QUIET) - pkg_check_modules(jemalloc_PC jemalloc) - if(jemalloc_PC_FOUND) - set(jemalloc_INCLUDE_DIR "${jemalloc_PC_INCLUDEDIR}") - list(APPEND jemalloc_PC_LIBRARY_DIRS "${jemalloc_PC_LIBDIR}") - find_library(jemalloc_LIB - NAMES ${jemalloc_LIB_NAMES} - PATHS ${jemalloc_PC_LIBRARY_DIRS} + pkg_check_modules(jemallocAlt_PC jemalloc) + if(jemallocAlt_PC_FOUND) + set(jemallocAlt_INCLUDE_DIR "${jemallocAlt_PC_INCLUDEDIR}") + list(APPEND jemallocAlt_PC_LIBRARY_DIRS "${jemallocAlt_PC_LIBDIR}") + find_library(jemallocAlt_LIB + NAMES ${jemallocAlt_LIB_NAMES} + PATHS ${jemallocAlt_PC_LIBRARY_DIRS} NO_DEFAULT_PATH PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) else() - find_library(jemalloc_LIB - NAMES ${jemalloc_LIB_NAMES} + find_library(jemallocAlt_LIB + NAMES ${jemallocAlt_LIB_NAMES} PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) - find_path(jemalloc_INCLUDE_DIR + find_path(jemallocAlt_INCLUDE_DIR NAMES jemalloc/jemalloc.h PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) endif() endif() -find_package_handle_standard_args(jemalloc REQUIRED_VARS jemalloc_LIB - jemalloc_INCLUDE_DIR) - -if(jemalloc_FOUND) +find_package_handle_standard_args(jemallocAlt REQUIRED_VARS jemallocAlt_LIB + jemallocAlt_INCLUDE_DIR) +set(jemalloc_FOUND ${jemallocAlt_FOUND}) +if(jemallocAlt_FOUND) if(NOT TARGET jemalloc::jemalloc) if(ARROW_JEMALLOC_USE_SHARED) 
add_library(jemalloc::jemalloc SHARED IMPORTED) @@ -76,8 +98,8 @@ if(jemalloc_FOUND) add_library(jemalloc::jemalloc STATIC IMPORTED) endif() set_target_properties(jemalloc::jemalloc - PROPERTIES IMPORTED_LOCATION "${jemalloc_LIB}" + PROPERTIES IMPORTED_LOCATION "${jemallocAlt_LIB}" INTERFACE_INCLUDE_DIRECTORIES - "${jemalloc_INCLUDE_DIR}") + "${jemallocAlt_INCLUDE_DIR}") endif() endif() diff --git a/cpp/cmake_modules/Findlibrados.cmake b/cpp/cmake_modules/Findlibrados.cmake index 695d73fae1c..b993dbff114 100644 --- a/cpp/cmake_modules/Findlibrados.cmake +++ b/cpp/cmake_modules/Findlibrados.cmake @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. +if(librados_FOUND) + return() +endif() + find_path(LIBRADOS_INCLUDE_DIR rados/librados.hpp) find_library(LIBRADOS_LIBRARY NAMES rados) diff --git a/cpp/cmake_modules/Findlz4Alt.cmake b/cpp/cmake_modules/Findlz4Alt.cmake index 186fec7e40a..77a22957f79 100644 --- a/cpp/cmake_modules/Findlz4Alt.cmake +++ b/cpp/cmake_modules/Findlz4Alt.cmake @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. +if(lz4Alt_FOUND) + return() +endif() + set(find_package_args) if(lz4Alt_FIND_VERSION) list(APPEND find_package_args ${lz4Alt_FIND_VERSION}) @@ -25,6 +29,10 @@ endif() find_package(lz4 ${find_package_args}) if(lz4_FOUND) set(lz4Alt_FOUND TRUE) + # Conan uses lz4::lz4 not LZ4::lz4 + if(NOT TARGET LZ4::lz4 AND TARGET lz4::lz4) + add_library(LZ4::lz4 ALIAS lz4::lz4) + endif() return() endif() @@ -89,9 +97,9 @@ endif() find_package_handle_standard_args(lz4Alt REQUIRED_VARS LZ4_LIB LZ4_INCLUDE_DIR) if(lz4Alt_FOUND) - if(NOT TARGET lz4::lz4) - add_library(lz4::lz4 UNKNOWN IMPORTED) - set_target_properties(lz4::lz4 + if(NOT TARGET LZ4::lz4) + add_library(LZ4::lz4 UNKNOWN IMPORTED) + set_target_properties(LZ4::lz4 PROPERTIES IMPORTED_LOCATION "${LZ4_LIB}" INTERFACE_INCLUDE_DIRECTORIES "${LZ4_INCLUDE_DIR}") endif() diff --git a/cpp/cmake_modules/Findre2Alt.cmake b/cpp/cmake_modules/Findre2Alt.cmake index f66e35cf046..1fe7a921f6b 100644 --- a/cpp/cmake_modules/Findre2Alt.cmake +++ b/cpp/cmake_modules/Findre2Alt.cmake @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. +if(re2Alt_FOUND) + return() +endif() + set(find_package_args) if(re2Alt_FIND_VERSION) list(APPEND find_package_args ${re2Alt_FIND_VERSION}) diff --git a/cpp/cmake_modules/Findutf8proc.cmake b/cpp/cmake_modules/Findutf8proc.cmake index 867361cd104..e3474140905 100644 --- a/cpp/cmake_modules/Findutf8proc.cmake +++ b/cpp/cmake_modules/Findutf8proc.cmake @@ -15,6 +15,28 @@ # specific language governing permissions and limitations # under the License. 
+if(utf8proc_FOUND) + return() +endif() + +if(ARROW_PACKAGE_KIND STREQUAL "vcpkg") + set(find_package_args "") + if(utf8proc_FIND_VERSION) + list(APPEND find_package_args ${utf8proc_FIND_VERSION}) + endif() + if(utf8proc_FIND_QUIETLY) + list(APPEND find_package_args QUIET) + endif() + if(utf8proc_FIND_REQUIRED) + list(APPEND find_package_args REQUIRED) + endif() + find_package(utf8proc NAMES unofficial-utf8proc ${find_package_args}) + if(utf8proc_FOUND) + add_library(utf8proc::utf8proc ALIAS utf8proc) + return() + endif() +endif() + function(extract_utf8proc_version) if(utf8proc_INCLUDE_DIR) file(READ "${utf8proc_INCLUDE_DIR}/utf8proc.h" UTF8PROC_H_CONTENT) diff --git a/cpp/cmake_modules/Findzstd.cmake b/cpp/cmake_modules/FindzstdAlt.cmake similarity index 59% rename from cpp/cmake_modules/Findzstd.cmake rename to cpp/cmake_modules/FindzstdAlt.cmake index 3fc14ec0d72..980cf265521 100644 --- a/cpp/cmake_modules/Findzstd.cmake +++ b/cpp/cmake_modules/FindzstdAlt.cmake @@ -15,6 +15,23 @@ # specific language governing permissions and limitations # under the License. +if(zstdAlt_FOUND) + return() +endif() + +set(find_package_args) +if(zstdAlt_FIND_VERSION) + list(APPEND find_package_args ${zstdAlt_FIND_VERSION}) +endif() +if(zstdAlt_FIND_QUIETLY) + list(APPEND find_package_args QUIET) +endif() +find_package(zstd ${find_package_args}) +if(zstd_FOUND) + set(zstdAlt_FOUND TRUE) + return() +endif() + if(MSVC AND NOT DEFINED ZSTD_MSVC_LIB_PREFIX) set(ZSTD_MSVC_LIB_PREFIX "lib") endif() @@ -60,6 +77,7 @@ else() find_package(PkgConfig QUIET) pkg_check_modules(ZSTD_PC libzstd) if(ZSTD_PC_FOUND) + set(zstdAlt_VERSION "${ZSTD_PC_VERSION}") set(ZSTD_INCLUDE_DIR "${ZSTD_PC_INCLUDEDIR}") list(APPEND ZSTD_PC_LIBRARY_DIRS "${ZSTD_PC_LIBDIR}") @@ -79,11 +97,46 @@ else() endif() endif() -find_package_handle_standard_args(zstd REQUIRED_VARS ZSTD_LIB ZSTD_INCLUDE_DIR) +if("${zstdAlt_VERSION}" STREQUAL "" AND ZSTD_INCLUDE_DIR) + file(READ "${ZSTD_INCLUDE_DIR}/zstd.h" ZSTD_H_CONTENT) + string(REGEX MATCH "#define ZSTD_VERSION_MAJOR +([0-9]+)" ZSTD_VERSION_MAJOR_DEFINITION + "${ZSTD_H_CONTENT}") + string(REGEX REPLACE "^.+ ([0-9]+)$" "\\1" ZSTD_VERSION_MAJOR + "${ZSTD_VERSION_MAJOR_DEFINITION}") + string(REGEX MATCH "#define ZSTD_VERSION_MINOR +([0-9]+)" ZSTD_VERSION_MINOR_DEFINITION + "${ZSTD_H_CONTENT}") + string(REGEX REPLACE "^.+ ([0-9]+)$" "\\1" ZSTD_VERSION_MINOR + "${ZSTD_VERSION_MINOR_DEFINITION}") + string(REGEX MATCH "#define ZSTD_VERSION_RELEASE +([0-9]+)" + ZSTD_VERSION_RELEASE_DEFINITION "${ZSTD_H_CONTENT}") + string(REGEX REPLACE "^.+ ([0-9]+)$" "\\1" ZSTD_VERSION_RELEASE + "${ZSTD_VERSION_RELEASE_DEFINITION}") + if("${ZSTD_VERSION_MAJOR}" STREQUAL "" + OR "${ZSTD_VERSION_MINOR}" STREQUAL "" + OR "${ZSTD_VERSION_RELEASE}" STREQUAL "") + set(zstdAlt_VERSION "0.0.0") + else() + set(zstdAlt_VERSION + "${ZSTD_VERSION_MAJOR}.${ZSTD_VERSION_MINOR}.${ZSTD_VERSION_RELEASE}") + endif() +endif() -if(zstd_FOUND) - add_library(zstd::libzstd UNKNOWN IMPORTED) - set_target_properties(zstd::libzstd +find_package_handle_standard_args( + zstdAlt + REQUIRED_VARS ZSTD_LIB ZSTD_INCLUDE_DIR + VERSION_VAR zstdAlt_VERSION) + +if(zstdAlt_FOUND) + if(ARROW_ZSTD_USE_SHARED) + set(zstd_TARGET zstd::libzstd_shared) + add_library(${zstd_TARGET} SHARED IMPORTED) + else() + set(zstd_TARGET zstd::libzstd_static) + add_library(${zstd_TARGET} STATIC IMPORTED) + endif() + set_target_properties(${zstd_TARGET} PROPERTIES IMPORTED_LOCATION "${ZSTD_LIB}" INTERFACE_INCLUDE_DIRECTORIES "${ZSTD_INCLUDE_DIR}") + message(STATUS "Zstandard 
library: ${ZSTD_LIB}") + message(STATUS "Zstandard include directory: ${ZSTD_INCLUDE_DIR}") endif() diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 0a40ebe48a9..b8ca029c65e 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -24,16 +24,20 @@ include(CheckCXXSourceCompiles) message(STATUS "System processor: ${CMAKE_SYSTEM_PROCESSOR}") if(NOT DEFINED ARROW_CPU_FLAG) - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64|arm64") - set(ARROW_CPU_FLAG "armv8") - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "armv7") - set(ARROW_CPU_FLAG "armv7") + if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64|X86|x86|i[3456]86|x64") + set(ARROW_CPU_FLAG "x86") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64|arm64") + set(ARROW_CPU_FLAG "aarch64") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm$|armv[4-7]") + set(ARROW_CPU_FLAG "aarch32") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "powerpc|ppc") set(ARROW_CPU_FLAG "ppc") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x") set(ARROW_CPU_FLAG "s390x") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64") + set(ARROW_CPU_FLAG "riscv64") else() - set(ARROW_CPU_FLAG "x86") + message(FATAL_ERROR "Unknown system processor") endif() endif() @@ -104,10 +108,10 @@ elseif(ARROW_CPU_FLAG STREQUAL "ppc") if(ARROW_SIMD_LEVEL STREQUAL "DEFAULT") set(ARROW_SIMD_LEVEL "NONE") endif() -elseif(ARROW_CPU_FLAG STREQUAL "armv8") +elseif(ARROW_CPU_FLAG STREQUAL "aarch64") # Arm64 compiler flags, gcc/clang only - set(ARROW_ARMV8_ARCH_FLAG "-march=${ARROW_ARMV8_ARCH}") - check_cxx_compiler_flag(${ARROW_ARMV8_ARCH_FLAG} CXX_SUPPORTS_ARMV8_ARCH) + set(ARROW_ARMV8_MARCH "armv8-a") + check_cxx_compiler_flag("-march=${ARROW_ARMV8_MARCH}+sve" CXX_SUPPORTS_SVE) if(ARROW_SIMD_LEVEL STREQUAL "DEFAULT") set(ARROW_SIMD_LEVEL "NEON") endif() @@ -118,12 +122,14 @@ if(NOT DEFINED CMAKE_C_STANDARD) set(CMAKE_C_STANDARD 11) endif() -# This ensures that things like c++11 get passed correctly +# This ensures that things like c++17 get passed correctly if(NOT DEFINED CMAKE_CXX_STANDARD) - set(CMAKE_CXX_STANDARD 11) + set(CMAKE_CXX_STANDARD 17) +elseif(${CMAKE_CXX_STANDARD} VERSION_LESS 17) + message(FATAL_ERROR "Cannot set a CMAKE_CXX_STANDARD smaller than 17") endif() -# We require a C++11 compliant compiler +# We require a C++17 compliant compiler set(CMAKE_CXX_STANDARD_REQUIRED ON) # ARROW-6848: Do not use GNU (or other CXX) extensions @@ -201,6 +207,24 @@ if(WIN32) # * https://developercommunity.visualstudio.com/content/problem/1249671/stdc17-generates-warning-compiling-windowsh.html set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /wd5105") + if(ARROW_USE_CCACHE) + foreach(c_flag + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_MINSIZEREL + CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS + CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_DEBUG + CMAKE_C_FLAGS_MINSIZEREL + CMAKE_C_FLAGS_RELWITHDEBINFO) + # ccache doesn't work with /Zi. 
+ # See also: https://github.com/ccache/ccache/issues/1040 + string(REPLACE "/Zi" "/Z7" ${c_flag} "${${c_flag}}") + endforeach() + endif() + if(ARROW_USE_STATIC_CRT) foreach(c_flag CMAKE_CXX_FLAGS @@ -213,7 +237,7 @@ if(WIN32) CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) - string(REPLACE "/MD" "-MT" ${c_flag} "${${c_flag}}") + string(REPLACE "/MD" "/MT" ${c_flag} "${${c_flag}}") endforeach() endif() @@ -259,13 +283,13 @@ string(TOUPPER ${BUILD_WARNING_LEVEL} BUILD_WARNING_LEVEL) message(STATUS "Arrow build warning level: ${BUILD_WARNING_LEVEL}") macro(arrow_add_werror_if_debug) - if("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") - # Treat all compiler warnings as errors - if(MSVC) - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /WX") - else() - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Werror") - endif() + # Treat all compiler warnings as errors + if(MSVC) + string(APPEND CMAKE_C_FLAGS_DEBUG " /WX") + string(APPEND CMAKE_CXX_FLAGS_DEBUG " /WX") + else() + string(APPEND CMAKE_C_FLAGS_DEBUG " -Werror") + string(APPEND CMAKE_CXX_FLAGS_DEBUG " -Werror") endif() endmacro() @@ -376,22 +400,13 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set(CXX_ONLY_FLAGS "${CXX_ONLY_FLAGS} -Wno-noexcept-type") endif() - if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.2") - # Disabling semantic interposition allows faster calling conventions - # when calling global functions internally, and can also help inlining. - # See https://stackoverflow.com/questions/35745543/new-option-in-gcc-5-3-fno-semantic-interposition - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -fno-semantic-interposition") - endif() + # Disabling semantic interposition allows faster calling conventions + # when calling global functions internally, and can also help inlining. + # See https://stackoverflow.com/questions/35745543/new-option-in-gcc-5-3-fno-semantic-interposition + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -fno-semantic-interposition") - if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9") - # Add colors when paired with ninja - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdiagnostics-color=always") - endif() - - if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6.0") - # Work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43407 - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-attributes") - endif() + # Add colors when paired with ninja + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdiagnostics-color=always") if(CMAKE_UNITY_BUILD) # Work around issue similar to https://bugs.webkit.org/show_bug.cgi?id=176869 @@ -418,11 +433,11 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STRE # Don't complain about optimization passes that were not possible set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-pass-failed") - if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") - # Depending on the default OSX_DEPLOYMENT_TARGET (< 10.9), libstdc++ may be - # the default standard library which does not support C++11. libc++ is the - # default from 10.9 onward. - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -stdlib=libc++") + # Avoid clang / libc++ error about C++17 aligned allocation on macOS. + # See https://chromium.googlesource.com/chromium/src/+/eee44569858fc650b635779c4e34be5cb0c73186%5E%21/#F0 + # for details. 
+ if(APPLE) + set(CXX_ONLY_FLAGS "${CXX_ONLY_FLAGS} -fno-aligned-new") endif() endif() @@ -469,33 +484,28 @@ if(ARROW_CPU_FLAG STREQUAL "ppc") endif() endif() -if(ARROW_CPU_FLAG STREQUAL "armv8") - if(ARROW_SIMD_LEVEL STREQUAL "NEON") +if(ARROW_CPU_FLAG STREQUAL "aarch64") + if(ARROW_SIMD_LEVEL MATCHES "NEON|SVE[0-9]*") set(ARROW_HAVE_NEON ON) - - if(NOT CXX_SUPPORTS_ARMV8_ARCH) - message(FATAL_ERROR "Unsupported arch flag: ${ARROW_ARMV8_ARCH_FLAG}.") - endif() - if(ARROW_ARMV8_ARCH_FLAG MATCHES "native") - message(FATAL_ERROR "native arch not allowed, please specify arch explicitly.") - endif() - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_ARMV8_ARCH_FLAG}") - add_definitions(-DARROW_HAVE_NEON) - - if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS - "5.4") - message(WARNING "Disable Armv8 CRC and Crypto as compiler doesn't support them well." - ) - else() - if(ARROW_ARMV8_ARCH_FLAG MATCHES "\\+crypto") - add_definitions(-DARROW_HAVE_ARMV8_CRYPTO) + if(ARROW_SIMD_LEVEL MATCHES "SVE[0-9]*") + if(NOT CXX_SUPPORTS_SVE) + message(FATAL_ERROR "SVE required but compiler doesn't support it.") endif() - # armv8.1+ implies crc support - if(ARROW_ARMV8_ARCH_FLAG MATCHES "armv8\\.[1-9]|\\+crc") - add_definitions(-DARROW_HAVE_ARMV8_CRC) + # -march=armv8-a+sve + set(ARROW_ARMV8_MARCH "${ARROW_ARMV8_MARCH}+sve") + string(REGEX MATCH "[0-9]+" SVE_VECTOR_BITS ${ARROW_SIMD_LEVEL}) + if(SVE_VECTOR_BITS) + set(ARROW_HAVE_SVE${SVE_VECTOR_BITS} ON) + add_definitions(-DARROW_HAVE_SVE${SVE_VECTOR_BITS}) + # -msve-vector-bits=256 + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -msve-vector-bits=${SVE_VECTOR_BITS}") + else() + set(ARROW_HAVE_SVE_SIZELESS ON) + add_definitions(-DARROW_HAVE_SVE_SIZELSS) endif() endif() + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=${ARROW_ARMV8_MARCH}") elseif(NOT ARROW_SIMD_LEVEL STREQUAL "NONE") message(WARNING "ARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL} not supported by Arm.") endif() @@ -594,57 +604,42 @@ endif() # For all builds: # For CMAKE_BUILD_TYPE=Debug # -ggdb: Enable gdb debugging -# For CMAKE_BUILD_TYPE=FastDebug -# Same as DEBUG, except with some optimizations on. # For CMAKE_BUILD_TYPE=Release -# -O3: Enable all compiler optimizations -# Debug symbols are stripped for reduced binary size. Add -# -DARROW_CXXFLAGS="-g" to add them +# -O2 (not -O3): Enable compiler optimizations +# Debug symbols are stripped for reduced binary size. +# For CMAKE_BUILD_TYPE=RelWithDebInfo +# Same as Release, except with debug symbols enabled. 
+ if(NOT MSVC) - if(ARROW_GGDB_DEBUG) - set(ARROW_DEBUG_SYMBOL_TYPE "gdb") - set(C_FLAGS_DEBUG "-g${ARROW_DEBUG_SYMBOL_TYPE} -O0") - set(C_FLAGS_FASTDEBUG "-g${ARROW_DEBUG_SYMBOL_TYPE} -O1") - set(CXX_FLAGS_DEBUG "-g${ARROW_DEBUG_SYMBOL_TYPE} -O0") - set(CXX_FLAGS_FASTDEBUG "-g${ARROW_DEBUG_SYMBOL_TYPE} -O1") - else() - set(C_FLAGS_DEBUG "-g -O0") - set(C_FLAGS_FASTDEBUG "-g -O1") - set(CXX_FLAGS_DEBUG "-g -O0") - set(CXX_FLAGS_FASTDEBUG "-g -O1") + set(C_RELEASE_FLAGS "") + if(CMAKE_C_FLAGS_RELEASE MATCHES "-O3") + string(APPEND C_RELEASE_FLAGS " -O2") + endif() + set(CXX_RELEASE_FLAGS "") + if(CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3") + string(APPEND CXX_RELEASE_FLAGS " -O2") + endif() + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + string(APPEND C_RELEASE_FLAGS " -ftree-vectorize") + string(APPEND CXX_RELEASE_FLAGS " -ftree-vectorize") endif() - set(C_FLAGS_RELEASE "-O3 -DNDEBUG") - set(CXX_FLAGS_RELEASE "-O3 -DNDEBUG") -endif() + set(DEBUG_FLAGS "") + if(MSVC) + string(APPEND DEBUG_FLAGS " /Od") + else() + string(APPEND DEBUG_FLAGS " -O0") + endif() + if(ARROW_GGDB_DEBUG) + string(APPEND DEBUG_FLAGS " -ggdb") + endif() -set(C_FLAGS_PROFILE_GEN "${CXX_FLAGS_RELEASE} -fprofile-generate") -set(C_FLAGS_PROFILE_BUILD "${CXX_FLAGS_RELEASE} -fprofile-use") -set(CXX_FLAGS_PROFILE_GEN "${CXX_FLAGS_RELEASE} -fprofile-generate") -set(CXX_FLAGS_PROFILE_BUILD "${CXX_FLAGS_RELEASE} -fprofile-use") - -# Set compile flags based on the build type. -message(STATUS "Configured for ${CMAKE_BUILD_TYPE} build (set with cmake -DCMAKE_BUILD_TYPE={release,debug,...})" -) -if("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_FLAGS_DEBUG}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_DEBUG}") -elseif("${CMAKE_BUILD_TYPE}" STREQUAL "RELWITHDEBINFO") - -elseif("${CMAKE_BUILD_TYPE}" STREQUAL "FASTDEBUG") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_FLAGS_FASTDEBUG}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_FASTDEBUG}") -elseif("${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_FLAGS_RELEASE}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_RELEASE}") -elseif("${CMAKE_BUILD_TYPE}" STREQUAL "PROFILE_GEN") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_FLAGS_PROFILE_GEN}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_PROFILE_GEN}") -elseif("${CMAKE_BUILD_TYPE}" STREQUAL "PROFILE_BUILD") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_FLAGS_PROFILE_BUILD}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_PROFILE_BUILD}") -else() - message(FATAL_ERROR "Unknown build type: ${CMAKE_BUILD_TYPE}") + string(APPEND CMAKE_C_FLAGS_RELEASE "${C_RELEASE_FLAGS}") + string(APPEND CMAKE_CXX_FLAGS_RELEASE "${CXX_RELEASE_FLAGS}") + string(APPEND CMAKE_C_FLAGS_DEBUG "${DEBUG_FLAGS}") + string(APPEND CMAKE_CXX_FLAGS_DEBUG "${DEBUG_FLAGS}") + string(APPEND CMAKE_C_FLAGS_RELWITHDEBINFO "${C_RELEASE_FLAGS} ${DEBUG_FLAGS}") + string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CXX_RELEASE_FLAGS} ${DEBUG_FLAGS}") endif() message(STATUS "Build Type: ${CMAKE_BUILD_TYPE}") diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 86ad0583531..3eda538fb2e 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -40,12 +40,6 @@ set(ARROW_RE2_LINKAGE "static" CACHE STRING "How to link the re2 library. 
static|shared (default static)") -if(ARROW_PROTOBUF_USE_SHARED) - set(Protobuf_USE_STATIC_LIBS OFF) -else() - set(Protobuf_USE_STATIC_LIBS ON) -endif() - # ---------------------------------------------------------------------- # Resolve the dependencies @@ -216,18 +210,25 @@ endmacro() # Find modules are needed by the consumer in case of a static build, or if the # linkage is PUBLIC or INTERFACE. -macro(provide_find_module PACKAGE_NAME) +macro(provide_find_module PACKAGE_NAME ARROW_CMAKE_PACKAGE_NAME) set(module_ "${CMAKE_SOURCE_DIR}/cmake_modules/Find${PACKAGE_NAME}.cmake") if(EXISTS "${module_}") - message(STATUS "Providing CMake module for ${PACKAGE_NAME}") - install(FILES "${module_}" DESTINATION "${ARROW_CMAKE_DIR}") + message(STATUS "Providing CMake module for ${PACKAGE_NAME} as part of ${ARROW_CMAKE_PACKAGE_NAME} CMake package" + ) + install(FILES "${module_}" + DESTINATION "${ARROW_CMAKE_DIR}/${ARROW_CMAKE_PACKAGE_NAME}") endif() unset(module_) endmacro() macro(resolve_dependency DEPENDENCY_NAME) set(options) - set(one_value_args HAVE_ALT IS_RUNTIME_DEPENDENCY REQUIRED_VERSION USE_CONFIG) + set(one_value_args + FORCE_ANY_NEWER_VERSION + HAVE_ALT + IS_RUNTIME_DEPENDENCY + REQUIRED_VERSION + USE_CONFIG) set(multi_value_args COMPONENTS PC_PACKAGE_NAMES) cmake_parse_arguments(ARG "${options}" @@ -247,7 +248,7 @@ macro(resolve_dependency DEPENDENCY_NAME) set(PACKAGE_NAME ${DEPENDENCY_NAME}) endif() set(FIND_PACKAGE_ARGUMENTS ${PACKAGE_NAME}) - if(ARG_REQUIRED_VERSION) + if(ARG_REQUIRED_VERSION AND NOT ARG_FORCE_ANY_NEWER_VERSION) list(APPEND FIND_PACKAGE_ARGUMENTS ${ARG_REQUIRED_VERSION}) endif() if(ARG_USE_CONFIG) @@ -258,7 +259,16 @@ macro(resolve_dependency DEPENDENCY_NAME) endif() if(${DEPENDENCY_NAME}_SOURCE STREQUAL "AUTO") find_package(${FIND_PACKAGE_ARGUMENTS}) - if(${${PACKAGE_NAME}_FOUND}) + set(COMPATIBLE ${${PACKAGE_NAME}_FOUND}) + if(COMPATIBLE + AND ARG_FORCE_ANY_NEWER_VERSION + AND ARG_REQUIRED_VERSION) + if(${${PACKAGE_NAME}_VERSION} VERSION_LESS ${ARG_REQUIRED_VERSION}) + message(DEBUG "Couldn't find ${DEPENDENCY_NAME} >= ${ARG_REQUIRED_VERSION}") + set(COMPATIBLE FALSE) + endif() + endif() + if(COMPATIBLE) set(${DEPENDENCY_NAME}_SOURCE "SYSTEM") else() build_dependency(${DEPENDENCY_NAME}) @@ -268,9 +278,14 @@ macro(resolve_dependency DEPENDENCY_NAME) build_dependency(${DEPENDENCY_NAME}) elseif(${DEPENDENCY_NAME}_SOURCE STREQUAL "SYSTEM") find_package(${FIND_PACKAGE_ARGUMENTS} REQUIRED) + if(ARG_FORCE_ANY_NEWER_VERSION AND ARG_REQUIRED_VERSION) + if(${${PACKAGE_NAME}_VERSION} VERSION_LESS ${ARG_REQUIRED_VERSION}) + message(FATAL_ERROR "Couldn't find ${DEPENDENCY_NAME} >= ${ARG_REQUIRED_VERSION}") + endif() + endif() endif() if(${DEPENDENCY_NAME}_SOURCE STREQUAL "SYSTEM" AND ARG_IS_RUNTIME_DEPENDENCY) - provide_find_module(${PACKAGE_NAME}) + provide_find_module(${PACKAGE_NAME} "Arrow") list(APPEND ARROW_SYSTEM_DEPENDENCIES ${PACKAGE_NAME}) find_package(PkgConfig QUIET) foreach(ARG_PC_PACKAGE_NAME ${ARG_PC_PACKAGE_NAMES}) @@ -280,7 +295,12 @@ macro(resolve_dependency DEPENDENCY_NAME) NO_CMAKE_ENVIRONMENT_PATH QUIET) if(${${ARG_PC_PACKAGE_NAME}_PC_FOUND}) + message(STATUS "Using pkg-config package for ${ARG_PC_PACKAGE_NAME} for static link" + ) string(APPEND ARROW_PC_REQUIRES_PRIVATE " ${ARG_PC_PACKAGE_NAME}") + else() + message(STATUS "pkg-config package for ${ARG_PC_PACKAGE_NAME} for static link isn't found" + ) endif() endforeach() endif() @@ -632,18 +652,9 @@ endif() if(DEFINED ENV{ARROW_SNAPPY_URL}) set(SNAPPY_SOURCE_URL "$ENV{ARROW_SNAPPY_URL}") else() - 
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS - "4.9") - # There is a bug in GCC < 4.9 with Snappy 1.1.9, so revert to 1.1.8 "SNAPPY_OLD" for those (ARROW-14661) - set_urls(SNAPPY_SOURCE_URL - "https://github.com/google/snappy/archive/${ARROW_SNAPPY_OLD_BUILD_VERSION}.tar.gz" - "${THIRDPARTY_MIRROR_URL}/snappy-${ARROW_SNAPPY_OLD_BUILD_VERSION}.tar.gz") - set(ARROW_SNAPPY_BUILD_SHA256_CHECKSUM ${ARROW_SNAPPY_OLD_BUILD_SHA256_CHECKSUM}) - else() - set_urls(SNAPPY_SOURCE_URL - "https://github.com/google/snappy/archive/${ARROW_SNAPPY_BUILD_VERSION}.tar.gz" - "${THIRDPARTY_MIRROR_URL}/snappy-${ARROW_SNAPPY_BUILD_VERSION}.tar.gz") - endif() + set_urls(SNAPPY_SOURCE_URL + "https://github.com/google/snappy/archive/${ARROW_SNAPPY_BUILD_VERSION}.tar.gz" + "${THIRDPARTY_MIRROR_URL}/snappy-${ARROW_SNAPPY_BUILD_VERSION}.tar.gz") endif() if(DEFINED ENV{ARROW_SUBSTRAIT_URL}) @@ -716,16 +727,25 @@ endif() # ---------------------------------------------------------------------- # ExternalProject options -set(EP_CXX_FLAGS - "${CMAKE_CXX_COMPILER_ARG1} ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}}" -) -set(EP_C_FLAGS - "${CMAKE_C_COMPILER_ARG1} ${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}}") +set(EP_LIST_SEPARATOR "|") +set(EP_COMMON_OPTIONS LIST_SEPARATOR ${EP_LIST_SEPARATOR}) +set(EP_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +set(EP_C_FLAGS "${CMAKE_C_FLAGS}") if(NOT MSVC_TOOLCHAIN) # Set -fPIC on all external projects - set(EP_CXX_FLAGS "${EP_CXX_FLAGS} -fPIC") - set(EP_C_FLAGS "${EP_C_FLAGS} -fPIC") + string(APPEND EP_CXX_FLAGS " -fPIC") + string(APPEND EP_C_FLAGS " -fPIC") +endif() + +set(EP_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") +set(EP_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") +if(MSVC_TOOLCHAIN) + string(REPLACE "/WX" "" EP_CXX_FLAGS_DEBUG "${EP_CXX_FLAGS_DEBUG}") + string(REPLACE "/WX" "" EP_C_FLAGS_DEBUG "${EP_C_FLAGS_DEBUG}") +else() + string(APPEND EP_CXX_FLAGS_DEBUG " -Wno-error") + string(APPEND EP_C_FLAGS_DEBUG " -Wno-error") endif() # CC/CXX environment variables are captured on the first invocation of the @@ -733,15 +753,31 @@ endif() # directory. This leads to issues if the variables are exported in a subshell # and the invocation of make/ninja is in distinct subshell without the same # environment (CC/CXX). 
-set(EP_COMMON_TOOLCHAIN -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}) +set(EP_C_COMPILER "${CMAKE_C_COMPILER}") +if(NOT CMAKE_VERSION VERSION_LESS 3.19) + if(CMAKE_C_COMPILER_ARG1) + separate_arguments(EP_C_COMPILER_ARGS NATIVE_COMMAND "${CMAKE_C_COMPILER_ARG1}") + list(APPEND EP_C_COMPILER ${EP_C_COMPILER_ARGS}) + endif() + string(REPLACE ";" ${EP_LIST_SEPARATOR} EP_C_COMPILER "${EP_C_COMPILER}") +endif() +set(EP_CXX_COMPILER "${CMAKE_CXX_COMPILER}") +if(NOT CMAKE_VERSION VERSION_LESS 3.19) + if(CMAKE_CXX_COMPILER_ARG1) + separate_arguments(EP_CXX_COMPILER_ARGS NATIVE_COMMAND "${CMAKE_CXX_COMPILER_ARG1}") + list(APPEND EP_CXX_COMPILER ${EP_CXX_COMPILER_ARGS}) + endif() + string(REPLACE ";" ${EP_LIST_SEPARATOR} EP_CXX_COMPILER "${EP_CXX_COMPILER}") +endif() +set(EP_COMMON_TOOLCHAIN "-DCMAKE_C_COMPILER=${EP_C_COMPILER}" + "-DCMAKE_CXX_COMPILER=${EP_CXX_COMPILER}") if(CMAKE_AR) - set(EP_COMMON_TOOLCHAIN ${EP_COMMON_TOOLCHAIN} -DCMAKE_AR=${CMAKE_AR}) + list(APPEND EP_COMMON_TOOLCHAIN -DCMAKE_AR=${CMAKE_AR}) endif() if(CMAKE_RANLIB) - set(EP_COMMON_TOOLCHAIN ${EP_COMMON_TOOLCHAIN} -DCMAKE_RANLIB=${CMAKE_RANLIB}) + list(APPEND EP_COMMON_TOOLCHAIN -DCMAKE_RANLIB=${CMAKE_RANLIB}) endif() # External projects are still able to override the following declarations. @@ -750,15 +786,24 @@ endif() # argument. set(EP_COMMON_CMAKE_ARGS ${EP_COMMON_TOOLCHAIN} - ${EP_COMMON_CMAKE_ARGS} + -DBUILD_SHARED_LIBS=OFF + -DBUILD_STATIC_LIBS=ON + -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DCMAKE_C_FLAGS=${EP_C_FLAGS} - -DCMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_C_FLAGS} -DCMAKE_CXX_FLAGS=${EP_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_DEBUG=${EP_CXX_FLAGS_DEBUG} + -DCMAKE_CXX_FLAGS_MISIZEREL=${CMAKE_CXX_FLAGS_MINSIZEREL} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_RELWITHDEBINFO=${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} + -DCMAKE_C_FLAGS=${EP_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${EP_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_MISIZEREL=${CMAKE_C_FLAGS_MINSIZEREL} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_C_FLAGS_RELWITHDEBINFO=${CMAKE_C_FLAGS_RELWITHDEBINFO} -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=${CMAKE_EXPORT_NO_PACKAGE_REGISTRY} -DCMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY=${CMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY} + -DCMAKE_INSTALL_LIBDIR=lib -DCMAKE_VERBOSE_MAKEFILE=${CMAKE_VERBOSE_MAKEFILE}) # Enable s/ccache if set by parent. 
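The EP_LIST_SEPARATOR handling added above exists so that a multi-word compiler setting, for example CC="ccache gcc" where CMake records the trailing word in CMAKE_C_COMPILER_ARG1, survives being passed through an external project's CMAKE_ARGS. A minimal sketch of the mechanism, assuming a hypothetical example_ep project and placeholder URL, neither of which is part of this patch:

# Illustrative sketch only: forwarding a multi-word compiler to ExternalProject.
include(ExternalProject)

set(EP_LIST_SEPARATOR "|")
set(EP_C_COMPILER "${CMAKE_C_COMPILER}")
if(CMAKE_C_COMPILER_ARG1)
  # Turn the trailing words (e.g. "gcc" after a "ccache" launcher) into list items.
  separate_arguments(ep_c_compiler_args NATIVE_COMMAND "${CMAKE_C_COMPILER_ARG1}")
  list(APPEND EP_C_COMPILER ${ep_c_compiler_args})
endif()
# Join the list with "|" so the ";" separator is not mangled on the command line...
string(REPLACE ";" "${EP_LIST_SEPARATOR}" EP_C_COMPILER "${EP_C_COMPILER}")

externalproject_add(example_ep
                    # ...and have ExternalProject turn "|" back into ";" here.
                    LIST_SEPARATOR "${EP_LIST_SEPARATOR}"
                    URL "https://example.com/example-1.0.tar.gz"
                    CMAKE_ARGS "-DCMAKE_C_COMPILER=${EP_C_COMPILER}")

Passing a semicolon-separated list as CMAKE_C_COMPILER/CMAKE_CXX_COMPILER requires a reasonably recent CMake, which is why the patch guards the ARG1 handling with a CMAKE_VERSION check against 3.19.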
@@ -769,20 +814,20 @@ if(CMAKE_C_COMPILER_LAUNCHER AND CMAKE_CXX_COMPILER_LAUNCHER) endif() if(NOT ARROW_VERBOSE_THIRDPARTY_BUILD) - set(EP_LOG_OPTIONS - LOG_CONFIGURE - 1 - LOG_BUILD - 1 - LOG_INSTALL - 1 - LOG_DOWNLOAD - 1 - LOG_OUTPUT_ON_FAILURE - 1) + list(APPEND + EP_COMMON_OPTIONS + LOG_CONFIGURE + 1 + LOG_BUILD + 1 + LOG_INSTALL + 1 + LOG_DOWNLOAD + 1 + LOG_OUTPUT_ON_FAILURE + 1) set(Boost_DEBUG FALSE) else() - set(EP_LOG_OPTIONS) set(Boost_DEBUG TRUE) endif() @@ -888,18 +933,19 @@ macro(build_boost) "${Boost_INCLUDE_DIR}") externalproject_add(boost_ep + ${EP_COMMON_OPTIONS} URL ${BOOST_SOURCE_URL} URL_HASH "SHA256=${ARROW_BOOST_BUILD_SHA256_CHECKSUM}" BUILD_BYPRODUCTS ${BOOST_BUILD_PRODUCTS} BUILD_IN_SOURCE 1 CONFIGURE_COMMAND ${BOOST_CONFIGURE_COMMAND} BUILD_COMMAND ${BOOST_BUILD_COMMAND} - INSTALL_COMMAND "" ${EP_LOG_OPTIONS}) + INSTALL_COMMAND "") add_dependencies(Boost::system boost_ep) add_dependencies(Boost::filesystem boost_ep) else() externalproject_add(boost_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} BUILD_COMMAND "" CONFIGURE_COMMAND "" INSTALL_COMMAND "" @@ -1019,12 +1065,16 @@ if(ARROW_USE_BOOST) # Find static boost headers and libs set(Boost_USE_STATIC_LIBS ON) endif() + if(ARROW_BOOST_REQUIRE_LIBRARY) + set(ARROW_BOOST_COMPONENTS system filesystem) + else() + set(ARROW_BOOST_COMPONENTS) + endif() resolve_dependency(Boost REQUIRED_VERSION ${ARROW_BOOST_REQUIRED_VERSION} COMPONENTS - system - filesystem + ${ARROW_BOOST_COMPONENTS} IS_RUNTIME_DEPENDENCY # libarrow.so doesn't depend on libboost*. FALSE) @@ -1083,6 +1133,7 @@ endif() macro(find_curl) if(NOT TARGET CURL::libcurl) find_package(CURL REQUIRED) + list(APPEND ARROW_SYSTEM_DEPENDENCIES CURL) if(NOT TARGET CURL::libcurl) # For CMake 3.11 or older add_library(CURL::libcurl UNKNOWN IMPORTED) @@ -1106,14 +1157,11 @@ macro(build_snappy) ) set(SNAPPY_CMAKE_ARGS - ${EP_COMMON_CMAKE_ARGS} - -DCMAKE_INSTALL_LIBDIR=lib - -DSNAPPY_BUILD_TESTS=OFF - -DSNAPPY_BUILD_BENCHMARKS=OFF + ${EP_COMMON_CMAKE_ARGS} -DSNAPPY_BUILD_TESTS=OFF -DSNAPPY_BUILD_BENCHMARKS=OFF "-DCMAKE_INSTALL_PREFIX=${SNAPPY_PREFIX}") externalproject_add(snappy_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} BUILD_IN_SOURCE 1 INSTALL_DIR ${SNAPPY_PREFIX} URL ${SNAPPY_SOURCE_URL} @@ -1149,6 +1197,9 @@ if(ARROW_WITH_SNAPPY) if(NOT SNAPPY_LIB) get_target_property(SNAPPY_LIB ${Snappy_TARGET} IMPORTED_LOCATION_RELEASE) endif() + if(NOT SNAPPY_LIB) + get_target_property(SNAPPY_LIB ${Snappy_TARGET} IMPORTED_LOCATION_NOCONFIG) + endif() if(NOT SNAPPY_LIB) get_target_property(SNAPPY_LIB ${Snappy_TARGET} IMPORTED_LOCATION) endif() @@ -1164,27 +1215,25 @@ macro(build_brotli) message(STATUS "Building brotli from source") set(BROTLI_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/brotli_ep/src/brotli_ep-install") set(BROTLI_INCLUDE_DIR "${BROTLI_PREFIX}/include") - set(BROTLI_LIB_DIR lib) set(BROTLI_STATIC_LIBRARY_ENC - "${BROTLI_PREFIX}/${BROTLI_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}brotlienc-static${CMAKE_STATIC_LIBRARY_SUFFIX}" + "${BROTLI_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}brotlienc-static${CMAKE_STATIC_LIBRARY_SUFFIX}" ) set(BROTLI_STATIC_LIBRARY_DEC - "${BROTLI_PREFIX}/${BROTLI_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}brotlidec-static${CMAKE_STATIC_LIBRARY_SUFFIX}" + "${BROTLI_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}brotlidec-static${CMAKE_STATIC_LIBRARY_SUFFIX}" ) set(BROTLI_STATIC_LIBRARY_COMMON - "${BROTLI_PREFIX}/${BROTLI_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}brotlicommon-static${CMAKE_STATIC_LIBRARY_SUFFIX}" + 
"${BROTLI_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}brotlicommon-static${CMAKE_STATIC_LIBRARY_SUFFIX}" ) - set(BROTLI_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${BROTLI_PREFIX}" - -DCMAKE_INSTALL_LIBDIR=${BROTLI_LIB_DIR}) + set(BROTLI_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${BROTLI_PREFIX}") externalproject_add(brotli_ep + ${EP_COMMON_OPTIONS} URL ${BROTLI_SOURCE_URL} URL_HASH "SHA256=${ARROW_BROTLI_BUILD_SHA256_CHECKSUM}" BUILD_BYPRODUCTS "${BROTLI_STATIC_LIBRARY_ENC}" "${BROTLI_STATIC_LIBRARY_DEC}" "${BROTLI_STATIC_LIBRARY_COMMON}" ${BROTLI_BUILD_BYPRODUCTS} - ${EP_LOG_OPTIONS} CMAKE_ARGS ${BROTLI_CMAKE_ARGS} STEP_TARGETS headers_copy) @@ -1217,59 +1266,35 @@ macro(build_brotli) endmacro() if(ARROW_WITH_BROTLI) - resolve_dependency(Brotli PC_PACKAGE_NAMES libbrotlidec libbrotlienc) + resolve_dependency(Brotli + HAVE_ALT + TRUE + PC_PACKAGE_NAMES + libbrotlidec + libbrotlienc) endif() if(PARQUET_REQUIRE_ENCRYPTION AND NOT ARROW_PARQUET) set(PARQUET_REQUIRE_ENCRYPTION OFF) endif() set(ARROW_OPENSSL_REQUIRED_VERSION "1.0.2") -if(BREW_BIN AND NOT OPENSSL_ROOT_DIR) - execute_process(COMMAND ${BREW_BIN} --prefix "openssl@1.1" - OUTPUT_VARIABLE OPENSSL11_BREW_PREFIX - OUTPUT_STRIP_TRAILING_WHITESPACE) - if(OPENSSL11_BREW_PREFIX) - set(OPENSSL_ROOT_DIR ${OPENSSL11_BREW_PREFIX}) - else() - execute_process(COMMAND ${BREW_BIN} --prefix "openssl" - OUTPUT_VARIABLE OPENSSL_BREW_PREFIX - OUTPUT_STRIP_TRAILING_WHITESPACE) - if(OPENSSL_BREW_PREFIX) - set(OPENSSL_ROOT_DIR ${OPENSSL_BREW_PREFIX}) - endif() - endif() -endif() - set(ARROW_USE_OPENSSL OFF) if(PARQUET_REQUIRE_ENCRYPTION OR ARROW_FLIGHT - OR ARROW_S3) - # OpenSSL is required - if(ARROW_OPENSSL_USE_SHARED) - # Find shared OpenSSL libraries. - set(OpenSSL_USE_STATIC_LIBS OFF) - # Seems that different envs capitalize this differently? - set(OPENSSL_USE_STATIC_LIBS OFF) - set(BUILD_SHARED_LIBS_KEEP ${BUILD_SHARED_LIBS}) - set(BUILD_SHARED_LIBS ON) - - find_package(OpenSSL ${ARROW_OPENSSL_REQUIRED_VERSION} REQUIRED) - set(BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS_KEEP}) - unset(BUILD_SHARED_LIBS_KEEP) - else() - # Find static OpenSSL headers and libs - set(OpenSSL_USE_STATIC_LIBS ON) - set(OPENSSL_USE_STATIC_LIBS ON) - find_package(OpenSSL ${ARROW_OPENSSL_REQUIRED_VERSION} REQUIRED) - endif() + OR ARROW_S3 + OR ARROW_GANDIVA) + set(OpenSSL_SOURCE "SYSTEM") + resolve_dependency(OpenSSL + HAVE_ALT + TRUE + REQUIRED_VERSION + ${ARROW_OPENSSL_REQUIRED_VERSION}) set(ARROW_USE_OPENSSL ON) endif() if(ARROW_USE_OPENSSL) message(STATUS "Found OpenSSL Crypto Library: ${OPENSSL_CRYPTO_LIBRARY}") message(STATUS "Building with OpenSSL (Version: ${OPENSSL_VERSION}) support") - - list(APPEND ARROW_SYSTEM_DEPENDENCIES OpenSSL) else() message(STATUS "Building without OpenSSL support. Minimum OpenSSL version ${ARROW_OPENSSL_REQUIRED_VERSION} required." 
) @@ -1290,33 +1315,31 @@ macro(build_glog) set(GLOG_STATIC_LIB "${GLOG_BUILD_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}glog${GLOG_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}" ) - set(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") - set(GLOG_CMAKE_C_FLAGS "${EP_C_FLAGS} -fPIC") + set(GLOG_CMAKE_CXX_FLAGS "${EP_CXX_FLAGS}") + set(GLOG_CMAKE_C_FLAGS "${EP_C_FLAGS}") if(CMAKE_THREAD_LIBS_INIT) - set(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_THREAD_LIBS_INIT}") - set(GLOG_CMAKE_C_FLAGS "${EP_C_FLAGS} ${CMAKE_THREAD_LIBS_INIT}") + string(APPEND GLOG_CMAKE_CXX_FLAGS " ${CMAKE_THREAD_LIBS_INIT}") + string(APPEND GLOG_CMAKE_C_FLAGS " ${CMAKE_THREAD_LIBS_INIT}") endif() if(APPLE) # If we don't set this flag, the binary built with 10.13 cannot be used in 10.12. - set(GLOG_CMAKE_CXX_FLAGS "${GLOG_CMAKE_CXX_FLAGS} -mmacosx-version-min=10.9") + string(APPEND GLOG_CMAKE_CXX_FLAGS " -mmacosx-version-min=10.9") endif() set(GLOG_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${GLOG_BUILD_DIR}" - -DBUILD_SHARED_LIBS=OFF - -DBUILD_TESTING=OFF -DWITH_GFLAGS=OFF - -DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${GLOG_CMAKE_CXX_FLAGS} - -DCMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}=${GLOG_CMAKE_C_FLAGS} - -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS}) + -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${GLOG_CMAKE_C_FLAGS}) externalproject_add(glog_ep + ${EP_COMMON_OPTIONS} URL ${GLOG_SOURCE_URL} URL_HASH "SHA256=${ARROW_GLOG_BUILD_SHA256_CHECKSUM}" BUILD_IN_SOURCE 1 BUILD_BYPRODUCTS "${GLOG_STATIC_LIB}" - CMAKE_ARGS ${GLOG_CMAKE_ARGS} ${EP_LOG_OPTIONS}) + CMAKE_ARGS ${GLOG_CMAKE_ARGS}) add_dependencies(toolchain glog_ep) file(MAKE_DIRECTORY "${GLOG_INCLUDE_DIR}") @@ -1366,16 +1389,14 @@ macro(build_gflags) set(GFLAGS_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${GFLAGS_PREFIX}" - -DBUILD_SHARED_LIBS=OFF - -DBUILD_STATIC_LIBS=ON -DBUILD_PACKAGING=OFF - -DBUILD_TESTING=OFF -DBUILD_CONFIG_TESTS=OFF -DINSTALL_HEADERS=ON) file(MAKE_DIRECTORY "${GFLAGS_INCLUDE_DIR}") externalproject_add(gflags_ep - URL ${GFLAGS_SOURCE_URL} ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} + URL ${GFLAGS_SOURCE_URL} URL_HASH "SHA256=${ARROW_GFLAGS_BUILD_SHA256_CHECKSUM}" BUILD_IN_SOURCE 1 BUILD_BYPRODUCTS "${GFLAGS_STATIC_LIB}" @@ -1440,8 +1461,6 @@ macro(build_thrift) -DBoost_NO_BOOST_CMAKE=ON -DBUILD_COMPILER=OFF -DBUILD_EXAMPLES=OFF - -DBUILD_SHARED_LIBS=OFF - -DBUILD_TESTING=OFF -DBUILD_TUTORIALS=OFF -DCMAKE_DEBUG_POSTFIX= -DWITH_AS3=OFF @@ -1485,11 +1504,12 @@ macro(build_thrift) endif() externalproject_add(thrift_ep + ${EP_COMMON_OPTIONS} URL ${THRIFT_SOURCE_URL} URL_HASH "SHA256=${ARROW_THRIFT_BUILD_SHA256_CHECKSUM}" BUILD_BYPRODUCTS "${THRIFT_LIB}" CMAKE_ARGS ${THRIFT_CMAKE_ARGS} - DEPENDS ${THRIFT_DEPENDENCIES} ${EP_LOG_OPTIONS}) + DEPENDS ${THRIFT_DEPENDENCIES}) add_library(thrift::thrift STATIC IMPORTED) # The include directory must exist before it is referenced by a target. 
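The build_*() macros reworked in this and the surrounding hunks (build_glog, build_gflags, build_thrift, build_protobuf, and so on) all follow one bundled-dependency shape, now driven by the shared EP_COMMON_OPTIONS and EP_COMMON_CMAKE_ARGS variables. A minimal sketch of that shape, using a hypothetical dependency named foo; FOO_SOURCE_URL and FOO_BUILD_SHA256_CHECKSUM are placeholders, not variables defined by this patch.

# Illustrative sketch only: the common "build a vendored dependency" shape.
macro(build_foo)
  set(FOO_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/foo_ep-install")
  set(FOO_INCLUDE_DIR "${FOO_PREFIX}/include")
  set(FOO_STATIC_LIB
      "${FOO_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}foo${CMAKE_STATIC_LIBRARY_SUFFIX}")
  # EP_COMMON_OPTIONS / EP_COMMON_CMAKE_ARGS come from the definitions added earlier
  # in ThirdpartyToolchain.cmake by this patch.
  set(FOO_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${FOO_PREFIX}")

  externalproject_add(foo_ep
                      ${EP_COMMON_OPTIONS}
                      URL ${FOO_SOURCE_URL}
                      URL_HASH "SHA256=${FOO_BUILD_SHA256_CHECKSUM}"
                      BUILD_BYPRODUCTS "${FOO_STATIC_LIB}"
                      CMAKE_ARGS ${FOO_CMAKE_ARGS})

  # The include directory must exist before it is referenced by a target.
  file(MAKE_DIRECTORY "${FOO_INCLUDE_DIR}")

  add_library(foo::foo STATIC IMPORTED)
  set_target_properties(foo::foo
                        PROPERTIES IMPORTED_LOCATION "${FOO_STATIC_LIB}"
                                   INTERFACE_INCLUDE_DIRECTORIES "${FOO_INCLUDE_DIR}")
  add_dependencies(toolchain foo_ep)
  add_dependencies(foo::foo foo_ep)

  list(APPEND ARROW_BUNDLED_STATIC_LIBS foo::foo)
endmacro()

BUILD_BYPRODUCTS matters mainly for the Ninja generator, which otherwise has no rule producing the imported library file, and collecting the imported target into ARROW_BUNDLED_STATIC_LIBS is what lets the bundled archives be merged and installed later.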
@@ -1508,13 +1528,16 @@ macro(build_thrift) add_dependencies(toolchain thrift_ep) add_dependencies(thrift::thrift thrift_ep) set(Thrift_VERSION ${ARROW_THRIFT_BUILD_VERSION}) + set(THRIFT_VENDORED TRUE) list(APPEND ARROW_BUNDLED_STATIC_LIBS thrift::thrift) endmacro() if(ARROW_WITH_THRIFT) - # Thrift c++ code generated by 0.13 requires 0.11 or greater + # Thrift C++ code generated by 0.13 requires 0.11 or greater resolve_dependency(Thrift + HAVE_ALT + TRUE REQUIRED_VERSION 0.11.0 PC_PACKAGE_NAMES @@ -1578,15 +1601,11 @@ macro(build_protobuf) string(REPLACE "-ffat-lto-objects" "" PROTOBUF_CXX_FLAGS "${PROTOBUF_CXX_FLAGS}") set(PROTOBUF_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} - -DBUILD_SHARED_LIBS=OFF - -DCMAKE_INSTALL_LIBDIR=lib + "-DCMAKE_CXX_FLAGS=${PROTOBUF_CXX_FLAGS}" + "-DCMAKE_C_FLAGS=${PROTOBUF_C_FLAGS}" "-DCMAKE_INSTALL_PREFIX=${PROTOBUF_PREFIX}" -Dprotobuf_BUILD_TESTS=OFF - -Dprotobuf_DEBUG_POSTFIX= - "-DCMAKE_C_FLAGS=${PROTOBUF_C_FLAGS}" - "-DCMAKE_CXX_FLAGS=${PROTOBUF_CXX_FLAGS}" - "-DCMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}=${PROTOBUF_C_FLAGS}" - "-DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${PROTOBUF_CXX_FLAGS}") + -Dprotobuf_DEBUG_POSTFIX=) if(MSVC AND NOT ARROW_USE_STATIC_CRT) list(APPEND PROTOBUF_CMAKE_ARGS "-Dprotobuf_MSVC_STATIC_RUNTIME=OFF") endif() @@ -1598,9 +1617,8 @@ macro(build_protobuf) endif() externalproject_add(protobuf_ep - ${PROTOBUF_EXTERNAL_PROJECT_ADD_ARGS} + ${EP_COMMON_OPTIONS} ${PROTOBUF_EXTERNAL_PROJECT_ADD_ARGS} BUILD_BYPRODUCTS "${PROTOBUF_STATIC_LIB}" "${PROTOBUF_COMPILER}" - ${EP_LOG_OPTIONS} BUILD_IN_SOURCE 1 URL ${PROTOBUF_SOURCE_URL} URL_HASH "SHA256=${ARROW_PROTOBUF_BUILD_SHA256_CHECKSUM}") @@ -1634,10 +1652,6 @@ if(ARROW_WITH_PROTOBUF) if(ARROW_WITH_GRPC) # FlightSQL uses proto3 optionals, which require 3.15 or later. set(ARROW_PROTOBUF_REQUIRED_VERSION "3.15.0") - elseif(ARROW_GANDIVA_JAVA) - # google::protobuf::MessageLite::ByteSize() is deprecated since - # Protobuf 3.4.0. - set(ARROW_PROTOBUF_REQUIRED_VERSION "3.4.0") elseif(ARROW_SUBSTRAIT) # Substrait protobuf files use proto3 syntax set(ARROW_PROTOBUF_REQUIRED_VERSION "3.0.0") @@ -1645,6 +1659,8 @@ if(ARROW_WITH_PROTOBUF) set(ARROW_PROTOBUF_REQUIRED_VERSION "2.6.1") endif() resolve_dependency(Protobuf + HAVE_ALT + TRUE REQUIRED_VERSION ${ARROW_PROTOBUF_REQUIRED_VERSION} PC_PACKAGE_NAMES @@ -1722,8 +1738,11 @@ macro(build_substrait) # Note: not all protos in Substrait actually matter to plan # consumption. No need to build the ones we don't need. 
set(SUBSTRAIT_PROTOS algebra extensions/extensions plan type) + set(ARROW_SUBSTRAIT_PROTOS extension_rels) + set(ARROW_SUBSTRAIT_PROTOS_DIR "${CMAKE_SOURCE_DIR}/proto") externalproject_add(substrait_ep + ${EP_COMMON_OPTIONS} CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" @@ -1772,6 +1791,27 @@ macro(build_substrait) list(APPEND SUBSTRAIT_SOURCES "${SUBSTRAIT_PROTO_GEN}.cc") endforeach() + foreach(ARROW_SUBSTRAIT_PROTO ${ARROW_SUBSTRAIT_PROTOS}) + set(ARROW_SUBSTRAIT_PROTO_GEN + "${SUBSTRAIT_CPP_DIR}/substrait/${ARROW_SUBSTRAIT_PROTO}.pb") + foreach(EXT h cc) + set_source_files_properties("${ARROW_SUBSTRAIT_PROTO_GEN}.${EXT}" + PROPERTIES COMPILE_OPTIONS + "${SUBSTRAIT_SUPPRESSED_FLAGS}" + GENERATED TRUE + SKIP_UNITY_BUILD_INCLUSION TRUE) + list(APPEND SUBSTRAIT_PROTO_GEN_ALL "${ARROW_SUBSTRAIT_PROTO_GEN}.${EXT}") + endforeach() + add_custom_command(OUTPUT "${ARROW_SUBSTRAIT_PROTO_GEN}.cc" + "${ARROW_SUBSTRAIT_PROTO_GEN}.h" + COMMAND ${ARROW_PROTOBUF_PROTOC} "-I${SUBSTRAIT_LOCAL_DIR}/proto" + "-I${ARROW_SUBSTRAIT_PROTOS_DIR}" + "--cpp_out=${SUBSTRAIT_CPP_DIR}" + "${ARROW_SUBSTRAIT_PROTOS_DIR}/substrait/${ARROW_SUBSTRAIT_PROTO}.proto" + DEPENDS ${PROTO_DEPENDS} substrait_ep) + + list(APPEND SUBSTRAIT_SOURCES "${ARROW_SUBSTRAIT_PROTO_GEN}.cc") + endforeach() add_custom_target(substrait_gen ALL DEPENDS ${SUBSTRAIT_PROTO_GEN_ALL}) @@ -1840,6 +1880,7 @@ macro(build_jemalloc) list(APPEND JEMALLOC_BUILD_COMMAND "SDKROOT=${CMAKE_OSX_SYSROOT}") endif() externalproject_add(jemalloc_ep + ${EP_COMMON_OPTIONS} URL ${JEMALLOC_SOURCE_URL} URL_HASH "SHA256=${ARROW_JEMALLOC_BUILD_SHA256_CHECKSUM}" PATCH_COMMAND touch doc/jemalloc.3 doc/jemalloc.html @@ -1855,15 +1896,15 @@ macro(build_jemalloc) set(JEMALLOC_INCLUDE_DIR "${CMAKE_CURRENT_BINARY_DIR}/jemalloc_ep-prefix/src/") # The include directory must exist before it is referenced by a target. 
file(MAKE_DIRECTORY "${JEMALLOC_INCLUDE_DIR}") - add_library(jemalloc STATIC IMPORTED) - set_target_properties(jemalloc + add_library(jemalloc::jemalloc STATIC IMPORTED) + set_target_properties(jemalloc::jemalloc PROPERTIES INTERFACE_LINK_LIBRARIES Threads::Threads IMPORTED_LOCATION "${JEMALLOC_STATIC_LIB}" INTERFACE_INCLUDE_DIRECTORIES "${JEMALLOC_INCLUDE_DIR}") - add_dependencies(jemalloc jemalloc_ep) + add_dependencies(jemalloc::jemalloc jemalloc_ep) - list(APPEND ARROW_BUNDLED_STATIC_LIBS jemalloc) + list(APPEND ARROW_BUNDLED_STATIC_LIBS jemalloc::jemalloc) set(jemalloc_VENDORED TRUE) # For config.h.cmake @@ -1871,7 +1912,7 @@ macro(build_jemalloc) endmacro() if(ARROW_JEMALLOC) - resolve_dependency(jemalloc) + resolve_dependency(jemalloc HAVE_ALT TRUE) endif() # ---------------------------------------------------------------------- @@ -1895,11 +1936,9 @@ if(ARROW_MIMALLOC) "${MIMALLOC_PREFIX}/lib/mimalloc-2.0/${CMAKE_STATIC_LIBRARY_PREFIX}${MIMALLOC_LIB_BASE_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" ) - # Override CMAKE_INSTALL_LIBDIR to avoid lib64 installation on RedHat derivatives set(MIMALLOC_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${MIMALLOC_PREFIX}" - "-DCMAKE_INSTALL_LIBDIR=lib" -DMI_OVERRIDE=OFF -DMI_LOCAL_DYNAMIC_TLS=ON -DMI_BUILD_OBJECT=OFF @@ -1907,6 +1946,7 @@ if(ARROW_MIMALLOC) -DMI_BUILD_TESTS=OFF) externalproject_add(mimalloc_ep + ${EP_COMMON_OPTIONS} URL ${MIMALLOC_SOURCE_URL} URL_HASH "SHA256=${ARROW_MIMALLOC_BUILD_SHA256_CHECKSUM}" CMAKE_ARGS ${MIMALLOC_CMAKE_ARGS} @@ -1948,12 +1988,15 @@ macro(build_gtest) endif() if(APPLE) - set(GTEST_CMAKE_CXX_FLAGS ${GTEST_CMAKE_CXX_FLAGS} -DGTEST_USE_OWN_TR1_TUPLE=1 - -Wno-unused-value -Wno-ignored-attributes) + string(APPEND + GTEST_CMAKE_CXX_FLAGS + " -DGTEST_USE_OWN_TR1_TUPLE=1" + " -Wno-unused-value" + " -Wno-ignored-attributes") endif() - if(MSVC) - set(GTEST_CMAKE_CXX_FLAGS "${GTEST_CMAKE_CXX_FLAGS} -DGTEST_CREATE_SHARED_LIBRARY=1") + if(WIN32) + string(APPEND GTEST_CMAKE_CXX_FLAGS " -DGTEST_CREATE_SHARED_LIBRARY=1") endif() set(GTEST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/googletest_ep-prefix") @@ -1961,7 +2004,7 @@ macro(build_gtest) set(_GTEST_LIBRARY_DIR "${GTEST_PREFIX}/lib") - if(MSVC) + if(WIN32) set(_GTEST_IMPORTED_TYPE IMPORTED_IMPLIB) set(_GTEST_LIBRARY_SUFFIX "${CMAKE_GTEST_DEBUG_EXTENSION}${CMAKE_IMPORT_LIBRARY_SUFFIX}") @@ -1984,27 +2027,26 @@ macro(build_gtest) set(dummy ">") set(GTEST_CMAKE_ARGS - ${EP_COMMON_TOOLCHAIN} + ${EP_COMMON_CMAKE_ARGS} -DBUILD_SHARED_LIBS=ON - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DBUILD_STATIC_LIBS=OFF -DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${GTEST_CMAKE_CXX_FLAGS} - -DCMAKE_INSTALL_LIBDIR=lib -DCMAKE_INSTALL_NAME_DIR=${GTEST_INSTALL_NAME_DIR} -DCMAKE_INSTALL_PREFIX=${GTEST_PREFIX} -DCMAKE_MACOSX_RPATH=OFF) set(GMOCK_INCLUDE_DIR "${GTEST_PREFIX}/include") - if(MSVC AND NOT ARROW_USE_STATIC_CRT) - set(GTEST_CMAKE_ARGS ${GTEST_CMAKE_ARGS} -Dgtest_force_shared_crt=ON) + if(WIN32 AND NOT ARROW_USE_STATIC_CRT) + list(APPEND GTEST_CMAKE_ARGS -Dgtest_force_shared_crt=ON) endif() externalproject_add(googletest_ep + ${EP_COMMON_OPTIONS} URL ${GTEST_SOURCE_URL} URL_HASH "SHA256=${ARROW_GTEST_BUILD_SHA256_CHECKSUM}" BUILD_BYPRODUCTS ${GTEST_SHARED_LIB} ${GTEST_MAIN_SHARED_LIB} ${GMOCK_SHARED_LIB} - CMAKE_ARGS ${GTEST_CMAKE_ARGS} ${EP_LOG_OPTIONS}) + CMAKE_ARGS ${GTEST_CMAKE_ARGS}) if(WIN32) # Copy the built shared libraries to the same directory as our # test programs because Windows doesn't provided rpath (run-time @@ -2096,13 
+2138,10 @@ macro(build_benchmark) message(FATAL_ERROR "Building gbenchmark from source requires at least CMake 3.6") endif() - if(NOT MSVC) - set(GBENCHMARK_CMAKE_CXX_FLAGS "${EP_CXX_FLAGS} -std=c++11") - endif() - + set(GBENCHMARK_CMAKE_CXX_FLAGS "${EP_CXX_FLAGS}") if(APPLE AND (CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")) - set(GBENCHMARK_CMAKE_CXX_FLAGS "${GBENCHMARK_CMAKE_CXX_FLAGS} -stdlib=libc++") + string(APPEND GBENCHMARK_CMAKE_CXX_FLAGS " -stdlib=libc++") endif() set(GBENCHMARK_PREFIX @@ -2115,21 +2154,19 @@ macro(build_benchmark) "${GBENCHMARK_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}benchmark_main${CMAKE_STATIC_LIBRARY_SUFFIX}" ) set(GBENCHMARK_CMAKE_ARGS - ${EP_COMMON_CMAKE_ARGS} - "-DCMAKE_INSTALL_PREFIX=${GBENCHMARK_PREFIX}" - -DCMAKE_INSTALL_LIBDIR=lib - -DBENCHMARK_ENABLE_TESTING=OFF - -DCMAKE_CXX_FLAGS=${GBENCHMARK_CMAKE_CXX_FLAGS}) + ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${GBENCHMARK_PREFIX}" + -DBENCHMARK_ENABLE_TESTING=OFF -DCMAKE_CXX_FLAGS=${GBENCHMARK_CMAKE_CXX_FLAGS}) if(APPLE) set(GBENCHMARK_CMAKE_ARGS ${GBENCHMARK_CMAKE_ARGS} "-DBENCHMARK_USE_LIBCXX=ON") endif() externalproject_add(gbenchmark_ep + ${EP_COMMON_OPTIONS} URL ${GBENCHMARK_SOURCE_URL} URL_HASH "SHA256=${ARROW_GBENCHMARK_BUILD_SHA256_CHECKSUM}" BUILD_BYPRODUCTS "${GBENCHMARK_STATIC_LIB}" "${GBENCHMARK_MAIN_STATIC_LIB}" - CMAKE_ARGS ${GBENCHMARK_CMAKE_ARGS} ${EP_LOG_OPTIONS}) + CMAKE_ARGS ${GBENCHMARK_CMAKE_ARGS}) # The include directory must exist before it is referenced by a target. file(MAKE_DIRECTORY "${GBENCHMARK_INCLUDE_DIR}") @@ -2172,7 +2209,7 @@ macro(build_rapidjson) "-DCMAKE_INSTALL_PREFIX=${RAPIDJSON_PREFIX}") externalproject_add(rapidjson_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} PREFIX "${CMAKE_BINARY_DIR}" URL ${RAPIDJSON_SOURCE_URL} URL_HASH "SHA256=${ARROW_RAPIDJSON_BUILD_SHA256_CHECKSUM}" @@ -2224,7 +2261,7 @@ macro(build_xsimd) set(XSIMD_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${XSIMD_PREFIX}") externalproject_add(xsimd_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} PREFIX "${CMAKE_BINARY_DIR}" URL ${XSIMD_SOURCE_URL} URL_HASH "SHA256=${ARROW_XSIMD_BUILD_SHA256_CHECKSUM}" @@ -2248,7 +2285,11 @@ else() endif() if(ARROW_USE_XSIMD) - resolve_dependency(xsimd REQUIRED_VERSION "8.1.0") + resolve_dependency(xsimd + REQUIRED_VERSION + "8.1.0" + FORCE_ANY_NEWER_VERSION + TRUE) if(xsimd_SOURCE STREQUAL "BUNDLED") add_library(xsimd INTERFACE IMPORTED) @@ -2276,11 +2317,11 @@ macro(build_zlib) set(ZLIB_STATIC_LIB_NAME libz.a) endif() set(ZLIB_STATIC_LIB "${ZLIB_PREFIX}/lib/${ZLIB_STATIC_LIB_NAME}") - set(ZLIB_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${ZLIB_PREFIX}" - -DBUILD_SHARED_LIBS=OFF) + set(ZLIB_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${ZLIB_PREFIX}") externalproject_add(zlib_ep - URL ${ZLIB_SOURCE_URL} ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} + URL ${ZLIB_SOURCE_URL} URL_HASH "SHA256=${ARROW_ZLIB_BUILD_SHA256_CHECKSUM}" BUILD_BYPRODUCTS "${ZLIB_STATIC_LIB}" CMAKE_ARGS ${ZLIB_CMAKE_ARGS}) @@ -2306,53 +2347,38 @@ if(ARROW_WITH_ZLIB) endif() macro(build_lz4) - message(STATUS "Building lz4 from source") - set(LZ4_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/lz4_ep-prefix/src/lz4_ep") - set(LZ4_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/lz4_ep-prefix") - - if(MSVC) - if(ARROW_USE_STATIC_CRT) - if(${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG") - set(LZ4_RUNTIME_LIBRARY_LINKAGE "/p:RuntimeLibrary=MultiThreadedDebug") - else() - set(LZ4_RUNTIME_LIBRARY_LINKAGE "/p:RuntimeLibrary=MultiThreaded") - 
endif() - endif() - set(LZ4_STATIC_LIB - "${LZ4_BUILD_DIR}/build/VS2010/bin/x64_${CMAKE_BUILD_TYPE}/liblz4_static.lib") - set(LZ4_BUILD_COMMAND - BUILD_COMMAND msbuild.exe /m /p:Configuration=${CMAKE_BUILD_TYPE} /p:Platform=x64 - /p:PlatformToolset=v140 ${LZ4_RUNTIME_LIBRARY_LINKAGE} /t:Build - ${LZ4_BUILD_DIR}/build/VS2010/lz4.sln) - else() - set(LZ4_STATIC_LIB "${LZ4_BUILD_DIR}/lib/liblz4.a") - # Must explicitly invoke sh on MinGW - set(LZ4_BUILD_COMMAND - BUILD_COMMAND sh "${CMAKE_CURRENT_SOURCE_DIR}/build-support/build-lz4-lib.sh" - "AR=${CMAKE_AR}" "OS=${CMAKE_SYSTEM_NAME}") + message(STATUS "Building LZ4 from source") + if(CMAKE_VERSION VERSION_LESS 3.7) + message(FATAL_ERROR "Building LZ4 using ExternalProject requires at least CMake 3.7") endif() + set(LZ4_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/lz4_ep-install") + + set(LZ4_STATIC_LIB + "${LZ4_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}lz4${CMAKE_STATIC_LIBRARY_SUFFIX}") + + set(LZ4_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} -DCMAKE_INSTALL_PREFIX= + -DLZ4_BUILD_CLI=OFF -DLZ4_BUILD_LEGACY_LZ4C=OFF) + # We need to copy the header in lib to directory outside of the build externalproject_add(lz4_ep - URL ${LZ4_SOURCE_URL} ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} + CMAKE_ARGS ${LZ4_CMAKE_ARGS} + SOURCE_SUBDIR "build/cmake" + INSTALL_DIR ${LZ4_PREFIX} + URL ${LZ4_SOURCE_URL} URL_HASH "SHA256=${ARROW_LZ4_BUILD_SHA256_CHECKSUM}" - UPDATE_COMMAND ${CMAKE_COMMAND} -E copy_directory - "${LZ4_BUILD_DIR}/lib" "${LZ4_PREFIX}/include" - ${LZ4_PATCH_COMMAND} - CONFIGURE_COMMAND "" - INSTALL_COMMAND "" - BINARY_DIR ${LZ4_BUILD_DIR} - BUILD_BYPRODUCTS ${LZ4_STATIC_LIB} ${LZ4_BUILD_COMMAND}) + BUILD_BYPRODUCTS ${LZ4_STATIC_LIB}) file(MAKE_DIRECTORY "${LZ4_PREFIX}/include") - add_library(lz4::lz4 STATIC IMPORTED) - set_target_properties(lz4::lz4 + add_library(LZ4::lz4 STATIC IMPORTED) + set_target_properties(LZ4::lz4 PROPERTIES IMPORTED_LOCATION "${LZ4_STATIC_LIB}" INTERFACE_INCLUDE_DIRECTORIES "${LZ4_PREFIX}/include") add_dependencies(toolchain lz4_ep) - add_dependencies(lz4::lz4 lz4_ep) + add_dependencies(LZ4::lz4 lz4_ep) - list(APPEND ARROW_BUNDLED_STATIC_LIBS lz4::lz4) + list(APPEND ARROW_BUNDLED_STATIC_LIBS LZ4::lz4) endmacro() if(ARROW_WITH_LZ4) @@ -2364,42 +2390,33 @@ if(ARROW_WITH_LZ4) endif() macro(build_zstd) - message(STATUS "Building zstd from source") + message(STATUS "Building Zstandard from source") + if(CMAKE_VERSION VERSION_LESS 3.7) + message(FATAL_ERROR "Building Zstandard using ExternalProject requires at least CMake 3.7" + ) + endif() + set(ZSTD_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/zstd_ep-install") set(ZSTD_CMAKE_ARGS - ${EP_COMMON_TOOLCHAIN} + ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${ZSTD_PREFIX}" - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR} - -DZSTD_BUILD_PROGRAMS=off - -DZSTD_BUILD_SHARED=off - -DZSTD_BUILD_STATIC=on - -DZSTD_MULTITHREAD_SUPPORT=off) + -DZSTD_BUILD_PROGRAMS=OFF + -DZSTD_BUILD_SHARED=OFF + -DZSTD_BUILD_STATIC=ON + -DZSTD_MULTITHREAD_SUPPORT=OFF) if(MSVC) - set(ZSTD_STATIC_LIB "${ZSTD_PREFIX}/${CMAKE_INSTALL_LIBDIR}/zstd_static.lib") + set(ZSTD_STATIC_LIB "${ZSTD_PREFIX}/lib/zstd_static.lib") if(ARROW_USE_STATIC_CRT) - set(ZSTD_CMAKE_ARGS ${ZSTD_CMAKE_ARGS} "-DZSTD_USE_STATIC_RUNTIME=on") + list(APPEND ZSTD_CMAKE_ARGS "-DZSTD_USE_STATIC_RUNTIME=ON") endif() else() - set(ZSTD_STATIC_LIB "${ZSTD_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libzstd.a") - # Only pass our C flags on Unix as on MSVC it leads to a - # "incompatible command-line options" error - set(ZSTD_CMAKE_ARGS - 
${ZSTD_CMAKE_ARGS} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_FLAGS=${EP_C_FLAGS} - -DCMAKE_CXX_FLAGS=${EP_CXX_FLAGS}) - endif() - - if(CMAKE_VERSION VERSION_LESS 3.7) - message(FATAL_ERROR "Building zstd using ExternalProject requires at least CMake 3.7") + set(ZSTD_STATIC_LIB "${ZSTD_PREFIX}/lib/libzstd.a") endif() externalproject_add(zstd_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} CMAKE_ARGS ${ZSTD_CMAKE_ARGS} SOURCE_SUBDIR "build/cmake" INSTALL_DIR ${ZSTD_PREFIX} @@ -2409,39 +2426,41 @@ macro(build_zstd) file(MAKE_DIRECTORY "${ZSTD_PREFIX}/include") - add_library(zstd::libzstd STATIC IMPORTED) - set_target_properties(zstd::libzstd + add_library(zstd::libzstd_static STATIC IMPORTED) + set_target_properties(zstd::libzstd_static PROPERTIES IMPORTED_LOCATION "${ZSTD_STATIC_LIB}" INTERFACE_INCLUDE_DIRECTORIES "${ZSTD_PREFIX}/include") add_dependencies(toolchain zstd_ep) - add_dependencies(zstd::libzstd zstd_ep) + add_dependencies(zstd::libzstd_static zstd_ep) + + list(APPEND ARROW_BUNDLED_STATIC_LIBS zstd::libzstd_static) - list(APPEND ARROW_BUNDLED_STATIC_LIBS zstd::libzstd) + set(ZSTD_VENDORED TRUE) endmacro() if(ARROW_WITH_ZSTD) # ARROW-13384: ZSTD_minCLevel was added in v1.4.0, required by ARROW-13091 resolve_dependency(zstd + HAVE_ALT + TRUE PC_PACKAGE_NAMES libzstd REQUIRED_VERSION 1.4.0) - if(TARGET zstd::libzstd) - set(ARROW_ZSTD_LIBZSTD zstd::libzstd) + if(ZSTD_VENDORED) + set(ARROW_ZSTD_LIBZSTD zstd::libzstd_static) else() - # "SYSTEM" source will prioritize cmake config, which exports - # zstd::libzstd_{static,shared} if(ARROW_ZSTD_USE_SHARED) - if(TARGET zstd::libzstd_shared) - set(ARROW_ZSTD_LIBZSTD zstd::libzstd_shared) - endif() + set(ARROW_ZSTD_LIBZSTD zstd::libzstd_shared) else() - if(TARGET zstd::libzstd_static) - set(ARROW_ZSTD_LIBZSTD zstd::libzstd_static) - endif() + set(ARROW_ZSTD_LIBZSTD zstd::libzstd_static) endif() + if(NOT TARGET ${ARROW_ZSTD_LIBZSTD}) + message(FATAL_ERROR "Zstandard target doesn't exist: ${ARROW_ZSTD_LIBZSTD}") + endif() + message(STATUS "Found Zstandard: ${ARROW_ZSTD_LIBZSTD}") endif() endif() @@ -2454,11 +2473,10 @@ macro(build_re2) set(RE2_STATIC_LIB "${RE2_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}re2${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(RE2_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${RE2_PREFIX}" - -DCMAKE_INSTALL_LIBDIR=lib) + set(RE2_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${RE2_PREFIX}") externalproject_add(re2_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} INSTALL_DIR ${RE2_PREFIX} URL ${RE2_SOURCE_URL} URL_HASH "SHA256=${ARROW_RE2_BUILD_SHA256_CHECKSUM}" @@ -2524,7 +2542,7 @@ macro(build_bzip2) endif() externalproject_add(bzip2_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} CONFIGURE_COMMAND "" BUILD_IN_SOURCE 1 BUILD_COMMAND ${MAKE} libbz2.a ${MAKE_BUILD_ARGS} @@ -2575,15 +2593,11 @@ macro(build_utf8proc) ) endif() - set(UTF8PROC_CMAKE_ARGS - ${EP_COMMON_TOOLCHAIN} - "-DCMAKE_INSTALL_PREFIX=${UTF8PROC_PREFIX}" - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DCMAKE_INSTALL_LIBDIR=lib - -DBUILD_SHARED_LIBS=OFF) + set(UTF8PROC_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} + "-DCMAKE_INSTALL_PREFIX=${UTF8PROC_PREFIX}") externalproject_add(utf8proc_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} CMAKE_ARGS ${UTF8PROC_CMAKE_ARGS} INSTALL_DIR ${UTF8PROC_PREFIX} URL ${ARROW_UTF8PROC_SOURCE_URL} @@ -2624,15 +2638,11 @@ macro(build_cares) "${CARES_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}cares${CMAKE_STATIC_LIBRARY_SUFFIX}" ) - set(CARES_CMAKE_ARGS - 
"${EP_COMMON_CMAKE_ARGS}" - -DCARES_STATIC=ON - -DCARES_SHARED=OFF - -DCMAKE_INSTALL_LIBDIR=lib - "-DCMAKE_INSTALL_PREFIX=${CARES_PREFIX}") + set(CARES_CMAKE_ARGS "${EP_COMMON_CMAKE_ARGS}" "-DCMAKE_INSTALL_PREFIX=${CARES_PREFIX}" + -DCARES_SHARED=OFF -DCARES_STATIC=ON) externalproject_add(cares_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} URL ${CARES_SOURCE_URL} URL_HASH "SHA256=${ARROW_CARES_BUILD_SHA256_CHECKSUM}" CMAKE_ARGS ${CARES_CMAKE_ARGS} @@ -2696,7 +2706,7 @@ macro(build_absl) set(ABSL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/absl_ep-install") set(ABSL_INCLUDE_DIR "${ABSL_PREFIX}/include") set(ABSL_CMAKE_ARGS "${EP_COMMON_CMAKE_ARGS}" -DABSL_RUN_TESTS=OFF - -DCMAKE_INSTALL_LIBDIR=lib "-DCMAKE_INSTALL_PREFIX=${ABSL_PREFIX}") + "-DCMAKE_INSTALL_PREFIX=${ABSL_PREFIX}") set(ABSL_BUILD_BYPRODUCTS) set(ABSL_LIBRARIES) @@ -3612,7 +3622,7 @@ macro(build_absl) endif() externalproject_add(absl_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} URL ${ABSL_SOURCE_URL} URL_HASH "SHA256=${ARROW_ABSL_BUILD_SHA256_CHECKSUM}" CMAKE_ARGS ${ABSL_CMAKE_ARGS} @@ -3710,27 +3720,29 @@ macro(build_grpc) endif() # Yuck, see https://stackoverflow.com/a/45433229/776560 - string(REPLACE ";" "|" GRPC_PREFIX_PATH_ALT_SEP "${GRPC_CMAKE_PREFIX}") + string(REPLACE ";" ${EP_LIST_SEPARATOR} GRPC_PREFIX_PATH_ALT_SEP "${GRPC_CMAKE_PREFIX}") set(GRPC_C_FLAGS "${EP_C_FLAGS}") set(GRPC_CXX_FLAGS "${EP_CXX_FLAGS}") if(NOT MSVC) # Negate warnings that gRPC cannot build under # See https://github.com/grpc/grpc/issues/29417 - set(GRPC_C_FLAGS - "${GRPC_C_FLAGS} -Wno-attributes -Wno-format-security -Wno-unknown-warning-option" - ) - set(GRPC_CXX_FLAGS - "${GRPC_CXX_FLAGS} -Wno-attributes -Wno-format-security -Wno-unknown-warning-option" - ) + string(APPEND + GRPC_C_FLAGS + " -Wno-attributes" + " -Wno-format-security" + " -Wno-unknown-warning-option") + string(APPEND + GRPC_CXX_FLAGS + " -Wno-attributes" + " -Wno-format-security" + " -Wno-unknown-warning-option") endif() set(GRPC_CMAKE_ARGS "${EP_COMMON_CMAKE_ARGS}" "-DCMAKE_C_FLAGS=${GRPC_C_FLAGS}" "-DCMAKE_CXX_FLAGS=${GRPC_CXX_FLAGS}" - "-DCMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}=${GRPC_C_FLAGS}" - "-DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${GRPC_CXX_FLAGS}" -DCMAKE_PREFIX_PATH='${GRPC_PREFIX_PATH_ALT_SEP}' -DgRPC_ABSL_PROVIDER=package -DgRPC_BUILD_CSHARP_EXT=OFF @@ -3748,9 +3760,7 @@ macro(build_grpc) -DgRPC_RE2_PROVIDER=package -DgRPC_SSL_PROVIDER=package -DgRPC_ZLIB_PROVIDER=package - -DCMAKE_INSTALL_PREFIX=${GRPC_PREFIX} - -DCMAKE_INSTALL_LIBDIR=lib - -DBUILD_SHARED_LIBS=OFF) + -DCMAKE_INSTALL_PREFIX=${GRPC_PREFIX}) if(PROTOBUF_VENDORED) list(APPEND GRPC_CMAKE_ARGS -DgRPC_PROTOBUF_PACKAGE_TYPE=CONFIG) endif() @@ -3762,9 +3772,9 @@ macro(build_grpc) # Ideally, we should be able to use the tarballs, but they don't contain # vendored dependencies such as c-ares... 
externalproject_add(grpc_ep + ${EP_COMMON_OPTIONS} URL ${GRPC_SOURCE_URL} URL_HASH "SHA256=${ARROW_GRPC_BUILD_SHA256_CHECKSUM}" - LIST_SEPARATOR | BUILD_BYPRODUCTS ${GRPC_STATIC_LIBRARY_GPR} ${GRPC_STATIC_LIBRARY_GRPC} ${GRPC_STATIC_LIBRARY_GRPCPP} @@ -3772,7 +3782,7 @@ macro(build_grpc) ${GRPC_STATIC_LIBRARY_GRPCPP_REFLECTION} ${GRPC_STATIC_LIBRARY_UPB} ${GRPC_CPP_PLUGIN} - CMAKE_ARGS ${GRPC_CMAKE_ARGS} ${EP_LOG_OPTIONS} + CMAKE_ARGS ${GRPC_CMAKE_ARGS} DEPENDS ${grpc_dependencies}) # Work around https://gitlab.kitware.com/cmake/cmake/issues/15052 @@ -3917,7 +3927,7 @@ macro(build_grpc) gRPC::grpc gRPC::grpcpp_for_bundling gRPC::upb) - if(ABS_VENDORED) + if(ABSL_VENDORED) list(APPEND ARROW_BUNDLED_STATIC_LIBS ${GRPC_GPR_ABSL_LIBRARIES}) endif() endmacro() @@ -3968,7 +3978,6 @@ macro(build_crc32c_once) set(CRC32C_INCLUDE_DIR "${CRC32C_PREFIX}/include") set(CRC32C_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} - -DCMAKE_INSTALL_LIBDIR=lib "-DCMAKE_INSTALL_PREFIX=" -DCRC32C_BUILD_TESTS=OFF -DCRC32C_BUILD_BENCHMARKS=OFF @@ -3981,7 +3990,7 @@ macro(build_crc32c_once) set(CRC32C_LIBRARIES crc32c) externalproject_add(crc32c_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} INSTALL_DIR ${CRC32C_PREFIX} URL ${CRC32C_SOURCE_URL} URL_HASH "SHA256=${ARROW_CRC32C_BUILD_SHA256_CHECKSUM}" @@ -4004,13 +4013,13 @@ macro(build_nlohmann_json) set(NLOHMANN_JSON_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/nlohmann_json_ep-install") set(NLOHMANN_JSON_INCLUDE_DIR "${NLOHMANN_JSON_PREFIX}/include") set(NLOHMANN_JSON_CMAKE_ARGS - ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=" -DBUILD_TESTING=OFF + ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=" -DJSON_BuildTests=OFF) set(NLOHMANN_JSON_BUILD_BYPRODUCTS ${NLOHMANN_JSON_PREFIX}/include/nlohmann/json.hpp) externalproject_add(nlohmann_json_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} INSTALL_DIR ${NLOHMANN_JSON_PREFIX} URL ${NLOHMANN_JSON_SOURCE_URL} URL_HASH "SHA256=${ARROW_NLOHMANN_JSON_BUILD_SHA256_CHECKSUM}" @@ -4044,7 +4053,10 @@ macro(build_google_cloud_cpp_storage) # Curl is required on all platforms, but building it internally might also trip over S3's copy. # For now, force its inclusion from the underlying system or fail. 
find_curl() - find_package(OpenSSL ${ARROW_OPENSSL_REQUIRED_VERSION} REQUIRED) + if(NOT OpenSSL_FOUND) + resolve_dependency(OpenSSL HAVE_ALT REQUIRED_VERSION + ${ARROW_OPENSSL_REQUIRED_VERSION}) + endif() # Build google-cloud-cpp, with only storage_client @@ -4058,18 +4070,15 @@ macro(build_google_cloud_cpp_storage) list(APPEND GOOGLE_CLOUD_CPP_PREFIX_PATH_LIST ${CRC32C_PREFIX}) list(APPEND GOOGLE_CLOUD_CPP_PREFIX_PATH_LIST ${NLOHMANN_JSON_PREFIX}) - set(GOOGLE_CLOUD_CPP_PREFIX_PATH_LIST_SEP_CHAR "|") # JOIN is CMake >=3.12 only - string(REPLACE ";" ${GOOGLE_CLOUD_CPP_PREFIX_PATH_LIST_SEP_CHAR} - GOOGLE_CLOUD_CPP_PREFIX_PATH "${GOOGLE_CLOUD_CPP_PREFIX_PATH_LIST}") + string(REPLACE ";" ${EP_LIST_SEPARATOR} GOOGLE_CLOUD_CPP_PREFIX_PATH + "${GOOGLE_CLOUD_CPP_PREFIX_PATH_LIST}") set(GOOGLE_CLOUD_CPP_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/google_cloud_cpp_ep-install") set(GOOGLE_CLOUD_CPP_INCLUDE_DIR "${GOOGLE_CLOUD_CPP_INSTALL_PREFIX}/include") set(GOOGLE_CLOUD_CPP_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} - -DBUILD_TESTING=OFF - -DCMAKE_INSTALL_LIBDIR=lib "-DCMAKE_INSTALL_PREFIX=" -DCMAKE_INSTALL_RPATH=$ORIGIN -DCMAKE_PREFIX_PATH=${GOOGLE_CLOUD_CPP_PREFIX_PATH} @@ -4119,8 +4128,7 @@ macro(build_google_cloud_cpp_storage) endif() endif() externalproject_add(google_cloud_cpp_ep - ${EP_LOG_OPTIONS} - LIST_SEPARATOR ${GOOGLE_CLOUD_CPP_PREFIX_PATH_LIST_SEP_CHAR} + ${EP_COMMON_OPTIONS} INSTALL_DIR ${GOOGLE_CLOUD_CPP_INSTALL_PREFIX} URL ${google_cloud_cpp_storage_SOURCE_URL} URL_HASH "SHA256=${ARROW_GOOGLE_CLOUD_CPP_BUILD_SHA256_CHECKSUM}" @@ -4229,7 +4237,7 @@ macro(build_google_cloud_cpp_storage) endmacro() if(ARROW_WITH_GOOGLE_CLOUD_CPP) - resolve_dependency(google_cloud_cpp_storage) + resolve_dependency(google_cloud_cpp_storage PC_PACKAGE_NAMES google_cloud_cpp_storage) get_target_property(google_cloud_cpp_storage_INCLUDE_DIR google-cloud-cpp::storage INTERFACE_INCLUDE_DIRECTORIES) message(STATUS "Found google-cloud-cpp::storage headers: ${google_cloud_cpp_storage_INCLUDE_DIR}" @@ -4275,6 +4283,8 @@ macro(build_orc) set(ORC_STATIC_LIB "${ORC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}orc${CMAKE_STATIC_LIBRARY_SUFFIX}") + get_target_property(ORC_PROTOBUF_EXECUTABLE ${ARROW_PROTOBUF_PROTOC} IMPORTED_LOCATION) + get_target_property(ORC_PROTOBUF_INCLUDE_DIR ${ARROW_PROTOBUF_LIBPROTOBUF} INTERFACE_INCLUDE_DIRECTORIES) get_filename_component(ORC_PROTOBUF_ROOT "${ORC_PROTOBUF_INCLUDE_DIR}" DIRECTORY) @@ -4286,15 +4296,17 @@ macro(build_orc) INTERFACE_INCLUDE_DIRECTORIES) get_filename_component(ORC_SNAPPY_ROOT "${ORC_SNAPPY_INCLUDE_DIR}" DIRECTORY) - get_target_property(ORC_LZ4_ROOT lz4::lz4 INTERFACE_INCLUDE_DIRECTORIES) + get_target_property(ORC_LZ4_ROOT LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) get_filename_component(ORC_LZ4_ROOT "${ORC_LZ4_ROOT}" DIRECTORY) + get_target_property(ORC_ZSTD_ROOT ${ARROW_ZSTD_LIBZSTD} INTERFACE_INCLUDE_DIRECTORIES) + get_filename_component(ORC_ZSTD_ROOT "${ORC_ZSTD_ROOT}" DIRECTORY) + # Weirdly passing in PROTOBUF_LIBRARY for PROTOC_LIBRARY still results in ORC finding # the protoc library. 
set(ORC_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${ORC_PREFIX}" - -DCMAKE_CXX_FLAGS=${EP_CXX_FLAGS} -DSTOP_BUILD_ON_WARNING=OFF -DBUILD_LIBHDFSPP=OFF -DBUILD_JAVA=OFF @@ -4303,12 +4315,13 @@ macro(build_orc) -DINSTALL_VENDORED_LIBS=OFF "-DSNAPPY_HOME=${ORC_SNAPPY_ROOT}" "-DSNAPPY_INCLUDE_DIR=${ORC_SNAPPY_INCLUDE_DIR}" + "-DPROTOBUF_EXECUTABLE=${ORC_PROTOBUF_EXECUTABLE}" "-DPROTOBUF_HOME=${ORC_PROTOBUF_ROOT}" "-DPROTOBUF_INCLUDE_DIR=${ORC_PROTOBUF_INCLUDE_DIR}" "-DPROTOBUF_LIBRARY=${ORC_PROTOBUF_LIBRARY}" "-DPROTOC_LIBRARY=${ORC_PROTOBUF_LIBRARY}" - "-DLZ4_HOME=${LZ4_HOME}" - "-DZSTD_HOME=${ZSTD_HOME}") + "-DLZ4_HOME=${ORC_LZ4_ROOT}" + "-DZSTD_HOME=${ORZ_ZSTD_ROOT}") if(ORC_PROTOBUF_EXECUTABLE) set(ORC_CMAKE_ARGS ${ORC_CMAKE_ARGS} "-DPROTOBUF_EXECUTABLE:FILEPATH=${ORC_PROTOBUF_EXECUTABLE}") @@ -4321,23 +4334,36 @@ macro(build_orc) file(MAKE_DIRECTORY ${ORC_INCLUDE_DIR}) externalproject_add(orc_ep + ${EP_COMMON_OPTIONS} URL ${ORC_SOURCE_URL} URL_HASH "SHA256=${ARROW_ORC_BUILD_SHA256_CHECKSUM}" BUILD_BYPRODUCTS ${ORC_STATIC_LIB} - CMAKE_ARGS ${ORC_CMAKE_ARGS} ${EP_LOG_OPTIONS}) - - add_dependencies(toolchain orc_ep) + CMAKE_ARGS ${ORC_CMAKE_ARGS} + DEPENDS ${ARROW_PROTOBUF_LIBPROTOBUF} + ${ARROW_ZSTD_LIBZSTD} + ${Snappy_TARGET} + LZ4::lz4 + ZLIB::ZLIB) set(ORC_VENDORED 1) - add_dependencies(orc_ep ZLIB::ZLIB) - add_dependencies(orc_ep lz4::lz4) - add_dependencies(orc_ep ${Snappy_TARGET}) - add_dependencies(orc_ep ${ARROW_PROTOBUF_LIBPROTOBUF}) add_library(orc::liborc STATIC IMPORTED) set_target_properties(orc::liborc PROPERTIES IMPORTED_LOCATION "${ORC_STATIC_LIB}" INTERFACE_INCLUDE_DIRECTORIES "${ORC_INCLUDE_DIR}") + set(ORC_LINK_LIBRARIES LZ4::lz4 ZLIB::ZLIB ${ARROW_ZSTD_LIBZSTD} ${Snappy_TARGET}) + if(NOT MSVC) + if(NOT APPLE) + list(APPEND ORC_LINK_LIBRARIES Threads::Threads) + endif() + list(APPEND ORC_LINK_LIBRARIES ${CMAKE_DL_LIBS}) + endif() + if(CMAKE_VERSION VERSION_LESS 3.11) + set_target_properties(orc::liborc PROPERTIES INTERFACE_LINK_LIBRARIES + "${ORC_LINK_LIBRARIES}") + else() + target_link_libraries(orc::liborc INTERFACE ${ORC_LINK_LIBRARIES}) + endif() add_dependencies(toolchain orc_ep) add_dependencies(orc::liborc orc_ep) @@ -4414,12 +4440,7 @@ macro(build_opentelemetry) endforeach() set(OPENTELEMETRY_CMAKE_ARGS - ${EP_COMMON_TOOLCHAIN} - "-DCMAKE_INSTALL_PREFIX=${OPENTELEMETRY_PREFIX}" - "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" - -DCMAKE_INSTALL_LIBDIR=lib - "-DCMAKE_CXX_FLAGS=${EP_CXX_FLAGS}" - -DBUILD_TESTING=OFF + ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${OPENTELEMETRY_PREFIX}" -DWITH_EXAMPLES=OFF) set(OPENTELEMETRY_PREFIX_PATH_LIST) @@ -4456,7 +4477,7 @@ macro(build_opentelemetry) # ExternalProject that just fetches the Protobufs, then add a custom step # to the main build to copy the Protobufs. 
externalproject_add(opentelemetry_proto_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} URL_HASH "SHA256=${ARROW_OPENTELEMETRY_PROTO_BUILD_SHA256_CHECKSUM}" URL ${OPENTELEMETRY_PROTO_SOURCE_URL} BUILD_COMMAND "" @@ -4467,19 +4488,17 @@ macro(build_opentelemetry) add_dependencies(opentelemetry_dependencies nlohmann_json::nlohmann_json opentelemetry_proto_ep ${ARROW_PROTOBUF_LIBPROTOBUF}) - set(OPENTELEMETRY_PREFIX_PATH_LIST_SEP_CHAR "|") # JOIN is CMake >=3.12 only - string(REPLACE ";" "${OPENTELEMETRY_PREFIX_PATH_LIST_SEP_CHAR}" - OPENTELEMETRY_PREFIX_PATH "${OPENTELEMETRY_PREFIX_PATH_LIST}") + string(REPLACE ";" "${EP_LIST_SEPARATOR}" OPENTELEMETRY_PREFIX_PATH + "${OPENTELEMETRY_PREFIX_PATH_LIST}") list(APPEND OPENTELEMETRY_CMAKE_ARGS "-DCMAKE_PREFIX_PATH=${OPENTELEMETRY_PREFIX_PATH}") if(CMAKE_SYSTEM_PROCESSOR STREQUAL "s390x") # OpenTelemetry tries to determine the processor arch for vcpkg, which fails # on s390x, even though it doesn't use vcpkg there. Tell it ARCH manually externalproject_add(opentelemetry_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} URL_HASH "SHA256=${ARROW_OPENTELEMETRY_BUILD_SHA256_CHECKSUM}" - LIST_SEPARATOR ${OPENTELEMETRY_PREFIX_PATH_LIST_SEP_CHAR} CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ARCH=s390x ${CMAKE_COMMAND} -G ${CMAKE_GENERATOR} "" @@ -4494,9 +4513,8 @@ macro(build_opentelemetry) DEPENDS ${_OPENTELEMETRY_DEPENDENCIES}) else() externalproject_add(opentelemetry_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} URL_HASH "SHA256=${ARROW_OPENTELEMETRY_BUILD_SHA256_CHECKSUM}" - LIST_SEPARATOR ${OPENTELEMETRY_PREFIX_PATH_LIST_SEP_CHAR} CMAKE_ARGS ${OPENTELEMETRY_CMAKE_ARGS} URL ${OPENTELEMETRY_SOURCE_URL} BUILD_BYPRODUCTS ${OPENTELEMETRY_BUILD_BYPRODUCTS} @@ -4548,11 +4566,11 @@ macro(build_opentelemetry) foreach(_OPENTELEMETRY_LIB ${_OPENTELEMETRY_LIBS}) add_dependencies(opentelemetry-cpp::${_OPENTELEMETRY_LIB} opentelemetry_ep) + list(APPEND ARROW_BUNDLED_STATIC_LIBS opentelemetry-cpp::${_OPENTELEMETRY_LIB}) endforeach() # Work around https://gitlab.kitware.com/cmake/cmake/issues/15052 file(MAKE_DIRECTORY ${OPENTELEMETRY_INCLUDE_DIR}) - endmacro() if(ARROW_WITH_OPENTELEMETRY) @@ -4571,13 +4589,8 @@ endif() macro(build_awssdk) message(STATUS "Building AWS C++ SDK from source") - if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS - "4.9") - message(FATAL_ERROR "AWS C++ SDK requires gcc >= 4.9") - endif() set(AWSSDK_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/awssdk_ep-install") set(AWSSDK_INCLUDE_DIR "${AWSSDK_PREFIX}/include") - set(AWSSDK_LIB_DIR "lib") if(WIN32) # On Windows, need to match build types @@ -4591,20 +4604,15 @@ macro(build_awssdk) set(AWSSDK_COMMON_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} - -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=${AWSSDK_BUILD_TYPE} - -DCMAKE_INSTALL_LIBDIR=${AWSSDK_LIB_DIR} -DENABLE_TESTING=OFF -DENABLE_UNITY_BUILD=ON "-DCMAKE_INSTALL_PREFIX=${AWSSDK_PREFIX}" "-DCMAKE_PREFIX_PATH=${AWSSDK_PREFIX}") if(NOT MSVC) - list(APPEND - AWSSDK_COMMON_CMAKE_ARGS + list(APPEND AWSSDK_COMMON_CMAKE_ARGS # Workaround for https://github.com/aws/aws-sdk-cpp/issues/1582 - "-DCMAKE_CXX_FLAGS=${EP_CXX_FLAGS} -Wno-error=deprecated-declarations" - "-DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_CXX_FLAGS} -Wno-error=deprecated-declarations" - ) + "-DCMAKE_CXX_FLAGS=${EP_CXX_FLAGS} -Wno-error=deprecated-declarations") endif() # provide hint for AWS SDK to link with the already located openssl @@ -4651,7 +4659,7 @@ macro(build_awssdk) # AWS-C-COMMON -> AWS_C_COMMON string(REPLACE "-" "_" _AWSSDK_LIB_NAME_PREFIX 
${_AWSSDK_LIB_UPPER}) set(_AWSSDK_STATIC_LIBRARY - "${AWSSDK_PREFIX}/${AWSSDK_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${_AWSSDK_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}" + "${AWSSDK_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}${_AWSSDK_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}" ) if(${_AWSSDK_LIB} MATCHES "^aws-cpp-sdk-") set(_AWSSDK_TARGET_NAME ${_AWSSDK_LIB}) @@ -4668,7 +4676,7 @@ macro(build_awssdk) endforeach() externalproject_add(aws_c_common_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} URL ${AWS_C_COMMON_SOURCE_URL} URL_HASH "SHA256=${ARROW_AWS_C_COMMON_BUILD_SHA256_CHECKSUM}" CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS} @@ -4676,7 +4684,7 @@ macro(build_awssdk) add_dependencies(AWS::aws-c-common aws_c_common_ep) externalproject_add(aws_checksums_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} URL ${AWS_CHECKSUMS_SOURCE_URL} URL_HASH "SHA256=${ARROW_AWS_CHECKSUMS_BUILD_SHA256_CHECKSUM}" CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS} @@ -4685,7 +4693,7 @@ macro(build_awssdk) add_dependencies(AWS::aws-checksums aws_checksums_ep) externalproject_add(aws_c_event_stream_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} URL ${AWS_C_EVENT_STREAM_SOURCE_URL} URL_HASH "SHA256=${ARROW_AWS_C_EVENT_STREAM_BUILD_SHA256_CHECKSUM}" CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS} @@ -4702,7 +4710,7 @@ macro(build_awssdk) endif() externalproject_add(awssdk_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} URL ${AWSSDK_SOURCE_URL} URL_HASH "SHA256=${ARROW_AWSSDK_BUILD_SHA256_CHECKSUM}" CMAKE_ARGS ${AWSSDK_CMAKE_ARGS} @@ -4752,52 +4760,20 @@ macro(build_awssdk) endmacro() if(ARROW_S3) - # See https://aws.amazon.com/blogs/developer/developer-experience-of-the-aws-sdk-for-c-now-simplified-by-cmake/ + resolve_dependency(AWSSDK HAVE_ALT TRUE) - # Workaround to force AWS CMake configuration to look for shared libraries - if(DEFINED ENV{CONDA_PREFIX}) - if(DEFINED BUILD_SHARED_LIBS) - set(BUILD_SHARED_LIBS_WAS_SET TRUE) - set(BUILD_SHARED_LIBS_VALUE ${BUILD_SHARED_LIBS}) - else() - set(BUILD_SHARED_LIBS_WAS_SET FALSE) - endif() - set(BUILD_SHARED_LIBS "ON") - endif() + message(STATUS "Found AWS SDK headers: ${AWSSDK_INCLUDE_DIR}") + message(STATUS "Found AWS SDK libraries: ${AWSSDK_LINK_LIBRARIES}") - # Need to customize the find_package() call, so cannot call resolve_dependency() - if(AWSSDK_SOURCE STREQUAL "AUTO") - find_package(AWSSDK - COMPONENTS config - s3 - transfer - identity-management - sts) - if(NOT AWSSDK_FOUND) - build_awssdk() - endif() - elseif(AWSSDK_SOURCE STREQUAL "BUNDLED") - build_awssdk() - elseif(AWSSDK_SOURCE STREQUAL "SYSTEM") - find_package(AWSSDK REQUIRED - COMPONENTS config - s3 - transfer - identity-management - sts) + if(${AWSSDK_SOURCE} STREQUAL "SYSTEM") + foreach(AWSSDK_LINK_LIBRARY ${AWSSDK_LINK_LIBRARIES}) + string(APPEND ARROW_PC_LIBS_PRIVATE " $") + endforeach() endif() - - # Restore previous value of BUILD_SHARED_LIBS - if(DEFINED ENV{CONDA_PREFIX}) - if(BUILD_SHARED_LIBS_WAS_SET) - set(BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS_VALUE}) - else() - unset(BUILD_SHARED_LIBS) - endif() + if(UNIX) + string(APPEND ARROW_PC_REQUIRES_PRIVATE " libcurl") endif() - - message(STATUS "Found AWS SDK headers: ${AWSSDK_INCLUDE_DIR}") - message(STATUS "Found AWS SDK libraries: ${AWSSDK_LINK_LIBRARIES}") + string(APPEND ARROW_PC_REQUIRES_PRIVATE " openssl") if(APPLE) # CoreFoundation's path is hardcoded in the CMake files provided by @@ -4852,7 +4828,7 @@ macro(build_ucx) endif() set(UCX_BUILD_COMMAND ${MAKE} ${MAKE_BUILD_ARGS}) externalproject_add(ucx_ep - ${EP_LOG_OPTIONS} + ${EP_COMMON_OPTIONS} URL ${ARROW_UCX_SOURCE_URL} URL_HASH 
"SHA256=${ARROW_UCX_BUILD_SHA256_CHECKSUM}" CONFIGURE_COMMAND ${UCX_CONFIGURE_COMMAND} diff --git a/cpp/cmake_modules/UseCython.cmake b/cpp/cmake_modules/UseCython.cmake index f2025efb4c9..e15ac59490c 100644 --- a/cpp/cmake_modules/UseCython.cmake +++ b/cpp/cmake_modules/UseCython.cmake @@ -118,7 +118,7 @@ function(compile_pyx get_source_file_property(property_is_public ${pyx_file} CYTHON_PUBLIC) get_source_file_property(property_is_api ${pyx_file} CYTHON_API) if(${property_is_api}) - set(_generated_files "${output_file}" "${_name}.h" "${name}_api.h") + set(_generated_files "${output_file}" "${_name}.h" "${_name}_api.h") elseif(${property_is_public}) set(_generated_files "${output_file}" "${_name}.h") else() diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt index 88b760e3978..aa33c18e76e 100644 --- a/cpp/examples/arrow/CMakeLists.txt +++ b/cpp/examples/arrow/CMakeLists.txt @@ -36,10 +36,14 @@ if(ARROW_FLIGHT) # we'll violate ODR for gRPC symbols if(ARROW_GRPC_USE_SHARED) set(FLIGHT_EXAMPLES_LINK_LIBS arrow_flight_shared) - # We don't directly use symbols from the reflection library, so - # ensure the linker still links to it - set(GRPC_REFLECTION_LINK_LIBS -Wl,--no-as-needed gRPC::grpc++_reflection - -Wl,--as-needed) + if(APPLE) + set(GRPC_REFLECTION_LINK_LIBS gRPC::grpc++_reflection) + else() + # We don't directly use symbols from the reflection library, so + # ensure the linker still links to it + set(GRPC_REFLECTION_LINK_LIBS -Wl,--no-as-needed gRPC::grpc++_reflection + -Wl,--as-needed) + endif() elseif(NOT ARROW_BUILD_STATIC) message(FATAL_ERROR "Statically built gRPC requires ARROW_BUILD_STATIC=ON") else() @@ -114,6 +118,14 @@ if(ARROW_FLIGHT) endif() endif() +if(ARROW_PARQUET) + if(ARROW_BUILD_SHARED) + add_arrow_example(parquet_read_write EXTRA_LINK_LIBS parquet_shared) + else() + add_arrow_example(parquet_read_write EXTRA_LINK_LIBS parquet_static) + endif() +endif() + if(ARROW_PARQUET AND ARROW_DATASET) if(ARROW_BUILD_SHARED) set(DATASET_EXAMPLES_LINK_LIBS arrow_dataset_shared) @@ -133,8 +145,10 @@ if(ARROW_PARQUET AND ARROW_DATASET) ${DATASET_EXAMPLES_LINK_LIBS}) add_dependencies(execution-plan-documentation-examples parquet) - add_arrow_example(join_example EXTRA_LINK_LIBS ${DATASET_EXAMPLES_LINK_LIBS}) - add_dependencies(join-example parquet) + if(ARROW_CSV) + add_arrow_example(join_example EXTRA_LINK_LIBS ${DATASET_EXAMPLES_LINK_LIBS}) + add_dependencies(join-example parquet) + endif() add_arrow_example(udf_example) @@ -151,3 +165,7 @@ if(ARROW_PARQUET AND ARROW_DATASET) endif() endif() + +if(ARROW_GANDIVA) + add_arrow_example(gandiva_example EXTRA_LINK_LIBS gandiva_shared) +endif() diff --git a/cpp/examples/arrow/compute_register_example.cc b/cpp/examples/arrow/compute_register_example.cc index 2a76e8595b6..1b96dd42220 100644 --- a/cpp/examples/arrow/compute_register_example.cc +++ b/cpp/examples/arrow/compute_register_example.cc @@ -57,7 +57,7 @@ class ExampleFunctionOptions : public cp::FunctionOptions { std::unique_ptr ExampleFunctionOptionsType::Copy( const cp::FunctionOptions&) const { - return std::unique_ptr(new ExampleFunctionOptions()); + return std::make_unique(); } arrow::Status ExampleFunctionImpl(cp::KernelContext* ctx, const cp::ExecSpan& batch, @@ -149,7 +149,7 @@ arrow::Status RunComputeRegister(int argc, char** argv) { ARROW_RETURN_NOT_OK(maybe_plan.status()); ARROW_ASSIGN_OR_RAISE(auto plan, maybe_plan); - arrow::AsyncGenerator> source_gen, sink_gen; + arrow::AsyncGenerator> source_gen, sink_gen; ARROW_RETURN_NOT_OK( 
cp::Declaration::Sequence( { diff --git a/cpp/examples/arrow/engine_substrait_consumption.cc b/cpp/examples/arrow/engine_substrait_consumption.cc index 9d1fb99dcb5..aef189952c6 100644 --- a/cpp/examples/arrow/engine_substrait_consumption.cc +++ b/cpp/examples/arrow/engine_substrait_consumption.cc @@ -32,7 +32,8 @@ class IgnoringConsumer : public cp::SinkNodeConsumer { explicit IgnoringConsumer(size_t tag) : tag_{tag} {} arrow::Status Init(const std::shared_ptr& schema, - cp::BackpressureControl* backpressure_control) override { + cp::BackpressureControl* backpressure_control, + cp::ExecPlan* plan) override { return arrow::Status::OK(); } diff --git a/cpp/examples/arrow/execution_plan_documentation_examples.cc b/cpp/examples/arrow/execution_plan_documentation_examples.cc index b7c690bb278..a72db97930c 100644 --- a/cpp/examples/arrow/execution_plan_documentation_examples.cc +++ b/cpp/examples/arrow/execution_plan_documentation_examples.cc @@ -157,11 +157,11 @@ struct BatchesWithSchema { std::shared_ptr schema; // This method uses internal arrow utilities to // convert a vector of record batches to an AsyncGenerator of optional batches - arrow::AsyncGenerator> gen() const { + arrow::AsyncGenerator> gen() const { auto opt_batches = ::arrow::internal::MapVector( - [](cp::ExecBatch batch) { return arrow::util::make_optional(std::move(batch)); }, + [](cp::ExecBatch batch) { return std::make_optional(std::move(batch)); }, batches); - arrow::AsyncGenerator> gen; + arrow::AsyncGenerator> gen; gen = arrow::MakeVectorGenerator(std::move(opt_batches)); return gen; } @@ -256,182 +256,122 @@ arrow::Result MakeGroupableBatches(int multiplicity = 1) { return out; } -arrow::Status ExecutePlanAndCollectAsTable( - cp::ExecContext& exec_context, std::shared_ptr plan, - std::shared_ptr schema, - arrow::AsyncGenerator> sink_gen) { - // translate sink_gen (async) to sink_reader (sync) - std::shared_ptr sink_reader = - cp::MakeGeneratorReader(schema, std::move(sink_gen), exec_context.memory_pool()); - - // validate the ExecPlan - ARROW_RETURN_NOT_OK(plan->Validate()); - std::cout << "ExecPlan created : " << plan->ToString() << std::endl; - // start the ExecPlan - ARROW_RETURN_NOT_OK(plan->StartProducing()); - +arrow::Status ExecutePlanAndCollectAsTable(cp::Declaration plan) { // collect sink_reader into a Table std::shared_ptr response_table; - - ARROW_ASSIGN_OR_RAISE(response_table, - arrow::Table::FromRecordBatchReader(sink_reader.get())); + ARROW_ASSIGN_OR_RAISE(response_table, cp::DeclarationToTable(std::move(plan))); std::cout << "Results : " << response_table->ToString() << std::endl; - // stop producing - plan->StopProducing(); - // plan mark finished - auto future = plan->finished(); - return future.status(); + return arrow::Status::OK(); } // (Doc section: Scan Example) /// \brief An example demonstrating a scan and sink node -/// \param exec_context The execution context to run the plan in /// -/// Scan-Sink +/// Scan-Table /// This example shows how scan operation can be applied on a dataset. /// There are operations that can be applied on the scan (project, filter) /// and the input data can be processed. The output is obtained as a table -/// via the sink node. 
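The new ExecutePlanAndCollectAsTable helper above reduces plan execution to a single cp::DeclarationToTable call; the converted ScanSinkExample that follows applies that pattern to a dataset scan. For reference, a minimal self-contained sketch of the same pattern, assuming the pre-Acero header layout and a caller-supplied table (these names are illustrative, not taken from this patch):

#include <iostream>
#include <memory>
#include <utility>

#include <arrow/api.h>
#include <arrow/compute/api.h>
#include <arrow/compute/exec/exec_plan.h>  // assumed pre-Acero header layout
#include <arrow/compute/exec/options.h>

namespace cp = arrow::compute;

// Feed an in-memory table into a plan and collect the output back into a table.
arrow::Status RunTrivialDeclaration(std::shared_ptr<arrow::Table> table) {
  // "table_source" re-emits the table in batches of at most 1024 rows
  // (the batch size here is arbitrary).
  cp::Declaration source{
      "table_source",
      cp::TableSourceNodeOptions{std::move(table), /*max_batch_size=*/1024}};
  // DeclarationToTable adds the implicit sink, runs the plan to completion,
  // and gathers all output batches into a single arrow::Table.
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Table> result,
                        cp::DeclarationToTable(std::move(source)));
  std::cout << "Results : " << result->ToString() << std::endl;
  return arrow::Status::OK();
}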
-arrow::Status ScanSinkExample(cp::ExecContext& exec_context) { - // Execution plan created - ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, - cp::ExecPlan::Make(&exec_context)); - +arrow::Status ScanSinkExample() { ARROW_ASSIGN_OR_RAISE(std::shared_ptr dataset, GetDataset()); auto options = std::make_shared(); options->projection = cp::project({}, {}); // create empty projection // construct the scan node - cp::ExecNode* scan; auto scan_node_options = arrow::dataset::ScanNodeOptions{dataset, options}; - ARROW_ASSIGN_OR_RAISE(scan, - cp::MakeExecNode("scan", plan.get(), {}, scan_node_options)); + cp::Declaration scan{"scan", std::move(scan_node_options)}; - arrow::AsyncGenerator> sink_gen; - - ARROW_RETURN_NOT_OK( - cp::MakeExecNode("sink", plan.get(), {scan}, cp::SinkNodeOptions{&sink_gen})); - - return ExecutePlanAndCollectAsTable(exec_context, plan, dataset->schema(), sink_gen); + return ExecutePlanAndCollectAsTable(std::move(scan)); } // (Doc section: Scan Example) // (Doc section: Source Example) /// \brief An example demonstrating a source and sink node -/// \param exec_context The execution context to run the plan in /// -/// Source-Sink Example -/// This example shows how a source and sink can be used -/// in an execution plan. This includes source node receiving data -/// and the sink node emits the data as an output represented in -/// a table. -arrow::Status SourceSinkExample(cp::ExecContext& exec_context) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, - cp::ExecPlan::Make(&exec_context)); - +/// Source-Table Example +/// This example shows how a custom source can be used +/// in an execution plan. This includes source node using pregenerated +/// data and collecting it into a table. +/// +/// This sort of custom souce is often not needed. In most cases you can +/// use a scan (for a dataset source) or a source like table_source, array_vector_source, +/// exec_batch_source, or record_batch_source (for in-memory data) +arrow::Status SourceSinkExample() { ARROW_ASSIGN_OR_RAISE(auto basic_data, MakeBasicBatches()); - arrow::AsyncGenerator> sink_gen; - auto source_node_options = cp::SourceNodeOptions{basic_data.schema, basic_data.gen()}; - ARROW_ASSIGN_OR_RAISE(cp::ExecNode * source, - cp::MakeExecNode("source", plan.get(), {}, source_node_options)); - - ARROW_RETURN_NOT_OK( - cp::MakeExecNode("sink", plan.get(), {source}, cp::SinkNodeOptions{&sink_gen})); + cp::Declaration source{"source", std::move(source_node_options)}; - return ExecutePlanAndCollectAsTable(exec_context, plan, basic_data.schema, sink_gen); + return ExecutePlanAndCollectAsTable(std::move(source)); } // (Doc section: Source Example) // (Doc section: Table Source Example) /// \brief An example showing a table source node -/// \param exec_context The execution context to run the plan in /// -/// TableSource-Sink Example -/// This example shows how a table_source and sink can be used +/// TableSource-Table Example +/// This example shows how a table_source can be used /// in an execution plan. This includes a table source node -/// receiving data from a table and the sink node emits -/// the data to a generator which we collect into a table. -arrow::Status TableSourceSinkExample(cp::ExecContext& exec_context) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, - cp::ExecPlan::Make(&exec_context)); - +/// receiving data from a table. 
This plan simply collects the +/// data back into a table but nodes could be added that modify +/// or transform the data as well (as is shown in later examples) +arrow::Status TableSourceSinkExample() { ARROW_ASSIGN_OR_RAISE(auto table, GetTable()); - arrow::AsyncGenerator> sink_gen; + arrow::AsyncGenerator> sink_gen; int max_batch_size = 2; auto table_source_options = cp::TableSourceNodeOptions{table, max_batch_size}; - ARROW_ASSIGN_OR_RAISE( - cp::ExecNode * source, - cp::MakeExecNode("table_source", plan.get(), {}, table_source_options)); - - ARROW_RETURN_NOT_OK( - cp::MakeExecNode("sink", plan.get(), {source}, cp::SinkNodeOptions{&sink_gen})); + cp::Declaration source{"table_source", std::move(table_source_options)}; - return ExecutePlanAndCollectAsTable(exec_context, plan, table->schema(), sink_gen); + return ExecutePlanAndCollectAsTable(std::move(source)); } // (Doc section: Table Source Example) // (Doc section: Filter Example) /// \brief An example showing a filter node -/// \param exec_context The execution context to run the plan in /// -/// Source-Filter-Sink +/// Source-Filter-Table /// This example shows how a filter can be used in an execution plan, -/// along with the source and sink operations. The output from the -/// exeuction plan is obtained as a table via the sink node. -arrow::Status ScanFilterSinkExample(cp::ExecContext& exec_context) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, - cp::ExecPlan::Make(&exec_context)); - +/// to filter data from a source. The output from the exeuction plan +/// is collected into a table. +arrow::Status ScanFilterSinkExample() { ARROW_ASSIGN_OR_RAISE(std::shared_ptr dataset, GetDataset()); auto options = std::make_shared(); // specify the filter. This filter removes all rows where the // value of the "a" column is greater than 3. - cp::Expression filter_opt = cp::greater(cp::field_ref("a"), cp::literal(3)); + cp::Expression filter_expr = cp::greater(cp::field_ref("a"), cp::literal(3)); // set filter for scanner : on-disk / push-down filtering. // This step can be skipped if you are not reading from disk. - options->filter = filter_opt; + options->filter = filter_expr; // empty projection options->projection = cp::project({}, {}); // construct the scan node std::cout << "Initialized Scanning Options" << std::endl; - cp::ExecNode* scan; - auto scan_node_options = arrow::dataset::ScanNodeOptions{dataset, options}; std::cout << "Scan node options created" << std::endl; - ARROW_ASSIGN_OR_RAISE(scan, - cp::MakeExecNode("scan", plan.get(), {}, scan_node_options)); + cp::Declaration scan{"scan", std::move(scan_node_options)}; // pipe the scan node into the filter node // Need to set the filter in scan node options and filter node options. // At scan node it is used for on-disk / push-down filtering. // At filter node it is used for in-memory filtering. 
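As the comments above note, the predicate appears twice: once in ScanOptions for on-disk / push-down filtering and once in FilterNodeOptions for in-memory filtering; the hunk that follows converts this to Declarations. A compact sketch of the same pipeline, assuming the pre-Acero headers, a caller-supplied dataset with an integer column "a", and that arrow::dataset::internal::Initialize() registers the scan node as in the full example file:

#include <memory>
#include <utility>

#include <arrow/api.h>
#include <arrow/compute/api.h>
#include <arrow/compute/exec/exec_plan.h>  // assumed pre-Acero header layout
#include <arrow/compute/exec/options.h>
#include <arrow/dataset/api.h>
#include <arrow/dataset/plan.h>

namespace cp = arrow::compute;

// Keep rows where "a" > 3, filtering both at the scan (push-down) and in memory.
arrow::Result<std::shared_ptr<arrow::Table>> FilterDataset(
    std::shared_ptr<arrow::dataset::Dataset> dataset) {
  // Registers dataset nodes such as "scan"; typically called once at startup.
  arrow::dataset::internal::Initialize();

  cp::Expression predicate = cp::greater(cp::field_ref("a"), cp::literal(3));

  auto scan_options = std::make_shared<arrow::dataset::ScanOptions>();
  scan_options->filter = predicate;                // on-disk / push-down filtering
  scan_options->projection = cp::project({}, {});  // empty projection

  cp::Declaration scan{"scan", arrow::dataset::ScanNodeOptions{dataset, scan_options}};
  // The same predicate is applied again in memory at the filter node.
  cp::Declaration filter{"filter", {std::move(scan)}, cp::FilterNodeOptions(predicate)};

  return cp::DeclarationToTable(std::move(filter));
}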
- cp::ExecNode* filter; - ARROW_ASSIGN_OR_RAISE(filter, cp::MakeExecNode("filter", plan.get(), {scan}, - cp::FilterNodeOptions{filter_opt})); + cp::Declaration filter{ + "filter", {std::move(scan)}, cp::FilterNodeOptions(std::move(filter_expr))}; - // finally, pipe the filter node into a sink node - arrow::AsyncGenerator> sink_gen; - ARROW_RETURN_NOT_OK( - cp::MakeExecNode("sink", plan.get(), {filter}, cp::SinkNodeOptions{&sink_gen})); - - return ExecutePlanAndCollectAsTable(exec_context, plan, dataset->schema(), sink_gen); + return ExecutePlanAndCollectAsTable(std::move(filter)); } // (Doc section: Filter Example) @@ -439,16 +379,12 @@ arrow::Status ScanFilterSinkExample(cp::ExecContext& exec_context) { // (Doc section: Project Example) /// \brief An example showing a project node -/// \param exec_context The execution context to run the plan in /// -/// Scan-Project-Sink -/// This example shows how Scan operation can be used to load the data -/// into the execution plan, how project operation can be applied on the -/// data stream and how the output is obtained as a table via the sink node. -arrow::Status ScanProjectSinkExample(cp::ExecContext& exec_context) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, - cp::ExecPlan::Make(&exec_context)); - +/// Scan-Project-Table +/// This example shows how a Scan operation can be used to load the data +/// into the execution plan, how a project operation can be applied on the +/// data stream and how the output is collected into a table +arrow::Status ScanProjectSinkExample() { ARROW_ASSIGN_OR_RAISE(std::shared_ptr dataset, GetDataset()); auto options = std::make_shared(); @@ -456,26 +392,13 @@ arrow::Status ScanProjectSinkExample(cp::ExecContext& exec_context) { cp::Expression a_times_2 = cp::call("multiply", {cp::field_ref("a"), cp::literal(2)}); options->projection = cp::project({}, {}); - cp::ExecNode* scan; - auto scan_node_options = arrow::dataset::ScanNodeOptions{dataset, options}; - ARROW_ASSIGN_OR_RAISE(scan, - cp::MakeExecNode("scan", plan.get(), {}, scan_node_options)); - - cp::ExecNode* project; - ARROW_ASSIGN_OR_RAISE(project, cp::MakeExecNode("project", plan.get(), {scan}, - cp::ProjectNodeOptions{{a_times_2}})); - // schema after projection => multiply(a, 2): int64 - std::cout << "Schema after projection : \n" - << project->output_schema()->ToString() << std::endl; - - arrow::AsyncGenerator> sink_gen; - ARROW_RETURN_NOT_OK( - cp::MakeExecNode("sink", plan.get(), {project}, cp::SinkNodeOptions{&sink_gen})); - auto schema = arrow::schema({arrow::field("a * 2", arrow::int32())}); + cp::Declaration scan{"scan", std::move(scan_node_options)}; + cp::Declaration project{ + "project", {std::move(scan)}, cp::ProjectNodeOptions({a_times_2})}; - return ExecutePlanAndCollectAsTable(exec_context, plan, schema, sink_gen); + return ExecutePlanAndCollectAsTable(std::move(project)); } // (Doc section: Project Example) @@ -483,98 +406,70 @@ arrow::Status ScanProjectSinkExample(cp::ExecContext& exec_context) { // (Doc section: Scalar Aggregate Example) /// \brief An example showing an aggregation node to aggregate an entire table -/// \param exec_context The execution context to run the plan in /// -/// Source-Aggregation-Sink +/// Source-Aggregation-Table /// This example shows how an aggregation operation can be applied on a -/// execution plan resulting a scalar output. The source node loads the +/// execution plan resulting in a scalar output. 
The source node loads the /// data and the aggregation (counting unique types in column 'a') -/// is applied on this data. The output is obtained from the sink node as a table. -arrow::Status SourceScalarAggregateSinkExample(cp::ExecContext& exec_context) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, - cp::ExecPlan::Make(&exec_context)); - +/// is applied on this data. The output is collected into a table (that will +/// have exactly one row) +arrow::Status SourceScalarAggregateSinkExample() { ARROW_ASSIGN_OR_RAISE(auto basic_data, MakeBasicBatches()); - arrow::AsyncGenerator> sink_gen; - auto source_node_options = cp::SourceNodeOptions{basic_data.schema, basic_data.gen()}; - ARROW_ASSIGN_OR_RAISE(cp::ExecNode * source, - cp::MakeExecNode("source", plan.get(), {}, source_node_options)); + cp::Declaration source{"source", std::move(source_node_options)}; auto aggregate_options = cp::AggregateNodeOptions{/*aggregates=*/{{"sum", nullptr, "a", "sum(a)"}}}; - ARROW_ASSIGN_OR_RAISE( - cp::ExecNode * aggregate, - cp::MakeExecNode("aggregate", plan.get(), {source}, std::move(aggregate_options))); - - ARROW_RETURN_NOT_OK( - cp::MakeExecNode("sink", plan.get(), {aggregate}, cp::SinkNodeOptions{&sink_gen})); - auto schema = arrow::schema({arrow::field("sum(a)", arrow::int32())}); + cp::Declaration aggregate{ + "aggregate", {std::move(source)}, std::move(aggregate_options)}; - return ExecutePlanAndCollectAsTable(exec_context, plan, schema, sink_gen); + return ExecutePlanAndCollectAsTable(std::move(aggregate)); } // (Doc section: Scalar Aggregate Example) // (Doc section: Group Aggregate Example) /// \brief An example showing an aggregation node to perform a group-by operation -/// \param exec_context The execution context to run the plan in /// -/// Source-Aggregation-Sink +/// Source-Aggregation-Table /// This example shows how an aggregation operation can be applied on a -/// execution plan resulting a grouped output. The source node loads the +/// execution plan resulting in grouped output. The source node loads the /// data and the aggregation (counting unique types in column 'a') is -/// applied on this data. The output is obtained from the sink node as a table. -arrow::Status SourceGroupAggregateSinkExample(cp::ExecContext& exec_context) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, - cp::ExecPlan::Make(&exec_context)); - +/// applied on this data. The output is collected into a table that will contain +/// one row for each unique combination of group keys. 
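The converted SourceGroupAggregateSinkExample below feeds the aggregation from a generator source; the same grouped count can be written against an in-memory table, which is often the more convenient starting point. A sketch under that assumption (header paths, the input table, and the batch size are illustrative, not part of this patch):

#include <memory>
#include <utility>

#include <arrow/api.h>
#include <arrow/compute/api.h>
#include <arrow/compute/exec/exec_plan.h>  // assumed pre-Acero header layout
#include <arrow/compute/exec/options.h>

namespace cp = arrow::compute;

// Count non-null values of "a" within each group defined by column "b".
arrow::Result<std::shared_ptr<arrow::Table>> GroupedCount(
    std::shared_ptr<arrow::Table> table) {
  cp::Declaration source{
      "table_source",
      cp::TableSourceNodeOptions{std::move(table), /*max_batch_size=*/1024}};

  auto count_options = std::make_shared<cp::CountOptions>(cp::CountOptions::ONLY_VALID);
  auto aggregate_options = cp::AggregateNodeOptions{
      /*aggregates=*/{{"hash_count", count_options, "a", "count(a)"}},
      /*keys=*/{"b"}};
  cp::Declaration aggregate{"aggregate", {std::move(source)}, std::move(aggregate_options)};

  // The result has one row per distinct value of "b".
  return cp::DeclarationToTable(std::move(aggregate));
}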
+arrow::Status SourceGroupAggregateSinkExample() { ARROW_ASSIGN_OR_RAISE(auto basic_data, MakeBasicBatches()); - arrow::AsyncGenerator> sink_gen; + arrow::AsyncGenerator> sink_gen; auto source_node_options = cp::SourceNodeOptions{basic_data.schema, basic_data.gen()}; - ARROW_ASSIGN_OR_RAISE(cp::ExecNode * source, - cp::MakeExecNode("source", plan.get(), {}, source_node_options)); + cp::Declaration source{"source", std::move(source_node_options)}; auto options = std::make_shared(cp::CountOptions::ONLY_VALID); auto aggregate_options = cp::AggregateNodeOptions{/*aggregates=*/{{"hash_count", options, "a", "count(a)"}}, /*keys=*/{"b"}}; - ARROW_ASSIGN_OR_RAISE( - cp::ExecNode * aggregate, - cp::MakeExecNode("aggregate", plan.get(), {source}, aggregate_options)); - - ARROW_RETURN_NOT_OK( - cp::MakeExecNode("sink", plan.get(), {aggregate}, cp::SinkNodeOptions{&sink_gen})); - auto schema = arrow::schema({ - arrow::field("count(a)", arrow::int32()), - arrow::field("b", arrow::boolean()), - }); + cp::Declaration aggregate{ + "aggregate", {std::move(source)}, std::move(aggregate_options)}; - return ExecutePlanAndCollectAsTable(exec_context, plan, schema, sink_gen); + return ExecutePlanAndCollectAsTable(std::move(aggregate)); } // (Doc section: Group Aggregate Example) // (Doc section: ConsumingSink Example) /// \brief An example showing a consuming sink node -/// \param exec_context The execution context to run the plan in /// /// Source-Consuming-Sink /// This example shows how the data can be consumed within the execution plan /// by using a ConsumingSink node. There is no data output from this execution plan. -arrow::Status SourceConsumingSinkExample(cp::ExecContext& exec_context) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, - cp::ExecPlan::Make(&exec_context)); - +arrow::Status SourceConsumingSinkExample() { ARROW_ASSIGN_OR_RAISE(auto basic_data, MakeBasicBatches()); auto source_node_options = cp::SourceNodeOptions{basic_data.schema, basic_data.gen()}; - ARROW_ASSIGN_OR_RAISE(cp::ExecNode * source, - cp::MakeExecNode("source", plan.get(), {}, source_node_options)); + cp::Declaration source{"source", std::move(source_node_options)}; std::atomic batches_seen{0}; arrow::Future<> finish = arrow::Future<>::Make(); @@ -583,7 +478,12 @@ arrow::Status SourceConsumingSinkExample(cp::ExecContext& exec_context) { : batches_seen(batches_seen), finish(std::move(finish)) {} arrow::Status Init(const std::shared_ptr& schema, - cp::BackpressureControl* backpressure_control) override { + cp::BackpressureControl* backpressure_control, + cp::ExecPlan* plan) override { + // This will be called as the plan is started (before the first call to Consume) + // and provides the schema of the data coming into the node, controls for pausing / + // resuming input, and a pointer to the plan itself which can be used to access + // other utilities such as the thread indexer or async task scheduler. return arrow::Status::OK(); } @@ -592,7 +492,11 @@ arrow::Status SourceConsumingSinkExample(cp::ExecContext& exec_context) { return arrow::Status::OK(); } - arrow::Future<> Finish() override { return finish; } + arrow::Future<> Finish() override { + // Here you can perform whatever (possibly async) cleanup is needed, e.g. 
closing + // output file handles and flushing remaining work + return arrow::Future<>::MakeFinished(); + } std::atomic* batches_seen; arrow::Future<> finish; @@ -600,47 +504,64 @@ arrow::Status SourceConsumingSinkExample(cp::ExecContext& exec_context) { std::shared_ptr consumer = std::make_shared(&batches_seen, finish); - cp::ExecNode* consuming_sink; - - ARROW_ASSIGN_OR_RAISE(consuming_sink, - MakeExecNode("consuming_sink", plan.get(), {source}, - cp::ConsumingSinkNodeOptions(consumer))); + cp::Declaration consuming_sink{"consuming_sink", + {std::move(source)}, + cp::ConsumingSinkNodeOptions(std::move(consumer))}; - ARROW_RETURN_NOT_OK(consuming_sink->Validate()); + // Since we are consuming the data within the plan there is no output and we simply + // run the plan to completion instead of collecting into a table. + ARROW_RETURN_NOT_OK(cp::DeclarationToStatus(std::move(consuming_sink))); - ARROW_RETURN_NOT_OK(plan->Validate()); - std::cout << "Exec Plan created: " << plan->ToString() << std::endl; - // plan start producing - ARROW_RETURN_NOT_OK(plan->StartProducing()); - // Source should finish fairly quickly - ARROW_RETURN_NOT_OK(source->finished().status()); - std::cout << "Source Finished!" << std::endl; - // Mark consumption complete, plan should finish - finish.MarkFinished(arrow::Status::OK()); - ARROW_RETURN_NOT_OK(plan->finished().status()); + std::cout << "The consuming sink node saw " << batches_seen.load() << " batches" + << std::endl; return arrow::Status::OK(); } // (Doc section: ConsumingSink Example) // (Doc section: OrderBySink Example) +arrow::Status ExecutePlanAndCollectAsTableWithCustomSink( + std::shared_ptr plan, std::shared_ptr schema, + arrow::AsyncGenerator> sink_gen) { + // translate sink_gen (async) to sink_reader (sync) + std::shared_ptr sink_reader = + cp::MakeGeneratorReader(schema, std::move(sink_gen), arrow::default_memory_pool()); + + // validate the ExecPlan + ARROW_RETURN_NOT_OK(plan->Validate()); + std::cout << "ExecPlan created : " << plan->ToString() << std::endl; + // start the ExecPlan + ARROW_RETURN_NOT_OK(plan->StartProducing()); + + // collect sink_reader into a Table + std::shared_ptr response_table; + + ARROW_ASSIGN_OR_RAISE(response_table, + arrow::Table::FromRecordBatchReader(sink_reader.get())); + + std::cout << "Results : " << response_table->ToString() << std::endl; + + // stop producing + plan->StopProducing(); + // plan mark finished + auto future = plan->finished(); + return future.status(); +} + /// \brief An example showing an order-by node -/// \param exec_context The execution context to run the plan in /// /// Source-OrderBy-Sink /// In this example, the data enters through the source node /// and the data is ordered in the sink node. The order can be /// ASCENDING or DESCENDING and it is configurable. The output /// is obtained as a table from the sink node. 
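Because order_by_sink is a sink-style node, it still needs the explicit plan and sink generator that ExecutePlanAndCollectAsTableWithCustomSink above wraps; the converted example follows in the next hunk. A condensed sketch of the whole pattern, assuming the pre-Acero headers and a caller-supplied table with a sortable column "a":

#include <memory>
#include <optional>
#include <utility>

#include <arrow/api.h>
#include <arrow/compute/api.h>
#include <arrow/compute/exec/exec_plan.h>  // assumed pre-Acero header layout
#include <arrow/compute/exec/options.h>
#include <arrow/util/async_generator.h>

namespace cp = arrow::compute;

// Sort a table by "a" descending using the order_by_sink node.
arrow::Result<std::shared_ptr<arrow::Table>> SortTable(
    std::shared_ptr<arrow::Table> table) {
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<cp::ExecPlan> plan,
                        cp::ExecPlan::Make(*cp::threaded_exec_context()));
  arrow::AsyncGenerator<std::optional<cp::ExecBatch>> sink_gen;

  ARROW_ASSIGN_OR_RAISE(cp::ExecNode * source,
                        cp::MakeExecNode("table_source", plan.get(), {},
                                         cp::TableSourceNodeOptions{table,
                                                                    /*max_batch_size=*/2}));
  // The sort key and order are configurable; Ascending works the same way.
  ARROW_RETURN_NOT_OK(cp::MakeExecNode(
      "order_by_sink", plan.get(), {source},
      cp::OrderBySinkNodeOptions{
          cp::SortOptions{{cp::SortKey{"a", cp::SortOrder::Descending}}}, &sink_gen}));

  // Translate the async sink generator into a synchronous reader and drain it.
  std::shared_ptr<arrow::RecordBatchReader> sink_reader = cp::MakeGeneratorReader(
      table->schema(), std::move(sink_gen), arrow::default_memory_pool());
  ARROW_RETURN_NOT_OK(plan->Validate());
  ARROW_RETURN_NOT_OK(plan->StartProducing());
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Table> sorted,
                        arrow::Table::FromRecordBatchReader(sink_reader.get()));
  plan->StopProducing();
  ARROW_RETURN_NOT_OK(plan->finished().status());
  return sorted;
}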
-arrow::Status SourceOrderBySinkExample(cp::ExecContext& exec_context) { +arrow::Status SourceOrderBySinkExample() { ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, - cp::ExecPlan::Make(&exec_context)); + cp::ExecPlan::Make(*cp::threaded_exec_context())); ARROW_ASSIGN_OR_RAISE(auto basic_data, MakeSortTestBasicBatches()); - std::cout << "basic data created" << std::endl; - - arrow::AsyncGenerator> sink_gen; + arrow::AsyncGenerator> sink_gen; auto source_node_options = cp::SourceNodeOptions{basic_data.schema, basic_data.gen()}; ARROW_ASSIGN_OR_RAISE(cp::ExecNode * source, @@ -651,7 +572,7 @@ arrow::Status SourceOrderBySinkExample(cp::ExecContext& exec_context) { cp::OrderBySinkNodeOptions{ cp::SortOptions{{cp::SortKey{"a", cp::SortOrder::Descending}}}, &sink_gen})); - return ExecutePlanAndCollectAsTable(exec_context, plan, basic_data.schema, sink_gen); + return ExecutePlanAndCollectAsTableWithCustomSink(plan, basic_data.schema, sink_gen); } // (Doc section: OrderBySink Example) @@ -659,44 +580,26 @@ arrow::Status SourceOrderBySinkExample(cp::ExecContext& exec_context) { // (Doc section: HashJoin Example) /// \brief An example showing a hash join node -/// \param exec_context The execution context to run the plan in /// -/// Source-HashJoin-Sink +/// Source-HashJoin-Table /// This example shows how source node gets the data and how a self-join /// is applied on the data. The join options are configurable. The output -/// is obtained as a table via the sink node. -arrow::Status SourceHashJoinSinkExample(cp::ExecContext& exec_context) { +/// is collected into a table. +arrow::Status SourceHashJoinSinkExample() { ARROW_ASSIGN_OR_RAISE(auto input, MakeGroupableBatches()); - ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, - cp::ExecPlan::Make(&exec_context)); - - arrow::AsyncGenerator> sink_gen; - cp::ExecNode* left_source; - cp::ExecNode* right_source; - for (auto source : {&left_source, &right_source}) { - ARROW_ASSIGN_OR_RAISE(*source, - MakeExecNode("source", plan.get(), {}, - cp::SourceNodeOptions{input.schema, input.gen()})); - } + cp::Declaration left{"source", cp::SourceNodeOptions{input.schema, input.gen()}}; + cp::Declaration right{"source", cp::SourceNodeOptions{input.schema, input.gen()}}; cp::HashJoinNodeOptions join_opts{ cp::JoinType::INNER, /*left_keys=*/{"str"}, /*right_keys=*/{"str"}, cp::literal(true), "l_", "r_"}; - ARROW_ASSIGN_OR_RAISE( - auto hashjoin, - cp::MakeExecNode("hashjoin", plan.get(), {left_source, right_source}, join_opts)); - - ARROW_RETURN_NOT_OK( - cp::MakeExecNode("sink", plan.get(), {hashjoin}, cp::SinkNodeOptions{&sink_gen})); - // expected columns i32, str, l_str, r_str - auto schema = arrow::schema( - {arrow::field("i32", arrow::int32()), arrow::field("str", arrow::utf8()), - arrow::field("l_str", arrow::utf8()), arrow::field("r_str", arrow::utf8())}); + cp::Declaration hashjoin{ + "hashjoin", {std::move(left), std::move(right)}, std::move(join_opts)}; - return ExecutePlanAndCollectAsTable(exec_context, plan, schema, sink_gen); + return ExecutePlanAndCollectAsTable(std::move(hashjoin)); } // (Doc section: HashJoin Example) @@ -704,17 +607,16 @@ arrow::Status SourceHashJoinSinkExample(cp::ExecContext& exec_context) { // (Doc section: KSelect Example) /// \brief An example showing a select-k node -/// \param exec_context The execution context to run the plan in /// /// Source-KSelect /// This example shows how K number of elements can be selected /// either from the top or bottom. The output node is a modified /// sink node where output can be obtained as a table. 
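// Illustrative sketch (not part of this patch): the Declaration style used by
// the examples above can also be composed directly against an in-memory table
// and collected with cp::DeclarationToTable(), with no sink node or helper at
// all. This hypothetical function reuses the GetTable() helper defined earlier
// in this file and assumes its table has the int32 column "a" used elsewhere
// in these examples.
arrow::Status DeclarationToTableSketch() {
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Table> table, GetTable());

  cp::Declaration source{"table_source", cp::TableSourceNodeOptions{table}};
  cp::Declaration filter{
      "filter",
      {std::move(source)},
      cp::FilterNodeOptions{cp::greater(cp::field_ref("a"), cp::literal(3))}};

  // DeclarationToTable implicitly adds a sink and runs the plan to completion.
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Table> result,
                        cp::DeclarationToTable(std::move(filter)));
  std::cout << "Filtered table: " << result->ToString() << std::endl;
  return arrow::Status::OK();
}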
-arrow::Status SourceKSelectExample(cp::ExecContext& exec_context) { +arrow::Status SourceKSelectExample() { ARROW_ASSIGN_OR_RAISE(auto input, MakeGroupableBatches()); ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, - cp::ExecPlan::Make(&exec_context)); - arrow::AsyncGenerator> sink_gen; + cp::ExecPlan::Make(*cp::threaded_exec_context())); + arrow::AsyncGenerator> sink_gen; ARROW_ASSIGN_OR_RAISE( cp::ExecNode * source, @@ -729,7 +631,7 @@ arrow::Status SourceKSelectExample(cp::ExecContext& exec_context) { auto schema = arrow::schema( {arrow::field("i32", arrow::int32()), arrow::field("str", arrow::utf8())}); - return ExecutePlanAndCollectAsTable(exec_context, plan, schema, sink_gen); + return ExecutePlanAndCollectAsTableWithCustomSink(plan, schema, sink_gen); } // (Doc section: KSelect Example) @@ -737,31 +639,23 @@ arrow::Status SourceKSelectExample(cp::ExecContext& exec_context) { // (Doc section: Write Example) /// \brief An example showing a write node -/// \param exec_context The execution context to run the plan in /// \param file_path The destination to write to /// /// Scan-Filter-Write /// This example shows how scan node can be used to load the data /// and after processing how it can be written to disk. -arrow::Status ScanFilterWriteExample(cp::ExecContext& exec_context, - const std::string& file_path) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, - cp::ExecPlan::Make(&exec_context)); - +arrow::Status ScanFilterWriteExample(const std::string& file_path) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr dataset, GetDataset()); auto options = std::make_shared(); // empty projection options->projection = cp::project({}, {}); - cp::ExecNode* scan; - auto scan_node_options = arrow::dataset::ScanNodeOptions{dataset, options}; - ARROW_ASSIGN_OR_RAISE(scan, - cp::MakeExecNode("scan", plan.get(), {}, scan_node_options)); + cp::Declaration scan{"scan", std::move(scan_node_options)}; - arrow::AsyncGenerator> sink_gen; + arrow::AsyncGenerator> sink_gen; std::string root_path = ""; std::string uri = "file://" + file_path; @@ -792,15 +686,13 @@ arrow::Status ScanFilterWriteExample(cp::ExecContext& exec_context, arrow::dataset::WriteNodeOptions write_node_options{write_options}; - ARROW_RETURN_NOT_OK(cp::MakeExecNode("write", plan.get(), {scan}, write_node_options)); + cp::Declaration write{"write", {std::move(scan)}, std::move(write_node_options)}; - ARROW_RETURN_NOT_OK(plan->Validate()); - std::cout << "Execution Plan Created : " << plan->ToString() << std::endl; - // // // start the ExecPlan - ARROW_RETURN_NOT_OK(plan->StartProducing()); - auto future = plan->finished(); - ARROW_RETURN_NOT_OK(future.status()); - future.Wait(); + // Since the write node has no output we simply run the plan to completion and the + // data should be written + ARROW_RETURN_NOT_OK(cp::DeclarationToStatus(std::move(write))); + + std::cout << "Dataset written to " << base_path << std::endl; return arrow::Status::OK(); } @@ -809,41 +701,23 @@ arrow::Status ScanFilterWriteExample(cp::ExecContext& exec_context, // (Doc section: Union Example) /// \brief An example showing a union node -/// \param exec_context The execution context to run the plan in /// -/// Source-Union-Sink +/// Source-Union-Table /// This example shows how a union operation can be applied on two -/// data sources. The output is obtained as a table via the sink -/// node. -arrow::Status SourceUnionSinkExample(cp::ExecContext& exec_context) { +/// data sources. The output is collected into a table. 
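// Illustrative sketch (not part of this patch): strictly linear pipelines such
// as the scan -> write plan above can also be spelled with
// cp::Declaration::Sequence(), where each declaration in the vector feeds the
// one that follows it. This hypothetical helper assumes the same dataset, scan
// options and write options prepared in ScanFilterWriteExample.
arrow::Status ScanWriteAsSequenceSketch(
    std::shared_ptr<arrow::dataset::Dataset> dataset,
    std::shared_ptr<arrow::dataset::ScanOptions> options,
    arrow::dataset::FileSystemDatasetWriteOptions write_options) {
  cp::Declaration plan = cp::Declaration::Sequence(
      {{"scan", arrow::dataset::ScanNodeOptions{dataset, options}},
       {"write", arrow::dataset::WriteNodeOptions{write_options}}});
  // The write node produces no output, so simply run the plan to completion.
  return cp::DeclarationToStatus(std::move(plan));
}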
+arrow::Status SourceUnionSinkExample() { ARROW_ASSIGN_OR_RAISE(auto basic_data, MakeBasicBatches()); - ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, - cp::ExecPlan::Make(&exec_context)); - arrow::AsyncGenerator> sink_gen; - - cp::Declaration union_node{"union", cp::ExecNodeOptions{}}; cp::Declaration lhs{"source", cp::SourceNodeOptions{basic_data.schema, basic_data.gen()}}; lhs.label = "lhs"; cp::Declaration rhs{"source", cp::SourceNodeOptions{basic_data.schema, basic_data.gen()}}; rhs.label = "rhs"; - union_node.inputs.emplace_back(lhs); - union_node.inputs.emplace_back(rhs); - - cp::CountOptions options(cp::CountOptions::ONLY_VALID); - ARROW_ASSIGN_OR_RAISE( - auto declr, cp::Declaration::Sequence({ - union_node, - {"sink", cp::SinkNodeOptions{&sink_gen}}, - }) - .AddToPlan(plan.get())); + cp::Declaration union_plan{ + "union", {std::move(lhs), std::move(rhs)}, cp::ExecNodeOptions{}}; - ARROW_RETURN_NOT_OK(declr->Validate()); - - ARROW_RETURN_NOT_OK(plan->Validate()); - return ExecutePlanAndCollectAsTable(exec_context, plan, basic_data.schema, sink_gen); + return ExecutePlanAndCollectAsTable(std::move(union_plan)); } // (Doc section: Union Example) @@ -851,16 +725,15 @@ arrow::Status SourceUnionSinkExample(cp::ExecContext& exec_context) { // (Doc section: Table Sink Example) /// \brief An example showing a table sink node -/// \param exec_context The execution context to run the plan in /// /// TableSink Example /// This example shows how a table_sink can be used /// in an execution plan. This includes a source node /// receiving data as batches and the table sink node /// which emits the output as a table. -arrow::Status TableSinkExample(cp::ExecContext& exec_context) { +arrow::Status TableSinkExample() { ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, - cp::ExecPlan::Make(&exec_context)); + cp::ExecPlan::Make(*cp::threaded_exec_context())); ARROW_ASSIGN_OR_RAISE(auto basic_data, MakeBasicBatches()); @@ -886,8 +759,29 @@ arrow::Status TableSinkExample(cp::ExecContext& exec_context) { std::cout << "Results : " << output_table->ToString() << std::endl; return arrow::Status::OK(); } + // (Doc section: Table Sink Example) +// (Doc section: RecordBatchReaderSource Example) + +/// \brief An example showing the usage of a RecordBatchReader as the data source. +/// +/// RecordBatchReaderSourceSink Example +/// This example shows how a record_batch_reader_source can be used +/// in an execution plan. This includes the source node +/// receiving data from a TableRecordBatchReader. + +arrow::Status RecordBatchReaderSourceSinkExample() { + ARROW_ASSIGN_OR_RAISE(auto table, GetTable()); + std::shared_ptr reader = + std::make_shared(table); + cp::Declaration reader_source{"record_batch_reader_source", + cp::RecordBatchReaderSourceNodeOptions{reader}}; + return ExecutePlanAndCollectAsTable(std::move(reader_source)); +} + +// (Doc section: RecordBatchReaderSource Example) + enum ExampleMode { SOURCE_SINK = 0, TABLE_SOURCE_SINK = 1, @@ -902,11 +796,12 @@ enum ExampleMode { KSELECT = 10, WRITE = 11, UNION = 12, - TABLE_SOURCE_TABLE_SINK = 13 + TABLE_SOURCE_TABLE_SINK = 13, + RECORD_BATCH_READER_SOURCE = 14 }; int main(int argc, char** argv) { - if (argc < 2) { + if (argc < 3) { // Fake success for CI purposes. 
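    // (Note, not part of the original change: the threshold was raised from
    // argc < 2 to argc < 3 because the examples now take a mode selector plus,
    // for the write example, an output path; with fewer arguments the program
    // still exits successfully so CI can invoke it unconditionally.)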
return EXIT_SUCCESS; } @@ -916,64 +811,66 @@ int main(int argc, char** argv) { arrow::Status status; // ensure arrow::dataset node factories are in the registry arrow::dataset::internal::Initialize(); - // execution context - cp::ExecContext exec_context; switch (mode) { case SOURCE_SINK: PrintBlock("Source Sink Example"); - status = SourceSinkExample(exec_context); + status = SourceSinkExample(); break; case TABLE_SOURCE_SINK: PrintBlock("Table Source Sink Example"); - status = TableSourceSinkExample(exec_context); + status = TableSourceSinkExample(); break; case SCAN: PrintBlock("Scan Example"); - status = ScanSinkExample(exec_context); + status = ScanSinkExample(); break; case FILTER: PrintBlock("Filter Example"); - status = ScanFilterSinkExample(exec_context); + status = ScanFilterSinkExample(); break; case PROJECT: PrintBlock("Project Example"); - status = ScanProjectSinkExample(exec_context); + status = ScanProjectSinkExample(); break; case GROUP_AGGREGATION: PrintBlock("Aggregate Example"); - status = SourceGroupAggregateSinkExample(exec_context); + status = SourceGroupAggregateSinkExample(); break; case SCALAR_AGGREGATION: PrintBlock("Aggregate Example"); - status = SourceScalarAggregateSinkExample(exec_context); + status = SourceScalarAggregateSinkExample(); break; case CONSUMING_SINK: PrintBlock("Consuming-Sink Example"); - status = SourceConsumingSinkExample(exec_context); + status = SourceConsumingSinkExample(); break; case ORDER_BY_SINK: PrintBlock("OrderBy Example"); - status = SourceOrderBySinkExample(exec_context); + status = SourceOrderBySinkExample(); break; case HASHJOIN: PrintBlock("HashJoin Example"); - status = SourceHashJoinSinkExample(exec_context); + status = SourceHashJoinSinkExample(); break; case KSELECT: PrintBlock("KSelect Example"); - status = SourceKSelectExample(exec_context); + status = SourceKSelectExample(); break; case WRITE: PrintBlock("Write Example"); - status = ScanFilterWriteExample(exec_context, base_save_path); + status = ScanFilterWriteExample(base_save_path); break; case UNION: PrintBlock("Union Example"); - status = SourceUnionSinkExample(exec_context); + status = SourceUnionSinkExample(); break; case TABLE_SOURCE_TABLE_SINK: PrintBlock("TableSink Example"); - status = TableSinkExample(exec_context); + status = TableSinkExample(); + break; + case RECORD_BATCH_READER_SOURCE: + PrintBlock("RecordBatchReaderSource Example"); + status = RecordBatchReaderSourceSinkExample(); break; default: break; diff --git a/cpp/examples/arrow/gandiva_example.cc b/cpp/examples/arrow/gandiva_example.cc new file mode 100644 index 00000000000..fb571a7c5a7 --- /dev/null +++ b/cpp/examples/arrow/gandiva_example.cc @@ -0,0 +1,139 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/api.h" +#include "arrow/compute/api_vector.h" +#include "arrow/status.h" + +#include "gandiva/filter.h" +#include "gandiva/projector.h" +#include "gandiva/selection_vector.h" +#include "gandiva/tree_expr_builder.h" + +#include + +using arrow::Datum; +using arrow::Status; +using arrow::compute::TakeOptions; +using gandiva::Condition; +using gandiva::ConfigurationBuilder; +using gandiva::Expression; +using gandiva::Filter; +using gandiva::Node; +using gandiva::Projector; +using gandiva::SelectionVector; +using gandiva::TreeExprBuilder; + +Status Example() { + //(Doc section: Create expressions) + std::shared_ptr field_x_raw = arrow::field("x", arrow::int32()); + std::shared_ptr field_x = TreeExprBuilder::MakeField(field_x_raw); + std::shared_ptr literal_3 = TreeExprBuilder::MakeLiteral(3); + std::shared_ptr field_result = arrow::field("result", arrow::int32()); + + std::shared_ptr add_node = + TreeExprBuilder::MakeFunction("add", {field_x, literal_3}, arrow::int32()); + std::shared_ptr expression = + TreeExprBuilder::MakeExpression(add_node, field_result); + + std::shared_ptr less_than_node = + TreeExprBuilder::MakeFunction("less_than", {field_x, literal_3}, arrow::boolean()); + std::shared_ptr condition = TreeExprBuilder::MakeCondition(less_than_node); + //(Doc section: Create expressions) + + //(Doc section: Create projector and filter) + std::shared_ptr input_schema = arrow::schema({field_x_raw}); + std::shared_ptr output_schema = arrow::schema({field_result}); + std::shared_ptr projector; + Status status; + std::vector> expressions = {expression}; + status = Projector::Make(input_schema, expressions, &projector); + ARROW_RETURN_NOT_OK(status); + + std::shared_ptr filter; + status = Filter::Make(input_schema, condition, &filter); + ARROW_RETURN_NOT_OK(status); + //(Doc section: Create projector and filter) + + //(Doc section: Evaluate projection) + auto pool = arrow::default_memory_pool(); + int num_records = 4; + arrow::Int32Builder builder; + int32_t values[4] = {1, 2, 3, 4}; + ARROW_RETURN_NOT_OK(builder.AppendValues(values, 4)); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr array, builder.Finish()); + auto in_batch = arrow::RecordBatch::Make(input_schema, num_records, {array}); + + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool, &outputs); + ARROW_RETURN_NOT_OK(status); + std::shared_ptr result = + arrow::RecordBatch::Make(output_schema, outputs[0]->length(), outputs); + //(Doc section: Evaluate projection) + + std::cout << "Project result:" << std::endl; + std::cout << result->ToString() << std::endl; + + //(Doc section: Evaluate filter) + std::shared_ptr result_indices; + // Use 16-bit integers for indices. Result can be no longer than input size, + // so use batch num_rows as max_slots. 
+ status = gandiva::SelectionVector::MakeInt16(/*max_slots=*/in_batch->num_rows(), pool, + &result_indices); + ARROW_RETURN_NOT_OK(status); + status = filter->Evaluate(*in_batch, result_indices); + ARROW_RETURN_NOT_OK(status); + std::shared_ptr take_indices = result_indices->ToArray(); + Datum maybe_batch; + ARROW_ASSIGN_OR_RAISE(maybe_batch, + arrow::compute::Take(Datum(in_batch), Datum(take_indices), + TakeOptions::NoBoundsCheck())); + result = maybe_batch.record_batch(); + //(Doc section: Evaluate filter) + + std::cout << "Filter result:" << std::endl; + std::cout << result->ToString() << std::endl; + + //(Doc section: Evaluate filter and projection) + // Make sure the projector is compiled for the appropriate selection vector mode + status = Projector::Make(input_schema, expressions, result_indices->GetMode(), + ConfigurationBuilder::DefaultConfiguration(), &projector); + ARROW_RETURN_NOT_OK(status); + + arrow::ArrayVector outputs_filtered; + status = projector->Evaluate(*in_batch, result_indices.get(), pool, &outputs_filtered); + ARROW_RETURN_NOT_OK(status); + + result = + arrow::RecordBatch::Make(output_schema, outputs[0]->length(), outputs_filtered); + //(Doc section: Evaluate filter and projection) + + std::cout << "Project + filter result:" << std::endl; + std::cout << result->ToString() << std::endl; + + return Status::OK(); +} + +int main(int argc, char** argv) { + arrow::Status status = Example(); + + if (!status.ok()) { + std::cerr << "Error occurred: " << status.message() << std::endl; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} diff --git a/cpp/examples/arrow/join_example.cc b/cpp/examples/arrow/join_example.cc index e531bfbfbf9..eb7a8678a6e 100644 --- a/cpp/examples/arrow/join_example.cc +++ b/cpp/examples/arrow/join_example.cc @@ -63,7 +63,7 @@ arrow::Result> CreateDataSetFromCSVData std::shared_ptr input; std::string csv_data = is_left ? 
kLeftRelationCsvData : kRightRelationCsvData; std::cout << csv_data << std::endl; - arrow::util::string_view sv = csv_data; + std::string_view sv = csv_data; input = std::make_shared(sv); auto read_options = arrow::csv::ReadOptions::Defaults(); auto parse_options = arrow::csv::ParseOptions::Defaults(); @@ -82,18 +82,8 @@ arrow::Result> CreateDataSetFromCSVData } arrow::Status DoHashJoin() { - cp::ExecContext exec_context; - arrow::dataset::internal::Initialize(); - ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, - cp::ExecPlan::Make(&exec_context)); - - arrow::AsyncGenerator> sink_gen; - - cp::ExecNode* left_source; - cp::ExecNode* right_source; - ARROW_ASSIGN_OR_RAISE(auto l_dataset, CreateDataSetFromCSVData(true)); ARROW_ASSIGN_OR_RAISE(auto r_dataset, CreateDataSetFromCSVData(false)); @@ -111,10 +101,8 @@ arrow::Status DoHashJoin() { auto l_scan_node_options = arrow::dataset::ScanNodeOptions{l_dataset, l_options}; auto r_scan_node_options = arrow::dataset::ScanNodeOptions{r_dataset, r_options}; - ARROW_ASSIGN_OR_RAISE(left_source, - cp::MakeExecNode("scan", plan.get(), {}, l_scan_node_options)); - ARROW_ASSIGN_OR_RAISE(right_source, - cp::MakeExecNode("scan", plan.get(), {}, r_scan_node_options)); + arrow::compute::Declaration left{"scan", std::move(l_scan_node_options)}; + arrow::compute::Declaration right{"scan", std::move(r_scan_node_options)}; arrow::compute::HashJoinNodeOptions join_opts{arrow::compute::JoinType::INNER, /*in_left_keys=*/{"lkey"}, @@ -123,26 +111,12 @@ arrow::Status DoHashJoin() { /*output_suffix_for_left*/ "_l", /*output_suffix_for_right*/ "_r"}; - ARROW_ASSIGN_OR_RAISE( - auto hashjoin, - cp::MakeExecNode("hashjoin", plan.get(), {left_source, right_source}, join_opts)); + arrow::compute::Declaration hashjoin{ + "hashjoin", {std::move(left), std::move(right)}, join_opts}; - ARROW_ASSIGN_OR_RAISE(std::ignore, cp::MakeExecNode("sink", plan.get(), {hashjoin}, - cp::SinkNodeOptions{&sink_gen})); // expected columns l_a, l_b - std::shared_ptr sink_reader = cp::MakeGeneratorReader( - hashjoin->output_schema(), std::move(sink_gen), exec_context.memory_pool()); - - // validate the ExecPlan - ARROW_RETURN_NOT_OK(plan->Validate()); - // start the ExecPlan - ARROW_RETURN_NOT_OK(plan->StartProducing()); - - // collect sink_reader into a Table - std::shared_ptr response_table; - - ARROW_ASSIGN_OR_RAISE(response_table, - arrow::Table::FromRecordBatchReader(sink_reader.get())); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr response_table, + arrow::compute::DeclarationToTable(std::move(hashjoin))); std::cout << "Results : " << response_table->ToString() << std::endl; diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc new file mode 100644 index 00000000000..3b8b4c2212b --- /dev/null +++ b/cpp/examples/arrow/parquet_read_write.cc @@ -0,0 +1,190 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/api.h" +#include "arrow/io/api.h" +#include "arrow/result.h" +#include "arrow/util/type_fwd.h" +#include "parquet/arrow/reader.h" +#include "parquet/arrow/writer.h" + +#include + +arrow::Status ReadFullFile(std::string path_to_file) { + // #include "arrow/io/api.h" + // #include "arrow/parquet/arrow/reader.h" + + arrow::MemoryPool* pool = arrow::default_memory_pool(); + std::shared_ptr input; + ARROW_ASSIGN_OR_RAISE(input, arrow::io::ReadableFile::Open(path_to_file)); + + // Open Parquet file reader + std::unique_ptr arrow_reader; + ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(input, pool, &arrow_reader)); + + // Read entire file as a single Arrow table + std::shared_ptr table; + ARROW_RETURN_NOT_OK(arrow_reader->ReadTable(&table)); + return arrow::Status::OK(); +} + +arrow::Status ReadInBatches(std::string path_to_file) { + // #include "arrow/io/api.h" + // #include "arrow/parquet/arrow/reader.h" + + arrow::MemoryPool* pool = arrow::default_memory_pool(); + + // Configure general Parquet reader settings + auto reader_properties = parquet::ReaderProperties(pool); + reader_properties.set_buffer_size(4096 * 4); + reader_properties.enable_buffered_stream(); + + // Configure Arrow-specific Parquet reader settings + auto arrow_reader_props = parquet::ArrowReaderProperties(); + arrow_reader_props.set_batch_size(128 * 1024); // default 64 * 1024 + + parquet::arrow::FileReaderBuilder reader_builder; + ARROW_RETURN_NOT_OK( + reader_builder.OpenFile(path_to_file, /*memory_map=*/false, reader_properties)); + reader_builder.memory_pool(pool); + reader_builder.properties(arrow_reader_props); + + std::unique_ptr arrow_reader; + ARROW_ASSIGN_OR_RAISE(arrow_reader, reader_builder.Build()); + + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + ARROW_RETURN_NOT_OK(arrow_reader->GetRecordBatchReader(&rb_reader)); + + for (arrow::Result> maybe_batch : *rb_reader) { + // Operate on each batch... 
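    // For illustration (not part of the original example): unwrap each Result
    // and report how many rows the batch holds.
    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatch> batch, maybe_batch);
    std::cout << "Read a batch with " << batch->num_rows() << " rows" << std::endl;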
+ } + return arrow::Status::OK(); +} + +arrow::Result> GetTable() { + auto builder = arrow::Int32Builder(); + + std::shared_ptr arr_x; + ARROW_RETURN_NOT_OK(builder.AppendValues({1, 3, 5, 7, 1})); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_x)); + + std::shared_ptr arr_y; + ARROW_RETURN_NOT_OK(builder.AppendValues({2, 4, 6, 8, 10})); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_y)); + + auto schema = arrow::schema( + {arrow::field("x", arrow::int32()), arrow::field("y", arrow::int32())}); + + return arrow::Table::Make(schema, {arr_x, arr_y}); +} + +arrow::Result> GetRBR() { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + auto reader = std::make_shared(table); + reader->set_chunksize(10); + return reader; +} + +arrow::Status WriteFullFile(std::string path_to_file) { + // #include "parquet/arrow/writer.h" + // #include "arrow/util/type_fwd.h" + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; + + ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + + // Choose compression + std::shared_ptr props = + WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build(); + + // Opt to store Arrow schema for easier reads back into Arrow + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); + + std::shared_ptr outfile; + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); + + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), + arrow::default_memory_pool(), outfile, + /*chunk_size=*/3, props, arrow_props)); + return arrow::Status::OK(); +} + +arrow::Status WriteInBatches(std::string path_to_file) { + // #include "parquet/arrow/writer.h" + // #include "arrow/util/type_fwd.h" + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; + + // Data is in RBR + std::shared_ptr batch_stream; + ARROW_ASSIGN_OR_RAISE(batch_stream, GetRBR()); + + // Choose compression + std::shared_ptr props = + WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build(); + + // Opt to store Arrow schema for easier reads back into Arrow + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); + + // Create a writer + std::shared_ptr outfile; + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); + std::unique_ptr writer; + ARROW_ASSIGN_OR_RAISE( + writer, parquet::arrow::FileWriter::Open(*batch_stream->schema().get(), + arrow::default_memory_pool(), outfile, + props, arrow_props)); + + // Write each batch as a row_group + for (arrow::Result> maybe_batch : *batch_stream) { + ARROW_ASSIGN_OR_RAISE(auto batch, maybe_batch); + ARROW_ASSIGN_OR_RAISE(auto table, + arrow::Table::FromRecordBatches(batch->schema(), {batch})); + ARROW_RETURN_NOT_OK(writer->WriteTable(*table.get(), batch->num_rows())); + } + + // Write file footer and close + ARROW_RETURN_NOT_OK(writer->Close()); + + return arrow::Status::OK(); +} + +arrow::Status RunExamples(std::string path_to_file) { + ARROW_RETURN_NOT_OK(WriteFullFile(path_to_file)); + ARROW_RETURN_NOT_OK(ReadFullFile(path_to_file)); + ARROW_RETURN_NOT_OK(WriteInBatches(path_to_file)); + ARROW_RETURN_NOT_OK(ReadInBatches(path_to_file)); + return arrow::Status::OK(); +} + +int main(int argc, char** argv) { + if (argc != 2) { + // Fake success for CI purposes. 
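    // (Note: the example expects exactly one argument, the path where the
    // Parquet file is written and then read back; without it the program exits
    // successfully so CI can still invoke the binary.)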
+ return EXIT_SUCCESS; + } + + std::string path_to_file = argv[1]; + arrow::Status status = RunExamples(path_to_file); + + if (!status.ok()) { + std::cerr << "Error occurred: " << status.message() << std::endl; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} diff --git a/cpp/examples/arrow/rapidjson_row_converter.cc b/cpp/examples/arrow/rapidjson_row_converter.cc index defa6de4610..3907e72121c 100644 --- a/cpp/examples/arrow/rapidjson_row_converter.cc +++ b/cpp/examples/arrow/rapidjson_row_converter.cc @@ -97,7 +97,7 @@ class RowBatchBuilder { for (int64_t i = 0; i < array.length(); ++i) { if (!array.IsNull(i)) { rapidjson::Value str_key(field_->name(), rows_[i].GetAllocator()); - arrow::util::string_view value_view = array.Value(i); + std::string_view value_view = array.Value(i); rapidjson::Value value; value.SetString(value_view.data(), static_cast(value_view.size()), diff --git a/cpp/examples/minimal_build/CMakeLists.txt b/cpp/examples/minimal_build/CMakeLists.txt index 420a7666f35..b98f725a4a4 100644 --- a/cpp/examples/minimal_build/CMakeLists.txt +++ b/cpp/examples/minimal_build/CMakeLists.txt @@ -24,10 +24,10 @@ option(ARROW_LINK_SHARED "Link to the Arrow shared library" ON) find_package(Arrow REQUIRED) if(NOT DEFINED CMAKE_CXX_STANDARD) - set(CMAKE_CXX_STANDARD 11) + set(CMAKE_CXX_STANDARD 17) endif() -# We require a C++11 compliant compiler +# We require a C++17 compliant compiler set(CMAKE_CXX_STANDARD_REQUIRED ON) if(NOT DEFINED CMAKE_BUILD_TYPE) @@ -40,9 +40,7 @@ message(STATUS "Arrow SO version: ${ARROW_FULL_SO_VERSION}") add_executable(arrow-example example.cc) if(ARROW_LINK_SHARED) - target_link_libraries(arrow-example PRIVATE arrow_shared) + target_link_libraries(arrow-example PRIVATE Arrow::arrow_shared) else() - set(THREADS_PREFER_PTHREAD_FLAG ON) - find_package(Threads REQUIRED) - target_link_libraries(arrow-example PRIVATE arrow_static Threads::Threads) + target_link_libraries(arrow-example PRIVATE Arrow::arrow_static) endif() diff --git a/cpp/examples/minimal_build/run_static.sh b/cpp/examples/minimal_build/run_static.sh index cf2a9912f50..619811d09ac 100755 --- a/cpp/examples/minimal_build/run_static.sh +++ b/cpp/examples/minimal_build/run_static.sh @@ -102,7 +102,7 @@ echo rm -rf $EXAMPLE_BUILD_DIR mkdir -p $EXAMPLE_BUILD_DIR -${CXX:-c++} \ +${CXX:-c++} -std=c++17 \ -o $EXAMPLE_BUILD_DIR/arrow-example \ $EXAMPLE_DIR/example.cc \ $(PKG_CONFIG_PATH=$ARROW_BUILD_DIR/lib/pkgconfig \ diff --git a/cpp/examples/parquet/parquet_arrow/CMakeLists.txt b/cpp/examples/parquet/parquet_arrow/CMakeLists.txt index 32f980060c9..84f9d16e408 100644 --- a/cpp/examples/parquet/parquet_arrow/CMakeLists.txt +++ b/cpp/examples/parquet/parquet_arrow/CMakeLists.txt @@ -24,19 +24,22 @@ include(ExternalProject) include(FindPkgConfig) include(GNUInstallDirs) -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake_modules") +option(PARQUET_LINK_SHARED "Link to the Parquet shared library" ON) -# This ensures that things like gnu++11 get passed correctly +# This ensures that things like -std=gnu++... 
get passed correctly if(NOT DEFINED CMAKE_CXX_STANDARD) - set(CMAKE_CXX_STANDARD 11) + set(CMAKE_CXX_STANDARD 17) endif() -# We require a C++11 compliant compiler +# We require a C++17 compliant compiler set(CMAKE_CXX_STANDARD_REQUIRED ON) # Look for installed packages the system -find_package(Arrow REQUIRED) find_package(Parquet REQUIRED) add_executable(parquet-arrow-example reader_writer.cc) -target_link_libraries(parquet-arrow-example parquet_shared arrow_shared) +if(PARQUET_LINK_SHARED) + target_link_libraries(parquet-arrow-example Parquet::parquet_shared) +else() + target_link_libraries(parquet-arrow-example Parquet::parquet_static) +endif() diff --git a/cpp/examples/parquet/parquet_stream_api/stream_reader_writer.cc b/cpp/examples/parquet/parquet_stream_api/stream_reader_writer.cc index 64ab7af4962..1f7246b7816 100644 --- a/cpp/examples/parquet/parquet_stream_api/stream_reader_writer.cc +++ b/cpp/examples/parquet/parquet_stream_api/stream_reader_writer.cc @@ -135,10 +135,10 @@ struct TestData { if (i % 2 == 0) return {}; return "Str #" + std::to_string(i); } - static arrow::util::string_view GetStringView(const int i) { + static std::string_view GetStringView(const int i) { static std::string string; string = "StringView #" + std::to_string(i); - return arrow::util::string_view(string); + return std::string_view(string); } static const char* GetCharPtr(const int i) { static std::string string; @@ -190,7 +190,7 @@ void WriteParquetFile() { os.SetMaxRowGroupSize(1000); for (auto i = 0; i < TestData::num_rows; ++i) { - // Output string using 3 different types: std::string, arrow::util::string_view and + // Output string using 3 different types: std::string, std::string_view and // const char *. switch (i % 3) { case 0: diff --git a/cpp/examples/tutorial_examples/CMakeLists.txt b/cpp/examples/tutorial_examples/CMakeLists.txt new file mode 100644 index 00000000000..ed399edbd60 --- /dev/null +++ b/cpp/examples/tutorial_examples/CMakeLists.txt @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +cmake_minimum_required(VERSION 3.0) + +project(ArrowTutorialExamples) + +find_package(Arrow REQUIRED) + +get_filename_component(ARROW_CONFIG_PATH ${Arrow_CONFIG} DIRECTORY) +find_package(Parquet REQUIRED HINTS ${ARROW_CONFIG_PATH}) +find_package(ArrowDataset REQUIRED HINTS ${ARROW_CONFIG_PATH}) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wall -Wextra") + +set(CMAKE_BUILD_TYPE Release) + +message(STATUS "Arrow version: ${ARROW_VERSION}") +message(STATUS "Arrow SO version: ${ARROW_FULL_SO_VERSION}") + +add_executable(arrow_example arrow_example.cc) +target_link_libraries(arrow_example PRIVATE Arrow::arrow_shared) + +add_executable(file_access_example file_access_example.cc) +target_link_libraries(file_access_example PRIVATE Arrow::arrow_shared + Parquet::parquet_shared) + +add_executable(compute_example compute_example.cc) +target_link_libraries(compute_example PRIVATE Arrow::arrow_shared) + +add_executable(dataset_example dataset_example.cc) +target_link_libraries(dataset_example PRIVATE Arrow::arrow_shared Parquet::parquet_shared + ArrowDataset::arrow_dataset_shared) diff --git a/cpp/examples/tutorial_examples/arrow_example.cc b/cpp/examples/tutorial_examples/arrow_example.cc new file mode 100644 index 00000000000..45994a46e10 --- /dev/null +++ b/cpp/examples/tutorial_examples/arrow_example.cc @@ -0,0 +1,163 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// (Doc section: Basic Example) + +// (Doc section: Includes) +#include + +#include +// (Doc section: Includes) + +// (Doc section: RunMain Start) +arrow::Status RunMain() { + // (Doc section: RunMain Start) + // (Doc section: int8builder 1 Append) + // Builders are the main way to create Arrays in Arrow from existing values that are not + // on-disk. In this case, we'll make a simple array, and feed that in. + // Data types are important as ever, and there is a Builder for each compatible type; + // in this case, int8. + arrow::Int8Builder int8builder; + int8_t days_raw[5] = {1, 12, 17, 23, 28}; + // AppendValues, as called, puts 5 values from days_raw into our Builder object. + ARROW_RETURN_NOT_OK(int8builder.AppendValues(days_raw, 5)); + // (Doc section: int8builder 1 Append) + + // (Doc section: int8builder 1 Finish) + // We only have a Builder though, not an Array -- the following code pushes out the + // built up data into a proper Array. + std::shared_ptr days; + ARROW_ASSIGN_OR_RAISE(days, int8builder.Finish()); + // (Doc section: int8builder 1 Finish) + + // (Doc section: int8builder 2) + // Builders clear their state every time they fill an Array, so if the type is the same, + // we can re-use the builder. We do that here for month values. 
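  // (Finish() both hands back the completed Array and resets the builder's
  // internal state, so no explicit Reset() call is needed before appending the
  // next set of values.)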
+ int8_t months_raw[5] = {1, 3, 5, 7, 1}; + ARROW_RETURN_NOT_OK(int8builder.AppendValues(months_raw, 5)); + std::shared_ptr months; + ARROW_ASSIGN_OR_RAISE(months, int8builder.Finish()); + // (Doc section: int8builder 2) + + // (Doc section: int16builder) + // Now that we change to int16, we use the Builder for that data type instead. + arrow::Int16Builder int16builder; + int16_t years_raw[5] = {1990, 2000, 1995, 2000, 1995}; + ARROW_RETURN_NOT_OK(int16builder.AppendValues(years_raw, 5)); + std::shared_ptr years; + ARROW_ASSIGN_OR_RAISE(years, int16builder.Finish()); + // (Doc section: int16builder) + + // (Doc section: Schema) + // Now, we want a RecordBatch, which has columns and labels for said columns. + // This gets us to the 2d data structures we want in Arrow. + // These are defined by schema, which have fields -- here we get both those object types + // ready. + std::shared_ptr field_day, field_month, field_year; + std::shared_ptr schema; + + // Every field needs its name and data type. + field_day = arrow::field("Day", arrow::int8()); + field_month = arrow::field("Month", arrow::int8()); + field_year = arrow::field("Year", arrow::int16()); + + // The schema can be built from a vector of fields, and we do so here. + schema = arrow::schema({field_day, field_month, field_year}); + // (Doc section: Schema) + + // (Doc section: RBatch) + // With the schema and Arrays full of data, we can make our RecordBatch! Here, + // each column is internally contiguous. This is in opposition to Tables, which we'll + // see next. + std::shared_ptr rbatch; + // The RecordBatch needs the schema, length for columns, which all must match, + // and the actual data itself. + rbatch = arrow::RecordBatch::Make(schema, days->length(), {days, months, years}); + + std::cout << rbatch->ToString(); + // (Doc section: RBatch) + + // (Doc section: More Arrays) + // Now, let's get some new arrays! It'll be the same datatypes as above, so we re-use + // Builders. + int8_t days_raw2[5] = {6, 12, 3, 30, 22}; + ARROW_RETURN_NOT_OK(int8builder.AppendValues(days_raw2, 5)); + std::shared_ptr days2; + ARROW_ASSIGN_OR_RAISE(days2, int8builder.Finish()); + + int8_t months_raw2[5] = {5, 4, 11, 3, 2}; + ARROW_RETURN_NOT_OK(int8builder.AppendValues(months_raw2, 5)); + std::shared_ptr months2; + ARROW_ASSIGN_OR_RAISE(months2, int8builder.Finish()); + + int16_t years_raw2[5] = {1980, 2001, 1915, 2020, 1996}; + ARROW_RETURN_NOT_OK(int16builder.AppendValues(years_raw2, 5)); + std::shared_ptr years2; + ARROW_ASSIGN_OR_RAISE(years2, int16builder.Finish()); + // (Doc section: More Arrays) + + // (Doc section: ArrayVector) + // ChunkedArrays let us have a list of arrays, which aren't contiguous + // with each other. First, we get a vector of arrays. + arrow::ArrayVector day_vecs{days, days2}; + // (Doc section: ArrayVector) + // (Doc section: ChunkedArray Day) + // Then, we use that to initialize a ChunkedArray, which can be used with other + // functions in Arrow! This is good, since having a normal vector of arrays wouldn't + // get us far. + std::shared_ptr day_chunks = + std::make_shared(day_vecs); + // (Doc section: ChunkedArray Day) + + // (Doc section: ChunkedArray Month Year) + // Repeat for months. + arrow::ArrayVector month_vecs{months, months2}; + std::shared_ptr month_chunks = + std::make_shared(month_vecs); + + // Repeat for years. 
+ arrow::ArrayVector year_vecs{years, years2}; + std::shared_ptr year_chunks = + std::make_shared(year_vecs); + // (Doc section: ChunkedArray Month Year) + + // (Doc section: Table) + // A Table is the structure we need for these non-contiguous columns, and keeps them + // all in one place for us so we can use them as if they were normal arrays. + std::shared_ptr table; + table = arrow::Table::Make(schema, {day_chunks, month_chunks, year_chunks}, 10); + + std::cout << table->ToString(); + // (Doc section: Table) + + // (Doc section: Ret) + return arrow::Status::OK(); +} +// (Doc section: Ret) + +// (Doc section: Main) +int main() { + arrow::Status st = RunMain(); + if (!st.ok()) { + std::cerr << st << std::endl; + return 1; + } + return 0; +} + +// (Doc section: Main) +// (Doc section: Basic Example) diff --git a/cpp/examples/tutorial_examples/build_arrow.sh b/cpp/examples/tutorial_examples/build_arrow.sh new file mode 100755 index 00000000000..ec72a288c7b --- /dev/null +++ b/cpp/examples/tutorial_examples/build_arrow.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -ex + +NPROC=$(nproc) + +mkdir -p $ARROW_BUILD_DIR +pushd $ARROW_BUILD_DIR + +# Enable the CSV reader as it's used by the example third-party build +cmake /arrow/cpp \ + -DARROW_CSV=ON \ + -DARROW_DATASET=ON \ + -DARROW_FILESYSTEM=ON \ + -DARROW_PARQUET=ON \ + -DARROW_JEMALLOC=OFF \ + $ARROW_CMAKE_OPTIONS + +make -j$NPROC +make install + +popd diff --git a/cpp/src/arrow/python/arrow-python-flight.pc.in b/cpp/examples/tutorial_examples/build_example.sh old mode 100644 new mode 100755 similarity index 72% rename from cpp/src/arrow/python/arrow-python-flight.pc.in rename to cpp/examples/tutorial_examples/build_example.sh index a98ad1fbf6c..a315755a597 --- a/cpp/src/arrow/python/arrow-python-flight.pc.in +++ b/cpp/examples/tutorial_examples/build_example.sh @@ -1,3 +1,4 @@ +#!/usr/bin/env bash # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -15,12 +16,12 @@ # specific language governing permissions and limitations # under the License. 
-prefix=@CMAKE_INSTALL_PREFIX@ -includedir=@ARROW_PKG_CONFIG_INCLUDEDIR@ -libdir=@ARROW_PKG_CONFIG_LIBDIR@ +set -ex -Name: Apache Arrow Python Flight -Description: Python integration library for Apache Arrow Flight -Version: @ARROW_VERSION@ -Requires: arrow-python arrow-flight -Libs: -L${libdir} -larrow_python_flight +mkdir -p $EXAMPLE_BUILD_DIR +pushd $EXAMPLE_BUILD_DIR + +cmake /io +make + +popd diff --git a/cpp/examples/tutorial_examples/compute_example.cc b/cpp/examples/tutorial_examples/compute_example.cc new file mode 100644 index 00000000000..3a65214c0ef --- /dev/null +++ b/cpp/examples/tutorial_examples/compute_example.cc @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// (Doc section: Compute Example) + +// (Doc section: Includes) +#include +#include + +#include +// (Doc section: Includes) + +// (Doc section: RunMain) +arrow::Status RunMain() { + // (Doc section: RunMain) + // (Doc section: Create Tables) + // Create a couple 32-bit integer arrays. + arrow::Int32Builder int32builder; + int32_t some_nums_raw[5] = {34, 624, 2223, 5654, 4356}; + ARROW_RETURN_NOT_OK(int32builder.AppendValues(some_nums_raw, 5)); + std::shared_ptr some_nums; + ARROW_ASSIGN_OR_RAISE(some_nums, int32builder.Finish()); + + int32_t more_nums_raw[5] = {75342, 23, 64, 17, 736}; + ARROW_RETURN_NOT_OK(int32builder.AppendValues(more_nums_raw, 5)); + std::shared_ptr more_nums; + ARROW_ASSIGN_OR_RAISE(more_nums, int32builder.Finish()); + + // Make a table out of our pair of arrays. + std::shared_ptr field_a, field_b; + std::shared_ptr schema; + + field_a = arrow::field("A", arrow::int32()); + field_b = arrow::field("B", arrow::int32()); + + schema = arrow::schema({field_a, field_b}); + + std::shared_ptr table; + table = arrow::Table::Make(schema, {some_nums, more_nums}, 5); + // (Doc section: Create Tables) + + // (Doc section: Sum Datum Declaration) + // The Datum class is what all compute functions output to, and they can take Datums + // as inputs, as well. + arrow::Datum sum; + // (Doc section: Sum Datum Declaration) + // (Doc section: Sum Call) + // Here, we can use arrow::compute::Sum. This is a convenience function, and the next + // computation won't be so simple. However, using these where possible helps + // readability. + ARROW_ASSIGN_OR_RAISE(sum, arrow::compute::Sum({table->GetColumnByName("A")})); + // (Doc section: Sum Call) + // (Doc section: Sum Datum Type) + // Get the kind of Datum and what it holds -- this is a Scalar, with int64. 
+ std::cout << "Datum kind: " << sum.ToString() + << " content type: " << sum.type()->ToString() << std::endl; + // (Doc section: Sum Datum Type) + // (Doc section: Sum Contents) + // Note that we explicitly request a scalar -- the Datum cannot simply give what it is, + // you must ask for the correct type. + std::cout << sum.scalar_as().value << std::endl; + // (Doc section: Sum Contents) + + // (Doc section: Add Datum Declaration) + arrow::Datum element_wise_sum; + // (Doc section: Add Datum Declaration) + // (Doc section: Add Call) + // Get element-wise sum of both columns A and B in our Table. Note that here we use + // CallFunction(), which takes the name of the function as the first argument. + ARROW_ASSIGN_OR_RAISE(element_wise_sum, arrow::compute::CallFunction( + "add", {table->GetColumnByName("A"), + table->GetColumnByName("B")})); + // (Doc section: Add Call) + // (Doc section: Add Datum Type) + // Get the kind of Datum and what it holds -- this is a ChunkedArray, with int32. + std::cout << "Datum kind: " << element_wise_sum.ToString() + << " content type: " << element_wise_sum.type()->ToString() << std::endl; + // (Doc section: Add Datum Type) + // (Doc section: Add Contents) + // This time, we get a ChunkedArray, not a scalar. + std::cout << element_wise_sum.chunked_array()->ToString() << std::endl; + // (Doc section: Add Contents) + + // (Doc section: Index Datum Declare) + // Use an options struct to set up searching for 2223 in column A (the third item). + arrow::Datum third_item; + // (Doc section: Index Datum Declare) + // (Doc section: IndexOptions Declare) + // An options struct is used in lieu of passing an arbitrary amount of arguments. + arrow::compute::IndexOptions index_options; + // (Doc section: IndexOptions Declare) + // (Doc section: IndexOptions Assign) + // We need an Arrow Scalar, not a raw value. + index_options.value = arrow::MakeScalar(2223); + // (Doc section: IndexOptions Assign) + // (Doc section: Index Call) + ARROW_ASSIGN_OR_RAISE( + third_item, arrow::compute::CallFunction("index", {table->GetColumnByName("A")}, + &index_options)); + // (Doc section: Index Call) + // (Doc section: Index Inspection) + // Get the kind of Datum and what it holds -- this is a Scalar, with int64 + std::cout << "Datum kind: " << third_item.ToString() + << " content type: " << third_item.type()->ToString() << std::endl; + // We get a scalar -- the location of 2223 in column A, which is 2 in 0-based indexing. + std::cout << third_item.scalar_as().value << std::endl; + // (Doc section: Index Inspection) + // (Doc section: Ret) + return arrow::Status::OK(); +} +// (Doc section: Ret) + +// (Doc section: Main) +int main() { + arrow::Status st = RunMain(); + if (!st.ok()) { + std::cerr << st << std::endl; + return 1; + } + return 0; +} +// (Doc section: Main) + +// (Doc section: Compute Example) diff --git a/cpp/examples/tutorial_examples/dataset_example.cc b/cpp/examples/tutorial_examples/dataset_example.cc new file mode 100644 index 00000000000..005cdc324d0 --- /dev/null +++ b/cpp/examples/tutorial_examples/dataset_example.cc @@ -0,0 +1,244 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// (Doc section: Dataset Example) + +// (Doc section: Includes) +#include +#include +// We use Parquet headers for setting up examples; they are not required for using +// datasets. +#include +#include + +#include +// (Doc section: Includes) + +// (Doc section: Helper Functions) +// Generate some data for the rest of this example. +arrow::Result> CreateTable() { + // This code should look familiar from the basic Arrow example, and is not the + // focus of this example. However, we need data to work on it, and this makes that! + auto schema = + arrow::schema({arrow::field("a", arrow::int64()), arrow::field("b", arrow::int64()), + arrow::field("c", arrow::int64())}); + std::shared_ptr array_a; + std::shared_ptr array_b; + std::shared_ptr array_c; + arrow::NumericBuilder builder; + ARROW_RETURN_NOT_OK(builder.AppendValues({0, 1, 2, 3, 4, 5, 6, 7, 8, 9})); + ARROW_RETURN_NOT_OK(builder.Finish(&array_a)); + builder.Reset(); + ARROW_RETURN_NOT_OK(builder.AppendValues({9, 8, 7, 6, 5, 4, 3, 2, 1, 0})); + ARROW_RETURN_NOT_OK(builder.Finish(&array_b)); + builder.Reset(); + ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 1, 2, 1, 2, 1, 2, 1, 2})); + ARROW_RETURN_NOT_OK(builder.Finish(&array_c)); + return arrow::Table::Make(schema, {array_a, array_b, array_c}); +} + +// Set up a dataset by writing two Parquet files. +arrow::Result CreateExampleParquetDataset( + const std::shared_ptr& filesystem, + const std::string& root_path) { + // Much like CreateTable(), this is utility that gets us the dataset we'll be reading + // from. Don't worry, we also write a dataset in the example proper. + auto base_path = root_path + "parquet_dataset"; + ARROW_RETURN_NOT_OK(filesystem->CreateDir(base_path)); + // Create an Arrow Table + ARROW_ASSIGN_OR_RAISE(auto table, CreateTable()); + // Write it into two Parquet files + ARROW_ASSIGN_OR_RAISE(auto output, + filesystem->OpenOutputStream(base_path + "/data1.parquet")); + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable( + *table->Slice(0, 5), arrow::default_memory_pool(), output, 2048)); + ARROW_ASSIGN_OR_RAISE(output, + filesystem->OpenOutputStream(base_path + "/data2.parquet")); + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable( + *table->Slice(5), arrow::default_memory_pool(), output, 2048)); + return base_path; +} + +arrow::Status PrepareEnv() { + // Get our environment prepared for reading, by setting up some quick writing. + ARROW_ASSIGN_OR_RAISE(auto src_table, CreateTable()) + std::shared_ptr setup_fs; + // Note this operates in the directory the executable is built in. 
+ char setup_path[256]; + char* result = getcwd(setup_path, 256); + if (result == NULL) { + return arrow::Status::IOError("Fetching PWD failed."); + } + + ARROW_ASSIGN_OR_RAISE(setup_fs, arrow::fs::FileSystemFromUriOrPath(setup_path)); + ARROW_ASSIGN_OR_RAISE(auto dset_path, CreateExampleParquetDataset(setup_fs, "")); + + return arrow::Status::OK(); +} +// (Doc section: Helper Functions) + +// (Doc section: RunMain) +arrow::Status RunMain() { + // (Doc section: RunMain) + // (Doc section: PrepareEnv) + ARROW_RETURN_NOT_OK(PrepareEnv()); + // (Doc section: PrepareEnv) + + // (Doc section: FileSystem Declare) + // First, we need a filesystem object, which lets us interact with our local + // filesystem starting at a given path. For the sake of simplicity, that'll be + // the current directory. + std::shared_ptr fs; + // (Doc section: FileSystem Declare) + + // (Doc section: FileSystem Init) + // Get the CWD, use it to make the FileSystem object. + char init_path[256]; + char* result = getcwd(init_path, 256); + if (result == NULL) { + return arrow::Status::IOError("Fetching PWD failed."); + } + ARROW_ASSIGN_OR_RAISE(fs, arrow::fs::FileSystemFromUriOrPath(init_path)); + // (Doc section: FileSystem Init) + + // (Doc section: FileSelector Declare) + // A file selector lets us actually traverse a multi-file dataset. + arrow::fs::FileSelector selector; + // (Doc section: FileSelector Declare) + // (Doc section: FileSelector Config) + selector.base_dir = "parquet_dataset"; + // Recursive is a safe bet if you don't know the nesting of your dataset. + selector.recursive = true; + // (Doc section: FileSelector Config) + // (Doc section: FileSystemFactoryOptions) + // Making an options object lets us configure our dataset reading. + arrow::dataset::FileSystemFactoryOptions options; + // We'll use Hive-style partitioning. We'll let Arrow Datasets infer the partition + // schema. We won't set any other options, defaults are fine. + options.partitioning = arrow::dataset::HivePartitioning::MakeFactory(); + // (Doc section: FileSystemFactoryOptions) + // (Doc section: File Format Setup) + auto read_format = std::make_shared(); + // (Doc section: File Format Setup) + // (Doc section: FileSystemDatasetFactory Make) + // Now, we get a factory that will let us get our dataset -- we don't have the + // dataset yet! + ARROW_ASSIGN_OR_RAISE(auto factory, arrow::dataset::FileSystemDatasetFactory::Make( + fs, selector, read_format, options)); + // (Doc section: FileSystemDatasetFactory Make) + // (Doc section: FileSystemDatasetFactory Finish) + // Now we build our dataset from the factory. + ARROW_ASSIGN_OR_RAISE(auto read_dataset, factory->Finish()); + // (Doc section: FileSystemDatasetFactory Finish) + // (Doc section: Dataset Fragments) + // Print out the fragments + ARROW_ASSIGN_OR_RAISE(auto fragments, read_dataset->GetFragments()); + for (const auto& fragment : fragments) { + std::cout << "Found fragment: " << (*fragment)->ToString() << std::endl; + std::cout << "Partition expression: " + << (*fragment)->partition_expression().ToString() << std::endl; + } + // (Doc section: Dataset Fragments) + // (Doc section: Read Scan Builder) + // Scan dataset into a Table -- once this is done, you can do + // normal table things with it, like computation and printing. However, now you're + // also dedicated to being in memory. 
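  // (If the dataset might not fit in memory, the Scanner built below can also
  // stream record batches -- for example via its ToRecordBatchReader() method --
  // instead of materializing everything with ToTable(); that alternative is not
  // shown here.)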
+ ARROW_ASSIGN_OR_RAISE(auto read_scan_builder, read_dataset->NewScan()); + // (Doc section: Read Scan Builder) + // (Doc section: Read Scanner) + ARROW_ASSIGN_OR_RAISE(auto read_scanner, read_scan_builder->Finish()); + // (Doc section: Read Scanner) + // (Doc section: To Table) + ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, read_scanner->ToTable()); + std::cout << table->ToString(); + // (Doc section: To Table) + + // (Doc section: TableBatchReader) + // Now, let's get a table out to disk as a dataset! + // We make a RecordBatchReader from our Table, then set up a scanner, which lets us + // go to a file. + std::shared_ptr write_dataset = + std::make_shared(table); + // (Doc section: TableBatchReader) + // (Doc section: WriteScanner) + auto write_scanner_builder = + arrow::dataset::ScannerBuilder::FromRecordBatchReader(write_dataset); + ARROW_ASSIGN_OR_RAISE(auto write_scanner, write_scanner_builder->Finish()) + // (Doc section: WriteScanner) + // (Doc section: Partition Schema) + // The partition schema determines which fields are used as keys for partitioning. + auto partition_schema = arrow::schema({arrow::field("a", arrow::utf8())}); + // (Doc section: Partition Schema) + // (Doc section: Partition Create) + // We'll use Hive-style partitioning, which creates directories with "key=value" + // pairs. + auto partitioning = + std::make_shared(partition_schema); + // (Doc section: Partition Create) + // (Doc section: Write Format) + // Now, we declare we'll be writing Parquet files. + auto write_format = std::make_shared(); + // (Doc section: Write Format) + // (Doc section: Write Options) + // This time, we make Options for writing, but do much more configuration. + arrow::dataset::FileSystemDatasetWriteOptions write_options; + // Defaults to start. + write_options.file_write_options = write_format->DefaultWriteOptions(); + // (Doc section: Write Options) + // (Doc section: Options FS) + // Use the filesystem we already have. + write_options.filesystem = fs; + // (Doc section: Options FS) + // (Doc section: Options Target) + // Write to the folder "write_dataset" in current directory. + write_options.base_dir = "write_dataset"; + // (Doc section: Options Target) + // (Doc section: Options Partitioning) + // Use the partitioning declared above. + write_options.partitioning = partitioning; + // (Doc section: Options Partitioning) + // (Doc section: Options Name Template) + // Define what the name for the files making up the dataset will be. + write_options.basename_template = "part{i}.parquet"; + // (Doc section: Options Name Template) + // (Doc section: Options File Behavior) + // Set behavior to overwrite existing data -- specifically, this lets this example + // be run more than once, and allows whatever code you have to overwrite what's there. + write_options.existing_data_behavior = + arrow::dataset::ExistingDataBehavior::kOverwriteOrIgnore; + // (Doc section: Options File Behavior) + // (Doc section: Write Dataset) + // Write to disk! 
+ ARROW_RETURN_NOT_OK( + arrow::dataset::FileSystemDataset::Write(write_options, write_scanner)); + // (Doc section: Write Dataset) + // (Doc section: Ret) + return arrow::Status::OK(); +} +// (Doc section: Ret) +// (Doc section: Main) +int main() { + arrow::Status st = RunMain(); + if (!st.ok()) { + std::cerr << st << std::endl; + return 1; + } + return 0; +} +// (Doc section: Main) + +// (Doc section: Dataset Example) diff --git a/cpp/src/arrow/python/arrow-python.pc.in b/cpp/examples/tutorial_examples/docker-compose.yml similarity index 71% rename from cpp/src/arrow/python/arrow-python.pc.in rename to cpp/examples/tutorial_examples/docker-compose.yml index c077c7dc84c..90bdbcad3d8 100644 --- a/cpp/src/arrow/python/arrow-python.pc.in +++ b/cpp/examples/tutorial_examples/docker-compose.yml @@ -15,13 +15,15 @@ # specific language governing permissions and limitations # under the License. -prefix=@CMAKE_INSTALL_PREFIX@ -includedir=@ARROW_PKG_CONFIG_INCLUDEDIR@ -libdir=@ARROW_PKG_CONFIG_LIBDIR@ +version: '3.5' -Name: Apache Arrow Python -Description: Python integration library for Apache Arrow -Version: @ARROW_VERSION@ -Requires: arrow -Libs: -L${libdir} -larrow_python -Cflags: -I${includedir} -I@PYTHON_INCLUDE_DIRS@ +services: + tutorial: + build: + context: . + dockerfile: tutorial.dockerfile + volumes: + - ../../../:/arrow:delegated + - .:/io:delegated + command: + - "/io/run.sh" diff --git a/cpp/examples/tutorial_examples/file_access_example.cc b/cpp/examples/tutorial_examples/file_access_example.cc new file mode 100644 index 00000000000..fdc312ff421 --- /dev/null +++ b/cpp/examples/tutorial_examples/file_access_example.cc @@ -0,0 +1,216 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// (Doc section: File I/O) + +// (Doc section: Includes) +#include +#include +#include +#include +#include +#include + +#include +// (Doc section: Includes) + +// (Doc section: GenInitialFile) +arrow::Status GenInitialFile() { + // Make a couple 8-bit integer arrays and a 16-bit integer array -- just like + // basic Arrow example. 
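Note that this file exercises the IPC, CSV, and Parquet APIs together. A header set sufficient to compile code like this (a sketch, not necessarily the exact include list of the file above) is:

#include <arrow/api.h>
#include <arrow/csv/api.h>
#include <arrow/io/api.h>
#include <arrow/ipc/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/arrow/writer.h>

#include <iostream>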
+ arrow::Int8Builder int8builder; + int8_t days_raw[5] = {1, 12, 17, 23, 28}; + ARROW_RETURN_NOT_OK(int8builder.AppendValues(days_raw, 5)); + std::shared_ptr days; + ARROW_ASSIGN_OR_RAISE(days, int8builder.Finish()); + + int8_t months_raw[5] = {1, 3, 5, 7, 1}; + ARROW_RETURN_NOT_OK(int8builder.AppendValues(months_raw, 5)); + std::shared_ptr months; + ARROW_ASSIGN_OR_RAISE(months, int8builder.Finish()); + + arrow::Int16Builder int16builder; + int16_t years_raw[5] = {1990, 2000, 1995, 2000, 1995}; + ARROW_RETURN_NOT_OK(int16builder.AppendValues(years_raw, 5)); + std::shared_ptr years; + ARROW_ASSIGN_OR_RAISE(years, int16builder.Finish()); + + // Get a vector of our Arrays + std::vector> columns = {days, months, years}; + + // Make a schema to initialize the Table with + std::shared_ptr field_day, field_month, field_year; + std::shared_ptr schema; + + field_day = arrow::field("Day", arrow::int8()); + field_month = arrow::field("Month", arrow::int8()); + field_year = arrow::field("Year", arrow::int16()); + + schema = arrow::schema({field_day, field_month, field_year}); + // With the schema and data, create a Table + std::shared_ptr table; + table = arrow::Table::Make(schema, columns); + + // Write out test files in IPC, CSV, and Parquet for the example to use. + std::shared_ptr outfile; + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_in.arrow")); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr ipc_writer, + arrow::ipc::MakeFileWriter(outfile, schema)); + ARROW_RETURN_NOT_OK(ipc_writer->WriteTable(*table)); + ARROW_RETURN_NOT_OK(ipc_writer->Close()); + + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_in.csv")); + ARROW_ASSIGN_OR_RAISE(auto csv_writer, + arrow::csv::MakeCSVWriter(outfile, table->schema())); + ARROW_RETURN_NOT_OK(csv_writer->WriteTable(*table)); + ARROW_RETURN_NOT_OK(csv_writer->Close()); + + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_in.parquet")); + PARQUET_THROW_NOT_OK( + parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), outfile, 5)); + + return arrow::Status::OK(); +} +// (Doc section: GenInitialFile) + +// (Doc section: RunMain) +arrow::Status RunMain() { + // (Doc section: RunMain) + // (Doc section: Gen Files) + // Generate initial files for each format with a helper function -- don't worry, + // we'll also write a table in this example. + ARROW_RETURN_NOT_OK(GenInitialFile()); + // (Doc section: Gen Files) + + // (Doc section: ReadableFile Definition) + // First, we have to set up a ReadableFile object, which just lets us point our + // readers to the right data on disk. We'll be reusing this object, and rebinding + // it to multiple files throughout the example. + std::shared_ptr infile; + // (Doc section: ReadableFile Definition) + // (Doc section: Arrow ReadableFile Open) + // Get "test_in.arrow" into our file pointer + ARROW_ASSIGN_OR_RAISE(infile, arrow::io::ReadableFile::Open( + "test_in.arrow", arrow::default_memory_pool())); + // (Doc section: Arrow ReadableFile Open) + // (Doc section: Arrow Read Open) + // Open up the file with the IPC features of the library, gives us a reader object. + ARROW_ASSIGN_OR_RAISE(auto ipc_reader, arrow::ipc::RecordBatchFileReader::Open(infile)); + // (Doc section: Arrow Read Open) + // (Doc section: Arrow Read) + // Using the reader, we can read Record Batches. Note that this is specific to IPC; + // for other formats, we focus on Tables, but here, RecordBatches are used. 
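For readers new to the builder API, here is a self-contained variant with the smart-pointer element types written out in full. The helper name WriteSmallIpcFile, the single-column table, and the output file name are illustrative assumptions.

// Build a one-column Table from an Int8Builder and write it as an IPC file.
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <arrow/ipc/api.h>

arrow::Status WriteSmallIpcFile() {
  arrow::Int8Builder builder;
  const int8_t days_raw[5] = {1, 12, 17, 23, 28};
  ARROW_RETURN_NOT_OK(builder.AppendValues(days_raw, 5));
  std::shared_ptr<arrow::Array> days;
  ARROW_ASSIGN_OR_RAISE(days, builder.Finish());

  std::shared_ptr<arrow::Schema> schema =
      arrow::schema({arrow::field("Day", arrow::int8())});
  std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, {days});

  std::shared_ptr<arrow::io::FileOutputStream> outfile;
  ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("day_only.arrow"));
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::ipc::RecordBatchWriter> writer,
                        arrow::ipc::MakeFileWriter(outfile, schema));
  ARROW_RETURN_NOT_OK(writer->WriteTable(*table));
  // IPC writers must be closed explicitly to finalize the file footer.
  return writer->Close();
}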
+ std::shared_ptr rbatch; + ARROW_ASSIGN_OR_RAISE(rbatch, ipc_reader->ReadRecordBatch(0)); + // (Doc section: Arrow Read) + + // (Doc section: Arrow Write Open) + // Just like with input, we get an object for the output file. + std::shared_ptr outfile; + // Bind it to "test_out.arrow" + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_out.arrow")); + // (Doc section: Arrow Write Open) + // (Doc section: Arrow Writer) + // Set up a writer with the output file -- and the schema! We're defining everything + // here, loading to fire. + ARROW_ASSIGN_OR_RAISE(std::shared_ptr ipc_writer, + arrow::ipc::MakeFileWriter(outfile, rbatch->schema())); + // (Doc section: Arrow Writer) + // (Doc section: Arrow Write) + // Write the record batch. + ARROW_RETURN_NOT_OK(ipc_writer->WriteRecordBatch(*rbatch)); + // (Doc section: Arrow Write) + // (Doc section: Arrow Close) + // Specifically for IPC, the writer needs to be explicitly closed. + ARROW_RETURN_NOT_OK(ipc_writer->Close()); + // (Doc section: Arrow Close) + + // (Doc section: CSV Read Open) + // Bind our input file to "test_in.csv" + ARROW_ASSIGN_OR_RAISE(infile, arrow::io::ReadableFile::Open("test_in.csv")); + // (Doc section: CSV Read Open) + // (Doc section: CSV Table Declare) + std::shared_ptr csv_table; + // (Doc section: CSV Table Declare) + // (Doc section: CSV Reader Make) + // The CSV reader has several objects for various options. For now, we'll use defaults. + ARROW_ASSIGN_OR_RAISE( + auto csv_reader, + arrow::csv::TableReader::Make( + arrow::io::default_io_context(), infile, arrow::csv::ReadOptions::Defaults(), + arrow::csv::ParseOptions::Defaults(), arrow::csv::ConvertOptions::Defaults())); + // (Doc section: CSV Reader Make) + // (Doc section: CSV Read) + // Read the table. + ARROW_ASSIGN_OR_RAISE(csv_table, csv_reader->Read()) + // (Doc section: CSV Read) + + // (Doc section: CSV Write) + // Bind our output file to "test_out.csv" + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_out.csv")); + // The CSV writer has simpler defaults, review API documentation for more complex usage. + ARROW_ASSIGN_OR_RAISE(auto csv_writer, + arrow::csv::MakeCSVWriter(outfile, csv_table->schema())); + ARROW_RETURN_NOT_OK(csv_writer->WriteTable(*csv_table)); + // Not necessary, but a safe practice. + ARROW_RETURN_NOT_OK(csv_writer->Close()); + // (Doc section: CSV Write) + + // (Doc section: Parquet Read Open) + // Bind our input file to "test_in.parquet" + ARROW_ASSIGN_OR_RAISE(infile, arrow::io::ReadableFile::Open("test_in.parquet")); + // (Doc section: Parquet Read Open) + // (Doc section: Parquet FileReader) + std::unique_ptr reader; + // (Doc section: Parquet FileReader) + // (Doc section: Parquet OpenFile) + // Note that Parquet's OpenFile() takes the reader by reference, rather than returning + // a reader. + PARQUET_THROW_NOT_OK( + parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + // (Doc section: Parquet OpenFile) + + // (Doc section: Parquet Read) + std::shared_ptr parquet_table; + // Read the table. + PARQUET_THROW_NOT_OK(reader->ReadTable(&parquet_table)); + // (Doc section: Parquet Read) + + // (Doc section: Parquet Write) + // Parquet writing does not need a declared writer object. Just get the output + // file bound, then pass in the table, memory pool, output, and chunk size for + // breaking up the Table on-disk. 
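A condensed, self-contained sketch of the three read paths with the element types spelled out; the helper name ReadAllThree is an assumption, and the file names reuse those generated above.

#include <arrow/api.h>
#include <arrow/csv/api.h>
#include <arrow/io/api.h>
#include <arrow/ipc/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/exception.h>

arrow::Status ReadAllThree() {
  // IPC: read individual record batches back.
  std::shared_ptr<arrow::io::ReadableFile> infile;
  ARROW_ASSIGN_OR_RAISE(infile, arrow::io::ReadableFile::Open("test_in.arrow"));
  ARROW_ASSIGN_OR_RAISE(auto ipc_reader,
                        arrow::ipc::RecordBatchFileReader::Open(infile));
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatch> rbatch,
                        ipc_reader->ReadRecordBatch(0));

  // CSV: read the whole file into a Table with default options.
  ARROW_ASSIGN_OR_RAISE(infile, arrow::io::ReadableFile::Open("test_in.csv"));
  ARROW_ASSIGN_OR_RAISE(
      auto csv_reader,
      arrow::csv::TableReader::Make(arrow::io::default_io_context(), infile,
                                    arrow::csv::ReadOptions::Defaults(),
                                    arrow::csv::ParseOptions::Defaults(),
                                    arrow::csv::ConvertOptions::Defaults()));
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Table> csv_table, csv_reader->Read());

  // Parquet: OpenFile fills in a FileReader, which then fills in a Table.
  ARROW_ASSIGN_OR_RAISE(infile, arrow::io::ReadableFile::Open("test_in.parquet"));
  std::unique_ptr<parquet::arrow::FileReader> pq_reader;
  PARQUET_THROW_NOT_OK(
      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &pq_reader));
  std::shared_ptr<arrow::Table> pq_table;
  PARQUET_THROW_NOT_OK(pq_reader->ReadTable(&pq_table));
  return arrow::Status::OK();
}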
+ ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_out.parquet")); + PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable( + *parquet_table, arrow::default_memory_pool(), outfile, 5)); + // (Doc section: Parquet Write) + // (Doc section: Return) + return arrow::Status::OK(); +} +// (Doc section: Return) + +// (Doc section: Main) +int main() { + arrow::Status st = RunMain(); + if (!st.ok()) { + std::cerr << st << std::endl; + return 1; + } + return 0; +} +// (Doc section: Main) +// (Doc section: File I/O) diff --git a/cpp/examples/tutorial_examples/run.sh b/cpp/examples/tutorial_examples/run.sh new file mode 100755 index 00000000000..ed319a9d327 --- /dev/null +++ b/cpp/examples/tutorial_examples/run.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +cd /io + +export ARROW_BUILD_DIR=/build/arrow +export EXAMPLE_BUILD_DIR=/build/example + +echo +echo "==" +echo "== Building Arrow C++ library" +echo "==" +echo + +./build_arrow.sh + +echo +echo "==" +echo "== Building example project using Arrow C++ library" +echo "==" +echo + +./build_example.sh + +echo +echo "==" +echo "== Running example project" +echo "==" +echo + +${EXAMPLE_BUILD_DIR}/arrow_example +${EXAMPLE_BUILD_DIR}/compute_example +${EXAMPLE_BUILD_DIR}/file_access_example +${EXAMPLE_BUILD_DIR}/dataset_example diff --git a/cpp/examples/tutorial_examples/tutorial.dockerfile b/cpp/examples/tutorial_examples/tutorial.dockerfile new file mode 100644 index 00000000000..9361fc5e81d --- /dev/null +++ b/cpp/examples/tutorial_examples/tutorial.dockerfile @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
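Since the chunk size passed to parquet::arrow::WriteTable caps the number of rows written per row group, a small hypothetical helper can confirm how a table was split on disk; the function name CountRowGroups is illustrative only.

#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/exception.h>

#include <iostream>

arrow::Status CountRowGroups(const std::string& path) {
  ARROW_ASSIGN_OR_RAISE(auto infile, arrow::io::ReadableFile::Open(path));
  std::unique_ptr<parquet::arrow::FileReader> reader;
  PARQUET_THROW_NOT_OK(
      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
  // Each row group holds at most `chunk_size` rows from the original Table.
  std::cout << "Row groups in " << path << ": " << reader->num_row_groups()
            << std::endl;
  return arrow::Status::OK();
}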
+ +FROM ubuntu:focal + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends \ + build-essential \ + cmake \ + pkg-config && \ + apt-get clean && rm -rf /var/lib/apt/lists* diff --git a/cpp/gdb_arrow.py b/cpp/gdb_arrow.py index af3dad9c087..6c3af1680bd 100644 --- a/cpp/gdb_arrow.py +++ b/cpp/gdb_arrow.py @@ -426,12 +426,17 @@ def value(self): class Variant: """ - A arrow::util::Variant<...>. + A `std::variant<...>`. """ def __init__(self, val): self.val = val - self.index = int(self.val['index_']) + try: + # libstdc++ internals + self.index = val['_M_index'] + except gdb.error: + # fallback for other C++ standard libraries + self.index = gdb.parse_and_eval(f"{for_evaluation(val)}.index()") try: self.value_type = self.val.type.template_argument(self.index) except RuntimeError: @@ -451,7 +456,7 @@ def value(self): class StdString: """ - A `std::string` (or possibly `string_view`) value. + A `std::string` (or possibly `std::string_view`) value. """ def __init__(self, val): @@ -2158,67 +2163,6 @@ def to_string(self): return f"arrow::Result<{data_type}>({inner})" -class StringViewPrinter: - """ - Pretty-printer for arrow::util::string_view. - """ - - def __init__(self, name, val): - self.val = val - - def to_string(self): - size = int(self.val['size_']) - if size == 0: - return f"arrow::util::string_view of size 0" - else: - data = bytes_literal(self.val['data_'], size) - return f"arrow::util::string_view of size {size}, {data}" - - -class OptionalPrinter: - """ - Pretty-printer for arrow::util::optional. - """ - - def __init__(self, name, val): - self.val = val - - def to_string(self): - data_type = self.val.type.template_argument(0) - # XXX We rely on internal details of our vendored optional - # implementation, as inlined methods may not be callable from gdb. - if not self.val['has_value_']: - inner = "nullopt" - else: - data_ptr = self.val['contained']['data'].address - assert data_ptr - inner = data_ptr.reinterpret_cast( - data_type.pointer()).dereference() - return f"arrow::util::optional<{data_type}>({inner})" - - -class VariantPrinter: - """ - Pretty-printer for arrow::util::Variant. - """ - - def __init__(self, name, val): - self.val = val - self.variant = Variant(val) - - def to_string(self): - if self.variant.value_type is None: - return "arrow::util::Variant (uninitialized or corrupt)" - type_desc = (f"arrow::util::Variant of index {self.variant.index} " - f"(actual type {self.variant.value_type})") - - value = self.variant.value - if value is None: - return (f"{type_desc}, unavailable value") - else: - return (f"{type_desc}, value {value}") - - class FieldPrinter: """ Pretty-printer for arrow::Field. @@ -2436,11 +2380,6 @@ def to_string(self): "arrow::SimpleTable": TablePrinter, "arrow::Status": StatusPrinter, "arrow::Table": TablePrinter, - "arrow::util::optional": OptionalPrinter, - "arrow::util::string_view": StringViewPrinter, - "arrow::util::Variant": VariantPrinter, - "nonstd::optional_lite::optional": OptionalPrinter, - "nonstd::sv_lite::basic_string_view": StringViewPrinter, } diff --git a/r/src/imports.cpp b/cpp/proto/substrait/extension_rels.proto similarity index 53% rename from r/src/imports.cpp rename to cpp/proto/substrait/extension_rels.proto index f4174bab5f4..ceed9f3e455 100644 --- a/r/src/imports.cpp +++ b/cpp/proto/substrait/extension_rels.proto @@ -14,30 +14,31 @@ // KIND, either express or implied. 
See the License for the // specific language governing permissions and limitations // under the License. +syntax = "proto3"; -#include // for R_GetCCallable -#include +package arrow.substrait_ext; -namespace vctrs { -struct vctrs_api_ptrs_t { - R_len_t (*short_vec_size)(SEXP x); +import "substrait/algebra.proto"; - vctrs_api_ptrs_t() { - short_vec_size = (R_len_t(*)(SEXP))R_GetCCallable("vctrs", "short_vec_size"); - } -}; +option csharp_namespace = "Arrow.Substrait"; +option go_package = "github.com/apache/arrow/substrait"; +option java_multiple_files = true; +option java_package = "io.arrow.substrait"; -const vctrs_api_ptrs_t& vctrs_api() { - static vctrs_api_ptrs_t ptrs; - return ptrs; -} +// As-Of-Join relation +message AsOfJoinRel { + // One key per input relation, each key describing how to join the corresponding input + repeated AsOfJoinKey keys = 1; + + // As-Of tolerance, in units of the on-key + int64 tolerance = 2; -R_len_t vec_size(SEXP x) { - if (Rf_inherits(x, "data.frame") || TYPEOF(x) != VECSXP || Rf_inherits(x, "POSIXlt")) { - return vctrs_api().short_vec_size(x); - } else { - return Rf_length(x); + // As-Of-Join key + message AsOfJoinKey { + // A field reference defining the on-key + .substrait.Expression on = 1; + + // A set of field references defining the by-key + repeated .substrait.Expression by = 2; } } - -} // namespace vctrs diff --git a/cpp/src/arrow/ArrowConfig.cmake.in b/cpp/src/arrow/ArrowConfig.cmake.in index 44c8a66f67d..cba7f23f6a2 100644 --- a/cpp/src/arrow/ArrowConfig.cmake.in +++ b/cpp/src/arrow/ArrowConfig.cmake.in @@ -25,8 +25,8 @@ # # This config sets the following targets in your project:: # -# arrow_shared - for linked as shared library if shared library is built -# arrow_static - for linked as static library if static library is built +# Arrow::arrow_shared - for linked as shared library if shared library is built +# Arrow::arrow_static - for linked as static library if static library is built @PACKAGE_INIT@ @@ -34,61 +34,166 @@ set(ARROW_VERSION "@ARROW_VERSION@") set(ARROW_SO_VERSION "@ARROW_SO_VERSION@") set(ARROW_FULL_SO_VERSION "@ARROW_FULL_SO_VERSION@") -set(ARROW_LIBRARY_PATH_SUFFIXES "@ARROW_LIBRARY_PATH_SUFFIXES@") +set(ARROW_BUNDLED_STATIC_LIBS "@ARROW_BUNDLED_STATIC_LIBS@") set(ARROW_INCLUDE_PATH_SUFFIXES "@ARROW_INCLUDE_PATH_SUFFIXES@") +set(ARROW_LIBRARY_PATH_SUFFIXES "@ARROW_LIBRARY_PATH_SUFFIXES@") set(ARROW_SYSTEM_DEPENDENCIES "@ARROW_SYSTEM_DEPENDENCIES@") -set(ARROW_BUNDLED_STATIC_LIBS "@ARROW_BUNDLED_STATIC_LIBS@") -set(ARROW_STATIC_INSTALL_INTERFACE_LIBS "@ARROW_STATIC_INSTALL_INTERFACE_LIBS@") include("${CMAKE_CURRENT_LIST_DIR}/ArrowOptions.cmake") -include(CMakeFindDependencyMacro) +if(ARROW_BUILD_STATIC) + include(CMakeFindDependencyMacro) -# Load targets only once. If we load targets multiple times, CMake reports -# already existent target error. 
-if(NOT (TARGET arrow_shared OR TARGET arrow_static)) - include("${CMAKE_CURRENT_LIST_DIR}/ArrowTargets.cmake") + set(CMAKE_THREAD_PREFER_PTHREAD TRUE) + set(THREADS_PREFER_PTHREAD_FLAG TRUE) + find_dependency(Threads) - if(TARGET arrow_static) - set(CMAKE_THREAD_PREFER_PTHREAD TRUE) - set(THREADS_PREFER_PTHREAD_FLAG TRUE) - find_dependency(Threads) + if(DEFINED CMAKE_MODULE_PATH) + set(ARROW_CMAKE_MODULE_PATH_OLD ${CMAKE_MODULE_PATH}) + else() + unset(ARROW_CMAKE_MODULE_PATH_OLD) + endif() + set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") - if(DEFINED CMAKE_MODULE_PATH) - set(_CMAKE_MODULE_PATH_OLD ${CMAKE_MODULE_PATH}) + foreach(_DEPENDENCY ${ARROW_SYSTEM_DEPENDENCIES}) + set(ARROW_OPENSSL_HOMEBREW_MAKE_DETECTABLE FALSE) + if(${_DEPENDENCY} STREQUAL "OpenSSL" AND NOT OPENSSL_ROOT_DIR) + find_program(ARROW_BREW brew) + if(ARROW_BREW) + set(ARROW_OPENSSL_ROOT_DIR_ORIGINAL ${OPENSSL_ROOT_DIR}) + execute_process(COMMAND ${ARROW_BREW} --prefix "openssl@1.1" + OUTPUT_VARIABLE OPENSSL11_BREW_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(OPENSSL11_BREW_PREFIX) + set(OPENSSL_ROOT_DIR ${OPENSSL11_BREW_PREFIX}) + set(ARROW_OPENSSL_HOMEBREW_MAKE_DETECTABLE TRUE) + else() + execute_process(COMMAND ${ARROW_BREW} --prefix "openssl" + OUTPUT_VARIABLE OPENSSL_BREW_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(OPENSSL_BREW_PREFIX) + set(OPENSSL_ROOT_DIR ${OPENSSL_BREW_PREFIX}) + set(ARROW_OPENSSL_HOMEBREW_MAKE_DETECTABLE TRUE) + endif() + endif() + endif() + endif() + find_dependency(${_DEPENDENCY}) + if(ARROW_OPENSSL_HOMEBREW_MAKE_DETECTABLE) + set(OPENSSL_ROOT_DIR ${ARROW_OPENSSL_ROOT_DIR_ORIGINAL}) endif() - set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") + endforeach() - foreach(_DEPENDENCY ${ARROW_SYSTEM_DEPENDENCIES}) - find_dependency(${_DEPENDENCY}) - endforeach() + if(DEFINED ARROW_CMAKE_MODULE_PATH_OLD) + set(CMAKE_MODULE_PATH ${ARROW_CMAKE_MODULE_PATH_OLD}) + unset(ARROW_CMAKE_MODULE_PATH_OLD) + else() + unset(CMAKE_MODULE_PATH) + endif() +endif() - if(DEFINED _CMAKE_MODULE_PATH_OLD) - set(CMAKE_MODULE_PATH ${_CMAKE_MODULE_PATH_OLD}) - unset(_CMAKE_MODULE_PATH_OLD) - else() - unset(CMAKE_MODULE_PATH) - endif() +include("${CMAKE_CURRENT_LIST_DIR}/ArrowTargets.cmake") + +if(TARGET Arrow::arrow_static AND NOT TARGET Arrow::arrow_bundled_dependencies) + add_library(Arrow::arrow_bundled_dependencies STATIC IMPORTED) + get_target_property(arrow_static_location Arrow::arrow_static LOCATION) + get_filename_component(arrow_lib_dir "${arrow_static_location}" DIRECTORY) + set_target_properties(Arrow::arrow_bundled_dependencies + PROPERTIES IMPORTED_LOCATION + "${arrow_lib_dir}/${CMAKE_STATIC_LIBRARY_PREFIX}arrow_bundled_dependencies${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) - get_property(arrow_static_loc TARGET arrow_static PROPERTY LOCATION) - get_filename_component(arrow_lib_dir ${arrow_static_loc} DIRECTORY) - - if(ARROW_BUNDLED_STATIC_LIBS) - add_library(arrow_bundled_dependencies STATIC IMPORTED) - set_target_properties( - arrow_bundled_dependencies - PROPERTIES - IMPORTED_LOCATION - "${arrow_lib_dir}/${CMAKE_STATIC_LIBRARY_PREFIX}arrow_bundled_dependencies${CMAKE_STATIC_LIBRARY_SUFFIX}" - INTERFACE_LINK_LIBRARIES - "${ARROW_STATIC_INSTALL_INTERFACE_LIBS}" - ) - get_property(arrow_static_interface_link_libraries - TARGET arrow_static - PROPERTY INTERFACE_LINK_LIBRARIES) - set_target_properties( - arrow_static PROPERTIES INTERFACE_LINK_LIBRARIES - "${arrow_static_interface_link_libraries};arrow_bundled_dependencies") + # CMP0057: Support new if() IN_LIST operator. 
+ # https://cmake.org/cmake/help/latest/policy/CMP0057.html + cmake_policy(PUSH) + cmake_policy(SET CMP0057 NEW) + if("AWS::aws-c-common" IN_LIST ARROW_BUNDLED_STATIC_LIBS) + if(APPLE) + find_library(CORE_FOUNDATION CoreFoundation) + target_link_libraries(Arrow::arrow_bundled_dependencies + INTERFACE ${CORE_FOUNDATION}) + elseif(WIN32) + target_link_libraries(Arrow::arrow_bundled_dependencies + INTERFACE "winhttp.lib" + "bcrypt.lib" + "wininet.lib" + "userenv.lib" + "version.lib") endif() endif() + cmake_policy(POP) endif() + +macro(arrow_keep_backward_compatibility namespace target_base_name) + string(TOUPPER ${target_base_name} target_base_name_upper) + + if(NOT CMAKE_VERSION VERSION_LESS 3.18) + if(TARGET ${namespace}::${target_base_name}_shared AND NOT TARGET + ${target_base_name}_shared) + add_library(${target_base_name}_shared ALIAS + ${namespace}::${target_base_name}_shared) + endif() + if(TARGET ${namespace}::${target_base_name}_static AND NOT TARGET + ${target_base_name}_static) + add_library(${target_base_name}_static ALIAS + ${namespace}::${target_base_name}_static) + endif() + endif() + + if(TARGET ${namespace}::${target_base_name}_shared) + get_target_property(${target_base_name_upper}_INCLUDE_DIR + ${namespace}::${target_base_name}_shared + INTERFACE_INCLUDE_DIRECTORIES) + else() + get_target_property(${target_base_name_upper}_INCLUDE_DIR + ${namespace}::${target_base_name}_static + INTERFACE_INCLUDE_DIRECTORIES) + endif() + + foreach(BUILD_TYPE_SUFFIX + "_RELEASE" + "_RELWITHDEBINFO" + "_MINSIZEREL" + "_DEBUG" + "") + if(TARGET ${namespace}::${target_base_name}_shared) + if(NOT ${target_base_name_upper}_SHARED_LIB) + get_target_property(${target_base_name_upper}_SHARED_LIB + ${namespace}::${target_base_name}_shared + IMPORTED_LOCATION${BUILD_TYPE_SUFFIX}) + endif() + if(NOT ${target_base_name_upper}_IMPORT_LIB) + get_target_property(${target_base_name_upper}_IMPORT_LIB + ${namespace}::${target_base_name}_shared + IMPORTED_IMPLIB${BUILD_TYPE_SUFFIX}) + endif() + endif() + + if(TARGET ${namespace}::${target_base_name}_static) + if(NOT ${target_base_name_upper}_STATIC_LIB) + get_target_property(${target_base_name_upper}_STATIC_LIB + ${namespace}::${target_base_name}_static + IMPORTED_LOCATION${BUILD_TYPE_SUFFIX}) + endif() + endif() + endforeach() +endmacro() + +arrow_keep_backward_compatibility(Arrow arrow) + +check_required_components(Arrow) + +macro(arrow_show_details package_name variable_prefix) + if(NOT ${package_name}_FIND_QUIETLY AND NOT ${package_name}_SHOWED_DETAILS) + message(STATUS "${package_name} version: ${${package_name}_VERSION}") + message(STATUS "Found the ${package_name} shared library: ${${variable_prefix}_SHARED_LIB}" + ) + message(STATUS "Found the ${package_name} import library: ${${variable_prefix}_IMPORT_LIB}" + ) + message(STATUS "Found the ${package_name} static library: ${${variable_prefix}_STATIC_LIB}" + ) + set(${package_name}_SHOWED_DETAILS TRUE) + endif() +endmacro() + +arrow_show_details(Arrow ARROW) diff --git a/cpp/src/arrow/ArrowTestingConfig.cmake.in b/cpp/src/arrow/ArrowTestingConfig.cmake.in index 2b5548c8b1a..87ee9e755e1 100644 --- a/cpp/src/arrow/ArrowTestingConfig.cmake.in +++ b/cpp/src/arrow/ArrowTestingConfig.cmake.in @@ -21,16 +21,18 @@ # # This config sets the following targets in your project:: # -# arrow_testing_shared - for linked as shared library if shared library is built -# arrow_testing_static - for linked as static library if static library is built +# ArrowTesting::arrow_testing_shared - for linked as shared library if 
shared library is built +# ArrowTesting::arrow_testing_static - for linked as static library if static library is built @PACKAGE_INIT@ include(CMakeFindDependencyMacro) find_dependency(Arrow) -# Load targets only once. If we load targets multiple times, CMake reports -# already existent target error. -if(NOT (TARGET arrow_testing_shared OR TARGET arrow_testing_static)) - include("${CMAKE_CURRENT_LIST_DIR}/ArrowTestingTargets.cmake") -endif() +include("${CMAKE_CURRENT_LIST_DIR}/ArrowTestingTargets.cmake") + +arrow_keep_backward_compatibility(ArrowTesting arrow_testing) + +check_required_components(ArrowTesting) + +arrow_show_details(ArrowTesting ARROW_TESTING) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 5070d22fc55..90ab1e6ac27 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -192,6 +192,7 @@ set(ARROW_SRCS io/stdio.cc io/transform.cc util/async_util.cc + util/atfork_internal.cc util/basic_decimal.cc util/bit_block_counter.cc util/bit_run_reader.cc @@ -397,10 +398,12 @@ if(ARROW_COMPUTE) compute/exec/hash_join_node.cc compute/exec/key_hash.cc compute/exec/key_map.cc + compute/exec/map_node.cc compute/exec/order_by_impl.cc compute/exec/partition_util.cc compute/exec/options.cc compute/exec/project_node.cc + compute/exec/query_context.cc compute/exec/sink_node.cc compute/exec/source_node.cc compute/exec/swiss_join.cc @@ -425,6 +428,7 @@ if(ARROW_COMPUTE) compute/kernels/scalar_boolean.cc compute/kernels/scalar_cast_boolean.cc compute/kernels/scalar_cast_dictionary.cc + compute/kernels/scalar_cast_extension.cc compute/kernels/scalar_cast_internal.cc compute/kernels/scalar_cast_nested.cc compute/kernels/scalar_cast_numeric.cc @@ -434,6 +438,7 @@ if(ARROW_COMPUTE) compute/kernels/scalar_if_else.cc compute/kernels/scalar_nested.cc compute/kernels/scalar_random.cc + compute/kernels/scalar_round.cc compute/kernels/scalar_set_lookup.cc compute/kernels/scalar_string_ascii.cc compute/kernels/scalar_string_utf8.cc @@ -445,7 +450,9 @@ if(ARROW_COMPUTE) compute/kernels/vector_cumulative_ops.cc compute/kernels/vector_hash.cc compute/kernels/vector_nested.cc + compute/kernels/vector_rank.cc compute/kernels/vector_replace.cc + compute/kernels/vector_select_k.cc compute/kernels/vector_selection.cc compute/kernels/vector_sort.cc compute/row/encode_internal.cc @@ -551,12 +558,47 @@ else() endif() if(ARROW_BUILD_BUNDLED_DEPENDENCIES) + arrow_car(_FIRST_LIB ${ARROW_BUNDLED_STATIC_LIBS}) + arrow_cdr(_OTHER_LIBS ${ARROW_BUNDLED_STATIC_LIBS}) + arrow_create_merged_static_lib(arrow_bundled_dependencies + NAME + arrow_bundled_dependencies + ROOT + ${_FIRST_LIB} + TO_MERGE + ${_OTHER_LIBS}) + # We can't use install(TARGETS) here because + # arrow_bundled_dependencies is an IMPORTED library. + get_target_property(arrow_bundled_dependencies_path arrow_bundled_dependencies + IMPORTED_LOCATION) + install(FILES ${arrow_bundled_dependencies_path} ${INSTALL_IS_OPTIONAL} + DESTINATION ${CMAKE_INSTALL_LIBDIR}) string(APPEND ARROW_PC_LIBS_PRIVATE " -larrow_bundled_dependencies") + list(INSERT ARROW_STATIC_INSTALL_INTERFACE_LIBS 0 "Arrow::arrow_bundled_dependencies") endif() # Need -latomic on Raspbian. 
# See also: https://issues.apache.org/jira/browse/ARROW-12860 if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux" AND ${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") string(APPEND ARROW_PC_LIBS_PRIVATE " -latomic") + list(APPEND ARROW_SHARED_INSTALL_INTERFACE_LIBS "atomic") + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS "atomic") +endif() + +# If libarrow.a is only built, "pkg-config --cflags --libs arrow" +# outputs build flags for static linking not shared +# linking. ARROW_PC_* except ARROW_PC_*_PRIVATE are for the static +# linking case. +if(NOT ARROW_BUILD_SHARED AND ARROW_BUILD_STATIC) + set(ARROW_PC_CFLAGS "${ARROW_PC_CFLAGS_PRIVATE}") + set(ARROW_PC_CFLAGS_PRIVATE "") + set(ARROW_PC_LIBS "${ARROW_PC_LIBS_PRIVATE}") + set(ARROW_PC_LIBS_PRIVATE "") + set(ARROW_PC_REQUIRES "${ARROW_PC_REQUIRES_PRIVATE}") + set(ARROW_PC_REQUIRES_PRIVATE "") +else() + set(ARROW_PC_CFLAGS "") + set(ARROW_PC_LIBS "") + set(ARROW_PC_REQUIRES "") endif() add_arrow_lib(arrow @@ -575,11 +617,12 @@ add_arrow_lib(arrow SHARED_LINK_FLAGS ${ARROW_SHARED_LINK_FLAGS} SHARED_LINK_LIBS - ${ARROW_LINK_LIBS} + ${ARROW_SHARED_LINK_LIBS} SHARED_PRIVATE_LINK_LIBS ${ARROW_SHARED_PRIVATE_LINK_LIBS} STATIC_LINK_LIBS ${ARROW_STATIC_LINK_LIBS} + STATIC_INSTALL_INTERFACE_LIBS ${ARROW_STATIC_INSTALL_INTERFACE_LIBS} SHARED_INSTALL_INTERFACE_LIBS ${ARROW_SHARED_INSTALL_INTERFACE_LIBS}) @@ -619,6 +662,8 @@ endif() foreach(LIB_TARGET ${ARROW_LIBRARIES}) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_EXPORTING) + # C++17 is required to compile against Arrow C++ headers and libraries + target_compile_features(${LIB_TARGET} PUBLIC cxx_std_17) endforeach() if(ARROW_WITH_BACKTRACE) @@ -631,18 +676,6 @@ if(ARROW_WITH_BACKTRACE) endforeach() endif() -if(ARROW_BUILD_BUNDLED_DEPENDENCIES) - arrow_car(_FIRST_LIB ${ARROW_BUNDLED_STATIC_LIBS}) - arrow_cdr(_OTHER_LIBS ${ARROW_BUNDLED_STATIC_LIBS}) - create_merged_static_lib(arrow_bundled_dependencies - NAME - arrow_bundled_dependencies - ROOT - ${_FIRST_LIB} - TO_MERGE - ${_OTHER_LIBS}) -endif() - if(ARROW_TESTING) # that depend on gtest add_arrow_lib(arrow_testing @@ -663,10 +696,16 @@ if(ARROW_TESTING) rapidjson::rapidjson arrow_shared GTest::gtest + SHARED_INSTALL_INTERFACE_LIBS + Arrow::arrow_shared + GTest::gtest STATIC_LINK_LIBS arrow::flatbuffers rapidjson::rapidjson arrow_static + GTest::gtest + STATIC_INSTALL_INTERFACE_LIBS + Arrow::arrow_static GTest::gtest) add_custom_target(arrow_testing) @@ -685,11 +724,11 @@ arrow_install_all_headers("arrow") config_summary_cmake_setters("${CMAKE_CURRENT_BINARY_DIR}/ArrowOptions.cmake") install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ArrowOptions.cmake - DESTINATION "${ARROW_CMAKE_DIR}") + DESTINATION "${ARROW_CMAKE_DIR}/Arrow") # For backward compatibility for find_package(arrow) install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/arrow-config.cmake - DESTINATION "${ARROW_CMAKE_DIR}") + DESTINATION "${ARROW_CMAKE_DIR}/Arrow") # # Unit tests @@ -805,10 +844,6 @@ if(ARROW_ORC) add_subdirectory(adapters/orc) endif() -if(ARROW_PYTHON) - add_subdirectory(python) -endif() - if(ARROW_TENSORFLOW) add_subdirectory(adapters/tensorflow) endif() diff --git a/cpp/src/arrow/adapters/orc/CMakeLists.txt b/cpp/src/arrow/adapters/orc/CMakeLists.txt index d7cc6524bc9..3c695abb5a0 100644 --- a/cpp/src/arrow/adapters/orc/CMakeLists.txt +++ b/cpp/src/arrow/adapters/orc/CMakeLists.txt @@ -26,27 +26,14 @@ install(FILES adapter.h options.h # pkg-config support arrow_add_pkg_config("arrow-orc") -set(ORC_MIN_TEST_LIBS - GTest::gtest_main - GTest::gtest - ${Snappy_TARGET} - lz4::lz4 - 
ZLIB::ZLIB) - if(ARROW_BUILD_STATIC) set(ARROW_LIBRARIES_FOR_STATIC_TESTS arrow_testing_static arrow_static) else() set(ARROW_LIBRARIES_FOR_STATIC_TESTS arrow_testing_shared arrow_shared) endif() -if(APPLE) - set(ORC_MIN_TEST_LIBS ${ORC_MIN_TEST_LIBS} ${CMAKE_DL_LIBS}) -elseif(NOT MSVC) - set(ORC_MIN_TEST_LIBS ${ORC_MIN_TEST_LIBS} pthread ${CMAKE_DL_LIBS}) -endif() - set(ORC_STATIC_TEST_LINK_LIBS orc::liborc ${ARROW_LIBRARIES_FOR_STATIC_TESTS} - ${ORC_MIN_TEST_LIBS}) + GTest::gtest_main GTest::gtest) add_arrow_test(adapter_test PREFIX diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index 5af5ebccc84..d4e379a93b4 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -126,13 +126,6 @@ class ArrowInputFile : public liborc::InputStream { std::shared_ptr file_; }; -struct StripeInformation { - uint64_t offset; - uint64_t length; - uint64_t num_rows; - uint64_t first_row_of_stripe; -}; - // The number of rows to read in a ColumnVectorBatch constexpr int64_t kReadRowsBatch = 1000; @@ -206,8 +199,10 @@ class ORCFileReader::Impl { uint64_t first_row_of_stripe = 0; for (int i = 0; i < nstripes; ++i) { stripe = reader_->getStripe(i); - stripes_[i] = StripeInformation({stripe->getOffset(), stripe->getLength(), - stripe->getNumberOfRows(), first_row_of_stripe}); + stripes_[i] = StripeInformation({static_cast(stripe->getOffset()), + static_cast(stripe->getLength()), + static_cast(stripe->getNumberOfRows()), + static_cast(first_row_of_stripe)}); first_row_of_stripe += stripe->getNumberOfRows(); } return Status::OK(); @@ -217,6 +212,8 @@ class ORCFileReader::Impl { int64_t NumberOfRows() { return static_cast(reader_->getNumberOfRows()); } + StripeInformation GetStripeInformation(int64_t stripe) { return stripes_[stripe]; } + FileVersion GetFileVersion() { liborc::FileVersion orc_file_version = reader_->getFormatVersion(); return FileVersion(orc_file_version.getMajor(), orc_file_version.getMinor()); @@ -383,7 +380,8 @@ class ORCFileReader::Impl { ARROW_RETURN_IF(stripe < 0 || stripe >= NumberOfStripes(), Status::Invalid("Out of bounds stripe: ", stripe)); - opts->range(stripes_[stripe].offset, stripes_[stripe].length); + opts->range(static_cast(stripes_[stripe].offset), + static_cast(stripes_[stripe].length)); return Status::OK(); } @@ -393,9 +391,9 @@ class ORCFileReader::Impl { Status::Invalid("Out of bounds row number: ", row_number)); for (auto it = stripes_.begin(); it != stripes_.end(); it++) { - if (static_cast(row_number) >= it->first_row_of_stripe && - static_cast(row_number) < it->first_row_of_stripe + it->num_rows) { - opts->range(it->offset, it->length); + if (row_number >= it->first_row_id && + row_number < it->first_row_id + it->num_rows) { + opts->range(static_cast(it->offset), static_cast(it->length)); *out = *it; return Status::OK(); } @@ -411,7 +409,7 @@ class ORCFileReader::Impl { ARROW_RETURN_IF(*it < 0, Status::Invalid("Negative field index")); include_indices_list.push_back(*it); } - opts->includeTypes(include_indices_list); + opts->include(include_indices_list); return Status::OK(); } @@ -427,7 +425,8 @@ class ORCFileReader::Impl { liborc::RowReaderOptions opts(row_opts); std::vector> batches(stripes_.size()); for (size_t stripe = 0; stripe < stripes_.size(); stripe++) { - opts.range(stripes_[stripe].offset, stripes_[stripe].length); + opts.range(static_cast(stripes_[stripe].offset), + static_cast(stripes_[stripe].length)); ARROW_ASSIGN_OR_RAISE(batches[stripe], ReadBatch(opts, schema, 
stripes_[stripe].num_rows)); } @@ -488,7 +487,7 @@ class ORCFileReader::Impl { ORC_BEGIN_CATCH_NOT_OK row_reader = reader_->createRowReader(opts); row_reader->seekToRow(current_row_); - current_row_ = stripe_info.first_row_of_stripe + stripe_info.num_rows; + current_row_ = stripe_info.first_row_id + stripe_info.num_rows; ORC_END_CATCH_NOT_OK return std::make_shared(std::move(row_reader), schema, batch_size, @@ -600,6 +599,10 @@ int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); } int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); } +StripeInformation ORCFileReader::GetStripeInformation(int64_t stripe) { + return impl_->GetStripeInformation(stripe); +} + FileVersion ORCFileReader::GetFileVersion() { return impl_->GetFileVersion(); } std::string ORCFileReader::GetSoftwareVersion() { return impl_->GetSoftwareVersion(); } @@ -727,12 +730,23 @@ class ORCFileWriter::Impl { } Status Write(const Table& table) { - ARROW_ASSIGN_OR_RAISE(auto orc_schema, GetOrcType(*(table.schema()))); - ARROW_ASSIGN_OR_RAISE(auto orc_options, MakeOrcWriterOptions(write_options_)); + if (!writer_.get()) { + ARROW_ASSIGN_OR_RAISE(orc_schema_, GetOrcType(*(table.schema()))); + ARROW_ASSIGN_OR_RAISE(auto orc_options, MakeOrcWriterOptions(write_options_)); + arrow_schema_ = table.schema(); + ORC_CATCH_NOT_OK( + writer_ = liborc::createWriter(*orc_schema_, out_stream_.get(), orc_options)) + } else { + bool schemas_matching = table.schema()->Equals(arrow_schema_, false); + if (!schemas_matching) { + return Status::TypeError( + "The schema of the RecordBatch does not match" + " the initial schema. All exported RecordBatches/Tables" + " must have the same schema.\nInitial:\n", + *arrow_schema_, "\nCurrent:\n", *table.schema()); + } + } auto batch_size = static_cast(write_options_.batch_size); - ORC_CATCH_NOT_OK( - writer_ = liborc::createWriter(*orc_schema, out_stream_.get(), orc_options)) - int64_t num_rows = table.num_rows(); const int num_cols = table.num_columns(); std::vector arrow_index_offset(num_cols, 0); @@ -744,7 +758,7 @@ class ORCFileWriter::Impl { while (num_rows > 0) { for (int i = 0; i < num_cols; i++) { RETURN_NOT_OK(adapters::orc::WriteBatch( - *(table.column(i)), batch_size, &(arrow_chunk_offset[i]), + *table.column(i), batch_size, &(arrow_chunk_offset[i]), &(arrow_index_offset[i]), (root->fields)[i])); } root->numElements = (root->fields)[0]->numElements; @@ -765,7 +779,9 @@ class ORCFileWriter::Impl { private: std::unique_ptr writer_; std::unique_ptr out_stream_; + std::shared_ptr arrow_schema_; WriteOptions write_options_; + ORC_UNIQUE_PTR orc_schema_; }; ORCFileWriter::~ORCFileWriter() {} @@ -783,6 +799,11 @@ Result> ORCFileWriter::Open( Status ORCFileWriter::Write(const Table& table) { return impl_->Write(table); } +Status ORCFileWriter::Write(const RecordBatch& record_batch) { + auto table = Table::Make(record_batch.schema(), record_batch.columns()); + return impl_->Write(*table); +} + Status ORCFileWriter::Close() { return impl_->Close(); } } // namespace orc diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h index 59f63796bd0..013be78600a 100644 --- a/cpp/src/arrow/adapters/orc/adapter.h +++ b/cpp/src/arrow/adapters/orc/adapter.h @@ -35,6 +35,18 @@ namespace arrow { namespace adapters { namespace orc { +/// \brief Information about an ORC stripe +struct StripeInformation { + /// \brief Offset of the stripe from the start of the file, in bytes + int64_t offset; + /// \brief Length of the stripe, in bytes + int64_t 
length; + /// \brief Number of rows in the stripe + int64_t num_rows; + /// \brief Index of the first row of the stripe + int64_t first_row_id; +}; + /// \class ORCFileReader /// \brief Read an Arrow Table or RecordBatch from an ORC file. class ARROW_EXPORT ORCFileReader { @@ -168,6 +180,9 @@ class ARROW_EXPORT ORCFileReader { /// \brief The number of rows in the file int64_t NumberOfRows(); + /// \brief StripeInformation for each stripe. + StripeInformation GetStripeInformation(int64_t stripe); + /// \brief Get the format version of the file. /// Currently known values are 0.11 and 0.12. /// @@ -272,12 +287,24 @@ class ARROW_EXPORT ORCFileWriter { io::OutputStream* output_stream, const WriteOptions& write_options = WriteOptions()); - /// \brief Write a table + /// \brief Write a table. This can be called multiple times. + /// + /// Tables passed in subsequent calls must match the schema of the table that was + /// written first. /// - /// \param[in] table the Arrow table from which data is extracted + /// \param[in] table the Arrow table from which data is extracted. /// \return Status Status Write(const Table& table); + /// \brief Write a RecordBatch. This can be called multiple times. + /// + /// RecordBatches passed in subsequent calls must match the schema of the + /// RecordBatch that was written first. + /// + /// \param[in] record_batch the Arrow RecordBatch from which data is extracted. + /// \return Status + Status Write(const RecordBatch& record_batch); + /// \brief Close an ORC writer (orc::Writer) /// /// \return Status diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index 6914d6b9c18..c119e5cbeb8 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
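To make the incremental-write contract above concrete, here is a minimal usage sketch; the helper name WriteBatchesToOrc and the output path are assumptions for illustration, not part of this patch.

// Write a sequence of RecordBatches to a single ORC file.
#include <arrow/adapters/orc/adapter.h>
#include <arrow/api.h>
#include <arrow/io/api.h>

#include <memory>
#include <string>
#include <vector>

arrow::Status WriteBatchesToOrc(
    const std::vector<std::shared_ptr<arrow::RecordBatch>>& batches,
    const std::string& path) {
  ARROW_ASSIGN_OR_RAISE(auto outfile, arrow::io::FileOutputStream::Open(path));
  ARROW_ASSIGN_OR_RAISE(auto writer,
                        arrow::adapters::orc::ORCFileWriter::Open(outfile.get()));
  // Every batch after the first must carry the same schema; otherwise
  // Write() returns a TypeError status.
  for (const auto& batch : batches) {
    ARROW_RETURN_NOT_OK(writer->Write(*batch));
  }
  return writer->Close();
}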
+#include #include #include @@ -42,22 +43,22 @@ namespace arrow { using internal::checked_pointer_cast; -constexpr int kDefaultSmallMemStreamSize = 16384 * 5; // 80KB -constexpr int kDefaultMemStreamSize = 10 * 1024 * 1024; +constexpr size_t kDefaultSmallMemStreamSize = 16384 * 5; // 80KB +constexpr size_t kDefaultMemStreamSize = 10 * 1024 * 1024; constexpr int64_t kNanoMax = std::numeric_limits::max(); constexpr int64_t kNanoMin = std::numeric_limits::lowest(); -const int64_t kMicroMax = std::floor(kNanoMax / 1000); -const int64_t kMicroMin = std::ceil(kNanoMin / 1000); -const int64_t kMilliMax = std::floor(kMicroMax / 1000); -const int64_t kMilliMin = std::ceil(kMicroMin / 1000); -const int64_t kSecondMax = std::floor(kMilliMax / 1000); -const int64_t kSecondMin = std::ceil(kMilliMin / 1000); +const int64_t kMicroMax = static_cast(std::floor(kNanoMax / 1000)); +const int64_t kMicroMin = static_cast(std::ceil(kNanoMin / 1000)); +const int64_t kMilliMax = static_cast(std::floor(kMicroMax / 1000)); +const int64_t kMilliMin = static_cast(std::ceil(kMicroMin / 1000)); +const int64_t kSecondMax = static_cast(std::floor(kMilliMax / 1000)); +const int64_t kSecondMin = static_cast(std::ceil(kMilliMin / 1000)); static constexpr random::SeedType kRandomSeed = 0x0ff1ce; class MemoryOutputStream : public liborc::OutputStream { public: - explicit MemoryOutputStream(ssize_t capacity) + explicit MemoryOutputStream(size_t capacity) : data_(capacity), name_("MemoryOutputStream"), length_(0) {} uint64_t getLength() const override { return length_; } @@ -86,12 +87,13 @@ class MemoryOutputStream : public liborc::OutputStream { std::shared_ptr GenerateFixedDifferenceBuffer(int32_t fixed_length, int64_t length) { BufferBuilder builder; - int32_t offsets[length]; + std::vector offsets; + offsets.resize(length); ARROW_EXPECT_OK(builder.Resize(4 * length)); - for (int32_t i = 0; i < length; i++) { - offsets[i] = fixed_length * i; + for (int64_t i = 0; i < length; i++) { + offsets[i] = static_cast(fixed_length * i); } - ARROW_EXPECT_OK(builder.Append(offsets, 4 * length)); + ARROW_EXPECT_OK(builder.Append(offsets.data(), 4 * length)); std::shared_ptr buffer; ARROW_EXPECT_OK(builder.Finish(&buffer)); return buffer; @@ -173,7 +175,7 @@ void RandWeakComposition(int64_t n, T sum, std::vector* out) { return static_cast(res); }); (*out)[n - 1] += remaining_sum; - std::random_shuffle(out->begin(), out->end()); + std::shuffle(out->begin(), out->end(), gen); } std::shared_ptr GenerateRandomChunkedArray( @@ -223,9 +225,10 @@ std::shared_ptr GenerateRandomTable(const std::shared_ptr& schema return Table::Make(schema, cv); } -void AssertTableWriteReadEqual(const std::shared_ptr
& input_table,
+void AssertTableWriteReadEqual(const std::vector<std::shared_ptr<Table>>& input_tables,
                                const std::shared_ptr<Table>
& expected_output_table, - const int64_t max_size = kDefaultSmallMemStreamSize) { + const int64_t max_size = kDefaultSmallMemStreamSize, + std::vector* opt_selected_read_indices = nullptr) { EXPECT_OK_AND_ASSIGN(auto buffer_output_stream, io::BufferOutputStream::Create(max_size)); auto write_options = adapters::orc::WriteOptions(); @@ -239,7 +242,46 @@ void AssertTableWriteReadEqual(const std::shared_ptr
& input_table, write_options.row_index_stride = 5000; EXPECT_OK_AND_ASSIGN(auto writer, adapters::orc::ORCFileWriter::Open( buffer_output_stream.get(), write_options)); - ARROW_EXPECT_OK(writer->Write(*input_table)); + for (const auto& input_table : input_tables) { + ARROW_EXPECT_OK(writer->Write(*input_table)); + } + ARROW_EXPECT_OK(writer->Close()); + EXPECT_OK_AND_ASSIGN(auto buffer, buffer_output_stream->Finish()); + std::shared_ptr in_stream(new io::BufferReader(buffer)); + EXPECT_OK_AND_ASSIGN( + auto reader, adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool())); + ASSERT_EQ(reader->GetFileVersion(), write_options.file_version); + ASSERT_EQ(reader->GetCompression(), write_options.compression); + ASSERT_EQ(reader->GetCompressionSize(), write_options.compression_block_size); + ASSERT_EQ(reader->GetRowIndexStride(), write_options.row_index_stride); + EXPECT_OK_AND_ASSIGN(auto actual_output_table, + opt_selected_read_indices == nullptr + ? reader->Read() + : reader->Read(*opt_selected_read_indices)); + ASSERT_OK(actual_output_table->ValidateFull()); + AssertTablesEqual(*expected_output_table, *actual_output_table, false, false); +} + +void AssertBatchWriteReadEqual( + const std::vector>& input_batches, + const std::shared_ptr
& expected_output_table, + const int64_t max_size = kDefaultSmallMemStreamSize) { + EXPECT_OK_AND_ASSIGN(auto buffer_output_stream, + io::BufferOutputStream::Create(max_size)); + auto write_options = adapters::orc::WriteOptions(); +#ifdef ARROW_WITH_SNAPPY + write_options.compression = Compression::SNAPPY; +#else + write_options.compression = Compression::UNCOMPRESSED; +#endif + write_options.file_version = adapters::orc::FileVersion(0, 11); + write_options.compression_block_size = 32768; + write_options.row_index_stride = 5000; + EXPECT_OK_AND_ASSIGN(auto writer, adapters::orc::ORCFileWriter::Open( + buffer_output_stream.get(), write_options)); + for (auto& input_batch : input_batches) { + ARROW_EXPECT_OK(writer->Write(*input_batch)); + } ARROW_EXPECT_OK(writer->Close()); EXPECT_OK_AND_ASSIGN(auto buffer, buffer_output_stream->Finish()); std::shared_ptr in_stream(new io::BufferReader(buffer)); @@ -253,6 +295,15 @@ void AssertTableWriteReadEqual(const std::shared_ptr
& input_table, AssertTablesEqual(*expected_output_table, *actual_output_table, false, false); } +void AssertTableWriteReadEqual(const std::shared_ptr
& input_table, + const std::shared_ptr
& expected_output_table,
+                               const int64_t max_size = kDefaultSmallMemStreamSize,
+                               std::vector<int>* opt_selected_read_indices = nullptr) {
+  std::vector<std::shared_ptr<Table>> input_tables;
+  input_tables.push_back(input_table);
+  AssertTableWriteReadEqual(input_tables, expected_output_table, max_size,
+                            opt_selected_read_indices);
+}
 void AssertArrayWriteReadEqual(const std::shared_ptr<Array>& input_array,
                                const std::shared_ptr<Array>& expected_output_array,
                                const int64_t max_size = kDefaultSmallMemStreamSize) {
@@ -341,6 +392,10 @@ TEST(TestAdapterRead, ReadIntAndStringFileMultipleStripes) {
   ASSERT_TRUE(metadata->Equals(*expected_metadata));
   ASSERT_EQ(stripe_row_count * stripe_count, reader->NumberOfRows());
   ASSERT_EQ(stripe_count, reader->NumberOfStripes());
+  ASSERT_EQ(static_cast<int64_t>(stripe_row_count),
+            reader->GetStripeInformation(0).num_rows);
+  ASSERT_EQ(static_cast<int64_t>(reader->NumberOfRows() - stripe_row_count),
+            reader->GetStripeInformation(stripe_count - 1).first_row_id);
   accumulated = 0;
   EXPECT_OK_AND_ASSIGN(auto stripe_reader, reader->NextStripeReader(reader_batch_size));
   while (stripe_reader) {
@@ -450,6 +505,37 @@ TEST_F(TestORCWriterTrivialNoConversion, writeChunkless) {
   std::shared_ptr<Table>
table = TableFromJSON(table_schema, {}); AssertTableWriteReadEqual(table, table, kDefaultSmallMemStreamSize / 16); } +TEST_F(TestORCWriterTrivialNoConversion, writeTrivialChunkAndSelectField) { + std::shared_ptr
table = TableFromJSON(table_schema, {R"([])"});
+  std::shared_ptr<Schema> schema_selected =
+      schema({field("int8", int8()), field("int32", int32())});
+  std::shared_ptr<Table>
table_selected = TableFromJSON(schema_selected, {R"([])"}); + std::vector selected_indices = {1, 3}; + AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize / 16, + &selected_indices); +} +TEST_F(TestORCWriterTrivialNoConversion, writeFilledChunkAndSelectField) { + std::vector selected_indices = {1, 7}; + random::RandomArrayGenerator rand(kRandomSeed); + std::shared_ptr local_schema = schema({ + field("bool", boolean()), + field("int32", int32()), + field("int64", int64()), + field("float", float32()), + field("struct", struct_({field("a", utf8()), field("b", int64())})), + field("double", float64()), + field("date32", date32()), + field("ts3", timestamp(TimeUnit::NANO)), + field("string", utf8()), + field("binary", binary()), + }); + auto batch = rand.BatchOf(local_schema->fields(), 100); + std::shared_ptr
table = Table::Make(local_schema, batch->columns()); + EXPECT_OK_AND_ASSIGN(auto table_selected, table->SelectColumns(selected_indices)); + AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize, + &selected_indices); +} + class TestORCWriterTrivialWithConversion : public ::testing::Test { public: TestORCWriterTrivialWithConversion() { @@ -487,6 +573,21 @@ TEST_F(TestORCWriterTrivialWithConversion, writeChunkless) { kDefaultSmallMemStreamSize / 16); } +class TestORCWriterInvalidTypes : public ::testing::Test {}; + +TEST_F(TestORCWriterInvalidTypes, noWriteInvalidTypes) { + // Unsigned integers are not supported by ORC + std::shared_ptr table_schema = schema({field("uint64", uint64())}); + const std::shared_ptr
table = GenerateRandomTable(table_schema, 100, 1, 1, 0); + EXPECT_OK_AND_ASSIGN(auto buffer_output_stream, + io::BufferOutputStream::Create(kDefaultSmallMemStreamSize / 16)); + EXPECT_OK_AND_ASSIGN(auto writer, + adapters::orc::ORCFileWriter::Open(buffer_output_stream.get())); + EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented, + testing::HasSubstr("Unknown or unsupported Arrow type"), + writer->Write(*table)); +} + // General class TestORCWriterNoConversion : public ::testing::Test { @@ -730,4 +831,69 @@ TEST_F(TestORCWriterSingleArray, WriteListOfMap) { AssertArrayWriteReadEqual(array, array, kDefaultSmallMemStreamSize * 10); } +class TestORCWriterMultipleWrite : public ::testing::Test { + public: + TestORCWriterMultipleWrite() : rand(kRandomSeed) {} + + protected: + random::RandomArrayGenerator rand; +}; + +TEST_F(TestORCWriterMultipleWrite, MultipleWritesIntField) { + const int64_t num_rows = 1234; + const int num_writes = 5; + std::shared_ptr input_schema = schema({field("col0", int32())}); + ArrayVector vect; + std::vector> input_tables; + for (int i = 0; i < num_writes; i++) { + auto array_int = rand.ArrayOf(int32(), num_rows, 0); + vect.push_back(array_int); + auto input_chunked_array = std::make_shared(array_int); + input_tables.emplace_back(Table::Make(input_schema, {input_chunked_array})); + } + auto expected_output_chunked_array = std::make_shared(vect); + std::shared_ptr
expected_output_table = + Table::Make(input_schema, {expected_output_chunked_array}); + AssertTableWriteReadEqual(input_tables, expected_output_table, + kDefaultSmallMemStreamSize * 100); +} + +TEST_F(TestORCWriterMultipleWrite, MultipleWritesIncoherentSchema) { + const int64_t num_rows = 1234; + auto array_int = rand.ArrayOf(int32(), num_rows, 0); + std::shared_ptr input_schema = schema({field("col0", array_int->type())}); + auto array_int2 = rand.ArrayOf(int64(), num_rows, 0); + std::shared_ptr input_schema2 = schema({field("col0", array_int2->type())}); + + std::shared_ptr
input_table = Table::Make(input_schema, {array_int}); + std::shared_ptr
input_table2 = Table::Make(input_schema2, {array_int2}); + EXPECT_OK_AND_ASSIGN(auto buffer_output_stream, + io::BufferOutputStream::Create(kDefaultSmallMemStreamSize)); + auto write_options = adapters::orc::WriteOptions(); + EXPECT_OK_AND_ASSIGN(auto writer, adapters::orc::ORCFileWriter::Open( + buffer_output_stream.get(), write_options)); + ARROW_EXPECT_OK(writer->Write(*input_table)); + + // This should not pass + ASSERT_RAISES(TypeError, writer->Write(*input_table2)); + + ARROW_EXPECT_OK(writer->Close()); +} +TEST_F(TestORCWriterMultipleWrite, MultipleWritesIntFieldRecordBatch) { + const int64_t num_rows = 1234; + const int num_writes = 5; + std::shared_ptr input_schema = schema({field("col0", int32())}); + ArrayVector vect; + std::vector> input_batches; + for (int i = 0; i < num_writes; i++) { + auto array_int = rand.ArrayOf(int32(), num_rows, 0); + vect.push_back(array_int); + input_batches.emplace_back(RecordBatch::Make(input_schema, num_rows, {array_int})); + } + auto expected_output_chunked_array = std::make_shared(vect); + std::shared_ptr
expected_output_table = + Table::Make(input_schema, {expected_output_chunked_array}); + AssertBatchWriteReadEqual(input_batches, expected_output_table, + kDefaultSmallMemStreamSize * 100); +} } // namespace arrow diff --git a/cpp/src/arrow/adapters/orc/util.cc b/cpp/src/arrow/adapters/orc/util.cc index dbdb110fb46..234fb32bee6 100644 --- a/cpp/src/arrow/adapters/orc/util.cc +++ b/cpp/src/arrow/adapters/orc/util.cc @@ -19,6 +19,7 @@ #include #include +#include #include #include "arrow/array/builder_base.h" @@ -30,7 +31,7 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" #include "arrow/util/range.h" -#include "arrow/util/string_view.h" +#include "arrow/util/string.h" #include "arrow/visit_data_inline.h" #include "orc/Exceptions.hh" @@ -43,6 +44,7 @@ namespace liborc = orc; namespace arrow { using internal::checked_cast; +using internal::ToChars; namespace adapters { namespace orc { @@ -462,7 +464,7 @@ struct Appender { running_arrow_offset++; return Status::OK(); } - Status VisitValue(util::string_view v) { + Status VisitValue(std::string_view v) { batch->notNull[running_orc_offset] = true; COffsetType data_length = 0; batch->data[running_orc_offset] = reinterpret_cast( @@ -486,7 +488,7 @@ struct Appender { running_arrow_offset++; return Status::OK(); } - Status VisitValue(util::string_view v) { + Status VisitValue(std::string_view v) { batch->notNull[running_orc_offset] = true; const Decimal128 dec_value(array.GetValue(running_arrow_offset)); batch->values[running_orc_offset] = static_cast(dec_value.low_bits()); @@ -507,7 +509,7 @@ struct Appender { running_arrow_offset++; return Status::OK(); } - Status VisitValue(util::string_view v) { + Status VisitValue(std::string_view v) { batch->notNull[running_orc_offset] = true; const Decimal128 dec_value(array.GetValue(running_arrow_offset)); batch->values[running_orc_offset] = @@ -557,7 +559,7 @@ struct FixedSizeBinaryAppender { running_arrow_offset++; return Status::OK(); } - Status VisitValue(util::string_view v) { + Status VisitValue(std::string_view v) { batch->notNull[running_orc_offset] = true; batch->data[running_orc_offset] = reinterpret_cast( const_cast(array.GetValue(running_arrow_offset))); @@ -1020,7 +1022,7 @@ Result> GetArrowType(const liborc::Type* type) { std::vector type_codes(subtype_count); for (int child = 0; child < subtype_count; ++child) { ARROW_ASSIGN_OR_RAISE(auto elem_type, GetArrowType(type->getSubtype(child))); - fields[child] = field("_union_" + std::to_string(child), std::move(elem_type)); + fields[child] = field("_union_" + ToChars(child), std::move(elem_type)); type_codes[child] = static_cast(child); } return sparse_union(std::move(fields), std::move(type_codes)); diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index b1892e1f2c8..2333a0c06fb 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -207,7 +207,7 @@ class ARROW_EXPORT Array { private: ARROW_DISALLOW_COPY_AND_ASSIGN(Array); - ARROW_EXPORT friend void PrintTo(const Array& x, std::ostream* os); + ARROW_FRIEND_EXPORT friend void PrintTo(const Array& x, std::ostream* os); }; static inline std::ostream& operator<<(std::ostream& os, const Array& x) { diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h index 04ee804987f..7e58a96ff84 100644 --- a/cpp/src/arrow/array/array_binary.h +++ b/cpp/src/arrow/array/array_binary.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "arrow/array/array_base.h" @@ -32,7 +33,6 @@ 
#include "arrow/type.h" #include "arrow/util/checked_cast.h" #include "arrow/util/macros.h" -#include "arrow/util/string_view.h" // IWYU pragma: export #include "arrow/util/visibility.h" namespace arrow { @@ -67,15 +67,15 @@ class BaseBinaryArray : public FlatArray { /// /// \param i the value index /// \return the view over the selected value - util::string_view GetView(int64_t i) const { + std::string_view GetView(int64_t i) const { // Account for base offset i += data_->offset; const offset_type pos = raw_value_offsets_[i]; - return util::string_view(reinterpret_cast(raw_data_ + pos), - raw_value_offsets_[i + 1] - pos); + return std::string_view(reinterpret_cast(raw_data_ + pos), + raw_value_offsets_[i + 1] - pos); } - util::optional operator[](int64_t i) const { + std::optional operator[](int64_t i) const { return *IteratorType(*this, i); } @@ -84,7 +84,7 @@ class BaseBinaryArray : public FlatArray { /// /// \param i the value index /// \return the view over the selected value - util::string_view Value(int64_t i) const { return GetView(i); } + std::string_view Value(int64_t i) const { return GetView(i); } /// \brief Get binary value as a std::string /// @@ -236,11 +236,11 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { const uint8_t* GetValue(int64_t i) const; const uint8_t* Value(int64_t i) const { return GetValue(i); } - util::string_view GetView(int64_t i) const { - return util::string_view(reinterpret_cast(GetValue(i)), byte_width()); + std::string_view GetView(int64_t i) const { + return std::string_view(reinterpret_cast(GetValue(i)), byte_width()); } - util::optional operator[](int64_t i) const { + std::optional operator[](int64_t i) const { return *IteratorType(*this, i); } diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index b7225eb8b7d..3bc9bb91a02 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -37,7 +38,6 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_builders.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/string_view.h" #include "arrow/visit_data_inline.h" namespace arrow { @@ -63,7 +63,7 @@ void CheckStringArray(const ArrayType& array, const std::vector& st auto view = array.GetView(i); ASSERT_EQ(value_pos, array.value_offset(i)); ASSERT_EQ(strings[j].size(), view.size()); - ASSERT_EQ(util::string_view(strings[j]), view); + ASSERT_EQ(std::string_view(strings[j]), view); value_pos += static_cast(view.size()); } else { ASSERT_TRUE(array.IsNull(i)); @@ -256,7 +256,7 @@ class TestStringArray : public ::testing::Test { } Status ValidateFull(int64_t length, std::vector offsets, - util::string_view data, int64_t offset = 0) { + std::string_view data, int64_t offset = 0) { ArrayType arr(length, Buffer::Wrap(offsets), std::make_shared(data), /*null_bitmap=*/nullptr, /*null_count=*/0, offset); return arr.ValidateFull(); @@ -373,7 +373,7 @@ class TestUTF8Array : public ::testing::Test { using ArrayType = typename TypeTraits::ArrayType; Status ValidateUTF8(int64_t length, std::vector offsets, - util::string_view data, int64_t offset = 0) { + std::string_view data, int64_t offset = 0) { ArrayType arr(length, Buffer::Wrap(offsets), std::make_shared(data), /*null_bitmap=*/nullptr, /*null_count=*/0, offset); return arr.ValidateUTF8(); @@ -867,12 +867,12 @@ struct BinaryAppender { return Status::OK(); } - Status VisitValue(util::string_view v) { + Status 
VisitValue(std::string_view v) { data.push_back(v); return Status::OK(); } - std::vector data; + std::vector data; }; template diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index 9193e1d21ac..bfa732f165f 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -711,7 +711,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, ArrayInit) { // Build the dictionary Array auto value_type = fixed_size_binary(4); auto dict_array = ArrayFromJSON(value_type, R"(["abcd", "wxyz"])"); - util::string_view test = "abcd", test2 = "wxyz"; + std::string_view test = "abcd", test2 = "wxyz"; DictionaryBuilder builder(dict_array); ASSERT_OK(builder.Append(test)); ASSERT_OK(builder.Append(test2)); @@ -735,7 +735,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, MakeBuilder) { std::unique_ptr boxed_builder; ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder)); auto& builder = checked_cast&>(*boxed_builder); - util::string_view test = "abcd", test2 = "wxyz"; + std::string_view test = "abcd", test2 = "wxyz"; ASSERT_OK(builder.Append(test)); ASSERT_OK(builder.Append(test2)); ASSERT_OK(builder.Append(test)); @@ -1317,12 +1317,12 @@ TEST(TestDictionary, ListOfDictionary) { ASSERT_OK(list_builder->Append()); std::vector expected; - for (char a : util::string_view("abc")) { - for (char d : util::string_view("def")) { - for (char g : util::string_view("ghi")) { - for (char j : util::string_view("jkl")) { - for (char m : util::string_view("mno")) { - for (char p : util::string_view("pqr")) { + for (char a : std::string_view("abc")) { + for (char d : std::string_view("def")) { + for (char g : std::string_view("ghi")) { + for (char j : std::string_view("jkl")) { + for (char m : std::string_view("mno")) { + for (char p : std::string_view("pqr")) { if ((static_cast(a) + d + g + j + m + p) % 16 == 0) { ASSERT_OK(list_builder->Append()); } diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index def01379e06..f8c24b71e06 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -197,6 +197,42 @@ class TestListArray : public ::testing::Test { EXPECT_FALSE(left->Slice(offset)->Equals(right->Slice(offset))); } + void TestFromArraysWithNullBitMap() { + std::shared_ptr offsets_w_nulls, offsets_wo_nulls, values; + + std::vector offsets = {0, 1, 1, 3, 4}; + std::vector offsets_w_nulls_is_valid = {true, false, true, true, true}; + + ArrayFromVector(offsets_w_nulls_is_valid, offsets, + &offsets_w_nulls); + ArrayFromVector(offsets, &offsets_wo_nulls); + + auto type = std::make_shared(int32()); + auto expected = std::dynamic_pointer_cast( + ArrayFromJSON(type, "[[0], null, [0, null], [0]]")); + values = expected->values(); + + // Offsets with nulls will match. 
+ ASSERT_OK_AND_ASSIGN(auto result, + ArrayType::FromArrays(*offsets_w_nulls, *values, pool_)); + AssertArraysEqual(*result, *expected); + + // Offets without nulls, will replace null with empty list + ASSERT_OK_AND_ASSIGN(result, + ArrayType::FromArrays(*offsets_wo_nulls, *values, pool_)); + AssertArraysEqual(*result, *std::dynamic_pointer_cast( + ArrayFromJSON(type, "[[0], [], [0, null], [0]]"))); + + // Specify non-null offsets with null_bitmap + ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*offsets_wo_nulls, *values, pool_, + expected->null_bitmap())); + AssertArraysEqual(*result, *expected); + + // Cannot specify both null offsets with null_bitmap + ASSERT_RAISES(Invalid, ArrayType::FromArrays(*offsets_w_nulls, *values, pool_, + expected->null_bitmap())); + } + void TestFromArrays() { std::shared_ptr offsets1, offsets2, offsets3, offsets4, offsets5, values; @@ -539,6 +575,10 @@ TYPED_TEST(TestListArray, ValuesEquality) { this->TestValuesEquality(); } TYPED_TEST(TestListArray, FromArrays) { this->TestFromArrays(); } +TYPED_TEST(TestListArray, FromArraysWithNullBitMap) { + this->TestFromArraysWithNullBitMap(); +} + TYPED_TEST(TestListArray, AppendNull) { this->TestAppendNull(); } TYPED_TEST(TestListArray, AppendNulls) { this->TestAppendNulls(); } @@ -607,11 +647,11 @@ TEST_F(TestMapArray, Equality) { std::shared_ptr array, equal_array, unequal_array; std::vector equal_offsets = {0, 1, 2, 5, 6, 7, 8, 10}; - std::vector equal_keys = {"a", "a", "a", "b", "c", - "a", "a", "a", "a", "b"}; + std::vector equal_keys = {"a", "a", "a", "b", "c", + "a", "a", "a", "a", "b"}; std::vector equal_values = {1, 2, 3, 4, 5, 2, 2, 2, 5, 6}; std::vector unequal_offsets = {0, 1, 4, 7}; - std::vector unequal_keys = {"a", "a", "b", "c", "a", "b", "c"}; + std::vector unequal_keys = {"a", "a", "b", "c", "a", "b", "c"}; std::vector unequal_values = {1, 2, 2, 2, 3, 4, 5}; // setup two equal arrays diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 64edec0c7aa..628259f0f6c 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -17,6 +17,7 @@ #include "arrow/array/array_nested.h" +#include #include #include #include @@ -33,7 +34,6 @@ #include "arrow/type.h" #include "arrow/type_fwd.h" #include "arrow/type_traits.h" -#include "arrow/util/atomic_shared_ptr.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_generate.h" #include "arrow/util/bitmap_ops.h" @@ -103,7 +103,8 @@ Status CleanListOffsets(const Array& offsets, MemoryPool* pool, template Result::ArrayType>> ListArrayFromArrays( std::shared_ptr type, const Array& offsets, const Array& values, - MemoryPool* pool) { + MemoryPool* pool, std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount) { using offset_type = typename TYPE::offset_type; using ArrayType = typename TypeTraits::ArrayType; using OffsetArrowType = typename CTypeTraits::ArrowType; @@ -116,14 +117,24 @@ Result::ArrayType>> ListArrayFromArray return Status::TypeError("List offsets must be ", OffsetArrowType::type_name()); } + if (null_bitmap != nullptr && offsets.null_count() > 0) { + return Status::Invalid( + "Ambiguous to specify both validity map and offsets with nulls"); + } + + if (null_bitmap != nullptr && offsets.offset() != 0) { + return Status::NotImplemented("Null bitmap with offsets slice not supported."); + } + std::shared_ptr offset_buf, validity_buf; RETURN_NOT_OK(CleanListOffsets(offsets, pool, &offset_buf, &validity_buf)); - BufferVector buffers = {validity_buf, 
offset_buf}; + int64_t null_count_ = null_bitmap ? null_count : offsets.null_count(); + BufferVector buffers = {null_bitmap ? std::move(null_bitmap) : validity_buf, + offset_buf}; - auto internal_data = ArrayData::Make(type, offsets.length() - 1, std::move(buffers), - offsets.null_count(), offsets.offset()); + std::shared_ptr internal_data = ArrayData::Make( + type, offsets.length() - 1, std::move(buffers), null_count_, offsets.offset()); internal_data->child_data.push_back(values.data()); - return std::make_shared(internal_data); } @@ -231,17 +242,16 @@ void LargeListArray::SetData(const std::shared_ptr& data) { internal::SetListData(this, data); } -Result> ListArray::FromArrays(const Array& offsets, - const Array& values, - MemoryPool* pool) { +Result> ListArray::FromArrays( + const Array& offsets, const Array& values, MemoryPool* pool, + std::shared_ptr null_bitmap, int64_t null_count) { return ListArrayFromArrays(std::make_shared(values.type()), offsets, - values, pool); + values, pool, null_bitmap, null_count); } -Result> ListArray::FromArrays(std::shared_ptr type, - const Array& offsets, - const Array& values, - MemoryPool* pool) { +Result> ListArray::FromArrays( + std::shared_ptr type, const Array& offsets, const Array& values, + MemoryPool* pool, std::shared_ptr null_bitmap, int64_t null_count) { if (type->id() != Type::LIST) { return Status::TypeError("Expected list type, got ", type->ToString()); } @@ -249,19 +259,21 @@ Result> ListArray::FromArrays(std::shared_ptrEquals(values.type())) { return Status::TypeError("Mismatching list value type"); } - return ListArrayFromArrays(std::move(type), offsets, values, pool); + return ListArrayFromArrays(std::move(type), offsets, values, pool, + null_bitmap, null_count); } -Result> LargeListArray::FromArrays(const Array& offsets, - const Array& values, - MemoryPool* pool) { +Result> LargeListArray::FromArrays( + const Array& offsets, const Array& values, MemoryPool* pool, + std::shared_ptr null_bitmap, int64_t null_count) { return ListArrayFromArrays( - std::make_shared(values.type()), offsets, values, pool); + std::make_shared(values.type()), offsets, values, pool, null_bitmap, + null_count); } Result> LargeListArray::FromArrays( std::shared_ptr type, const Array& offsets, const Array& values, - MemoryPool* pool) { + MemoryPool* pool, std::shared_ptr null_bitmap, int64_t null_count) { if (type->id() != Type::LARGE_LIST) { return Status::TypeError("Expected large list type, got ", type->ToString()); } @@ -269,7 +281,8 @@ Result> LargeListArray::FromArrays( if (!list_type.value_type()->Equals(values.type())) { return Status::TypeError("Mismatching list value type"); } - return ListArrayFromArrays(std::move(type), offsets, values, pool); + return ListArrayFromArrays(std::move(type), offsets, values, pool, + null_bitmap, null_count); } Result> ListArray::Flatten(MemoryPool* memory_pool) const { @@ -570,7 +583,7 @@ const ArrayVector& StructArray::fields() const { } const std::shared_ptr& StructArray::field(int i) const { - std::shared_ptr result = internal::atomic_load(&boxed_fields_[i]); + std::shared_ptr result = std::atomic_load(&boxed_fields_[i]); if (!result) { std::shared_ptr field_data; if (data_->offset != 0 || data_->child_data[i]->length != data_->length) { @@ -579,7 +592,7 @@ const std::shared_ptr& StructArray::field(int i) const { field_data = data_->child_data[i]; } std::shared_ptr result = MakeArray(field_data); - internal::atomic_store(&boxed_fields_[i], result); + std::atomic_store(&boxed_fields_[i], result); return 
boxed_fields_[i]; } return boxed_fields_[i]; @@ -834,7 +847,7 @@ std::shared_ptr UnionArray::field(int i) const { static_cast(i) >= boxed_fields_.size()) { return nullptr; } - std::shared_ptr result = internal::atomic_load(&boxed_fields_[i]); + std::shared_ptr result = std::atomic_load(&boxed_fields_[i]); if (!result) { std::shared_ptr child_data = data_->child_data[i]->Copy(); if (mode() == UnionMode::SPARSE) { @@ -846,7 +859,7 @@ std::shared_ptr UnionArray::field(int i) const { } } result = MakeArray(child_data); - internal::atomic_store(&boxed_fields_[i], result); + std::atomic_store(&boxed_fields_[i], result); } return result; } diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 5d04bef4f9e..6fb3fd3c918 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -69,9 +69,11 @@ class BaseListArray : public Array { const TypeClass* list_type() const { return list_type_; } /// \brief Return array object containing the list's values + /// + /// Note that this buffer does not account for any slice offset or length. std::shared_ptr values() const { return values_; } - /// Note that this buffer does not account for any slice offset + /// Note that this buffer does not account for any slice offset or length. std::shared_ptr value_offsets() const { return data_->buffers[1]; } std::shared_ptr value_type() const { return list_type_->value_type(); } @@ -120,18 +122,26 @@ class ARROW_EXPORT ListArray : public BaseListArray { /// the offsets contain any nulls). If the offsets do not have nulls, they /// are assumed to be well-formed /// + /// Offsets of an Array's null bitmap can be present or an explicit + /// null_bitmap, but not both. + /// /// \param[in] offsets Array containing n + 1 offsets encoding length and /// size. Must be of int32 type /// \param[in] values Array containing list values /// \param[in] pool MemoryPool in case new offsets array needs to be /// allocated because of null values + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap static Result> FromArrays( - const Array& offsets, const Array& values, - MemoryPool* pool = default_memory_pool()); + const Array& offsets, const Array& values, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); static Result> FromArrays( std::shared_ptr type, const Array& offsets, const Array& values, - MemoryPool* pool = default_memory_pool()); + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); /// \brief Return an Array that is a concatenation of the lists in this array. 
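// A minimal usage sketch (not part of this patch) for the FromArrays overload
// declared above that accepts an explicit validity bitmap. The helper name,
// the JSON inputs, and the ArrayFromJSON testing utility are illustrative
// assumptions; it needs "arrow/api.h", "arrow/util/bit_util.h" and
// "arrow/testing/gtest_util.h".
arrow::Status MakeListWithExplicitValidity(std::shared_ptr<arrow::ListArray>* out) {
  // Offsets must not contain nulls when an explicit bitmap is supplied;
  // combining both makes FromArrays return Status::Invalid.
  std::shared_ptr<arrow::Array> offsets =
      arrow::ArrayFromJSON(arrow::int32(), "[0, 1, 1, 3, 4]");
  std::shared_ptr<arrow::Array> values =
      arrow::ArrayFromJSON(arrow::int32(), "[0, 0, 0, 0]");
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Buffer> validity,
                        arrow::AllocateEmptyBitmap(/*length=*/4));
  arrow::bit_util::SetBit(validity->mutable_data(), 0);  // only list 0 is valid
  ARROW_ASSIGN_OR_RAISE(*out, arrow::ListArray::FromArrays(
                                  *offsets, *values, arrow::default_memory_pool(),
                                  validity));
  return arrow::Status::OK();
}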
/// @@ -178,13 +188,18 @@ class ARROW_EXPORT LargeListArray : public BaseListArray { /// \param[in] values Array containing list values /// \param[in] pool MemoryPool in case new offsets array needs to be /// allocated because of null values + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap static Result> FromArrays( - const Array& offsets, const Array& values, - MemoryPool* pool = default_memory_pool()); + const Array& offsets, const Array& values, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); static Result> FromArrays( std::shared_ptr type, const Array& offsets, const Array& values, - MemoryPool* pool = default_memory_pool()); + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); /// \brief Return an Array that is a concatenation of the lists in this array. /// diff --git a/cpp/src/arrow/array/array_primitive.h b/cpp/src/arrow/array/array_primitive.h index 740a4806a4d..e6df92e3b78 100644 --- a/cpp/src/arrow/array/array_primitive.h +++ b/cpp/src/arrow/array/array_primitive.h @@ -54,7 +54,7 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { bool GetView(int64_t i) const { return Value(i); } - util::optional operator[](int64_t i) const { return *IteratorType(*this, i); } + std::optional operator[](int64_t i) const { return *IteratorType(*this, i); } /// \brief Return the number of false (0) values among the valid /// values. Result is not cached. @@ -111,7 +111,7 @@ class NumericArray : public PrimitiveArray { // For API compatibility with BinaryArray etc. value_type GetView(int64_t i) const { return Value(i); } - util::optional operator[](int64_t i) const { + std::optional operator[](int64_t i) const { return *IteratorType(*this, i); } @@ -152,7 +152,7 @@ class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray { IteratorType end() const { return IteratorType(*this, length()); } - util::optional operator[](int64_t i) const { + std::optional operator[](int64_t i) const { return *IteratorType(*this, i); } @@ -188,7 +188,7 @@ class ARROW_EXPORT MonthDayNanoIntervalArray : public PrimitiveArray { IteratorType end() const { return IteratorType(*this, length()); } - util::optional operator[](int64_t i) const { + std::optional operator[](int64_t i) const { return *IteratorType(*this, i); } diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index d438557a330..d4ad1578b77 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -2254,12 +2254,12 @@ struct FWBinaryAppender { return Status::OK(); } - Status VisitValue(util::string_view v) { + Status VisitValue(std::string_view v) { data.push_back(v); return Status::OK(); } - std::vector data; + std::vector data; }; TEST_F(TestFWBinaryArray, ArraySpanVisitor) { @@ -2290,7 +2290,7 @@ TEST_F(TestFWBinaryArray, ArrayIndexOperator) { auto fsba = checked_pointer_cast(arr); ASSERT_EQ("abc", (*fsba)[0].value()); - ASSERT_EQ(util::nullopt, (*fsba)[1]); + ASSERT_EQ(std::nullopt, (*fsba)[1]); ASSERT_EQ("def", (*fsba)[2].value()); } @@ -2831,8 +2831,6 @@ class DecimalTest : public ::testing::TestWithParam { auto type = std::make_shared(precision, 4); auto builder = std::make_shared(type); - size_t null_count = 0; - const size_t size = draw.size(); ARROW_EXPECT_OK(builder->Reserve(size)); @@ -2842,7 +2840,6 @@ class DecimalTest : public ::testing::TestWithParam { 
ARROW_EXPECT_OK(builder->Append(draw[i])); } else { ARROW_EXPECT_OK(builder->AppendNull()); - ++null_count; } } @@ -3538,7 +3535,7 @@ TYPED_TEST(TestPrimitiveArray, IndexOperator) { ASSERT_EQ(this->values_[i], res.value()); } else { ASSERT_FALSE(res.has_value()); - ASSERT_EQ(res, util::nullopt); + ASSERT_EQ(res, std::nullopt); } } } diff --git a/cpp/src/arrow/array/builder_adaptive.cc b/cpp/src/arrow/array/builder_adaptive.cc index 36e5546a749..f6255a564fc 100644 --- a/cpp/src/arrow/array/builder_adaptive.cc +++ b/cpp/src/arrow/array/builder_adaptive.cc @@ -33,8 +33,11 @@ namespace arrow { using internal::AdaptiveIntBuilderBase; -AdaptiveIntBuilderBase::AdaptiveIntBuilderBase(uint8_t start_int_size, MemoryPool* pool) - : ArrayBuilder(pool), start_int_size_(start_int_size), int_size_(start_int_size) {} +AdaptiveIntBuilderBase::AdaptiveIntBuilderBase(uint8_t start_int_size, MemoryPool* pool, + int64_t alignment) + : ArrayBuilder(pool, alignment), + start_int_size_(start_int_size), + int_size_(start_int_size) {} void AdaptiveIntBuilderBase::Reset() { ArrayBuilder::Reset(); @@ -125,8 +128,9 @@ std::shared_ptr AdaptiveIntBuilder::type() const { return nullptr; } -AdaptiveIntBuilder::AdaptiveIntBuilder(uint8_t start_int_size, MemoryPool* pool) - : AdaptiveIntBuilderBase(start_int_size, pool) {} +AdaptiveIntBuilder::AdaptiveIntBuilder(uint8_t start_int_size, MemoryPool* pool, + int64_t alignment) + : AdaptiveIntBuilderBase(start_int_size, pool, alignment) {} Status AdaptiveIntBuilder::FinishInternal(std::shared_ptr* out) { RETURN_NOT_OK(CommitPendingData()); diff --git a/cpp/src/arrow/array/builder_adaptive.h b/cpp/src/arrow/array/builder_adaptive.h index 1c727c78b0e..382c35789c4 100644 --- a/cpp/src/arrow/array/builder_adaptive.h +++ b/cpp/src/arrow/array/builder_adaptive.h @@ -39,10 +39,12 @@ namespace internal { class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { public: - AdaptiveIntBuilderBase(uint8_t start_int_size, MemoryPool* pool); + AdaptiveIntBuilderBase(uint8_t start_int_size, MemoryPool* pool, + int64_t alignment = kDefaultBufferAlignment); - explicit AdaptiveIntBuilderBase(MemoryPool* pool) - : AdaptiveIntBuilderBase(sizeof(uint8_t), pool) {} + explicit AdaptiveIntBuilderBase(MemoryPool* pool, + int64_t alignment = kDefaultBufferAlignment) + : AdaptiveIntBuilderBase(sizeof(uint8_t), pool, alignment) {} /// \brief Append multiple nulls /// \param[in] length the number of nulls to append @@ -173,10 +175,12 @@ class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase { public: explicit AdaptiveIntBuilder(uint8_t start_int_size, - MemoryPool* pool = default_memory_pool()); + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment); - explicit AdaptiveIntBuilder(MemoryPool* pool = default_memory_pool()) - : AdaptiveIntBuilder(sizeof(uint8_t), pool) {} + explicit AdaptiveIntBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : AdaptiveIntBuilder(sizeof(uint8_t), pool, alignment) {} using ArrayBuilder::Advance; using internal::AdaptiveIntBuilderBase::Reset; diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index ff37cee5ba1..e9d5fb44ac1 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -144,7 +144,7 @@ struct AppendScalarImpl { raw++) { auto scalar = checked_cast::ScalarType*>(raw->get()); if (scalar->is_valid) { - 
builder->UnsafeAppend(util::string_view{*scalar->value}); + builder->UnsafeAppend(std::string_view{*scalar->value}); } else { builder->UnsafeAppendNull(); } diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index bc4932a4b83..89e4debd18a 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -69,7 +69,8 @@ constexpr int64_t kListMaximumElements = std::numeric_limits::max() - 1 /// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use. class ARROW_EXPORT ArrayBuilder { public: - explicit ArrayBuilder(MemoryPool* pool) : pool_(pool), null_bitmap_builder_(pool) {} + explicit ArrayBuilder(MemoryPool* pool, int64_t alignment = kDefaultBufferAlignment) + : pool_(pool), alignment_(alignment), null_bitmap_builder_(pool, alignment) {} ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder); @@ -283,6 +284,7 @@ class ARROW_EXPORT ArrayBuilder { const char* message); MemoryPool* pool_; + int64_t alignment_; TypedBufferBuilder null_bitmap_builder_; int64_t null_count_ = 0; diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index fd1be179816..571f450aab9 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -44,10 +44,10 @@ using internal::checked_cast; // Fixed width binary FixedSizeBinaryBuilder::FixedSizeBinaryBuilder(const std::shared_ptr& type, - MemoryPool* pool) - : ArrayBuilder(pool), + MemoryPool* pool, int64_t alignment) + : ArrayBuilder(pool, alignment), byte_width_(checked_cast(*type).byte_width()), - byte_builder_(pool) {} + byte_builder_(pool, alignment) {} void FixedSizeBinaryBuilder::CheckValueSize(int64_t size) { DCHECK_EQ(size, byte_width_) << "Appending wrong size to FixedSizeBinaryBuilder"; @@ -123,10 +123,10 @@ const uint8_t* FixedSizeBinaryBuilder::GetValue(int64_t i) const { return data_ptr + i * byte_width_; } -util::string_view FixedSizeBinaryBuilder::GetView(int64_t i) const { +std::string_view FixedSizeBinaryBuilder::GetView(int64_t i) const { const uint8_t* data_ptr = byte_builder_.data(); - return util::string_view(reinterpret_cast(data_ptr + i * byte_width_), - byte_width_); + return std::string_view(reinterpret_cast(data_ptr + i * byte_width_), + byte_width_); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 25cec5c1e25..25183ca169c 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "arrow/array/array_base.h" @@ -36,7 +37,6 @@ #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/macros.h" -#include "arrow/util/string_view.h" // IWYU pragma: export #include "arrow/util/visibility.h" namespace arrow { @@ -54,8 +54,11 @@ class BaseBinaryBuilder : public ArrayBuilder { using TypeClass = TYPE; using offset_type = typename TypeClass::offset_type; - explicit BaseBinaryBuilder(MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(pool), offsets_builder_(pool), value_data_builder_(pool) {} + explicit BaseBinaryBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + offsets_builder_(pool, alignment), + value_data_builder_(pool, alignment) {} BaseBinaryBuilder(const std::shared_ptr& type, MemoryPool* pool) : BaseBinaryBuilder(pool) {} @@ -77,7 +80,7 @@ class BaseBinaryBuilder : 
public ArrayBuilder { return Append(reinterpret_cast(value), length); } - Status Append(util::string_view value) { + Status Append(std::string_view value) { return Append(value.data(), static_cast(value.size())); } @@ -93,7 +96,7 @@ class BaseBinaryBuilder : public ArrayBuilder { return Status::OK(); } - Status ExtendCurrent(util::string_view value) { + Status ExtendCurrent(std::string_view value) { return ExtendCurrent(reinterpret_cast(value.data()), static_cast(value.size())); } @@ -150,7 +153,7 @@ class BaseBinaryBuilder : public ArrayBuilder { UnsafeAppend(value.c_str(), static_cast(value.size())); } - void UnsafeAppend(util::string_view value) { + void UnsafeAppend(std::string_view value) { UnsafeAppend(value.data(), static_cast(value.size())); } @@ -159,7 +162,7 @@ class BaseBinaryBuilder : public ArrayBuilder { value_data_builder_.UnsafeAppend(value, length); } - void UnsafeExtendCurrent(util::string_view value) { + void UnsafeExtendCurrent(std::string_view value) { UnsafeExtendCurrent(reinterpret_cast(value.data()), static_cast(value.size())); } @@ -370,10 +373,10 @@ class BaseBinaryBuilder : public ArrayBuilder { /// Temporary access to a value. /// /// This view becomes invalid on the next modifying operation. - util::string_view GetView(int64_t i) const { + std::string_view GetView(int64_t i) const { offset_type value_length; const uint8_t* value_data = GetValue(i, &value_length); - return util::string_view(reinterpret_cast(value_data), value_length); + return std::string_view(reinterpret_cast(value_data), value_length); } // Cannot make this a static attribute because of linking issues @@ -464,7 +467,8 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { using TypeClass = FixedSizeBinaryType; explicit FixedSizeBinaryBuilder(const std::shared_ptr& type, - MemoryPool* pool = default_memory_pool()); + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment); Status Append(const uint8_t* value) { ARROW_RETURN_NOT_OK(Reserve(1)); @@ -476,7 +480,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { return Append(reinterpret_cast(value)); } - Status Append(const util::string_view& view) { + Status Append(const std::string_view& view) { ARROW_RETURN_NOT_OK(Reserve(1)); UnsafeAppend(view); return Status::OK(); @@ -490,7 +494,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status Append(const Buffer& s) { ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppend(util::string_view(s)); + UnsafeAppend(std::string_view(s)); return Status::OK(); } @@ -500,7 +504,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status Append(const std::array& value) { ARROW_RETURN_NOT_OK(Reserve(1)); UnsafeAppend( - util::string_view(reinterpret_cast(value.data()), value.size())); + std::string_view(reinterpret_cast(value.data()), value.size())); return Status::OK(); } @@ -534,14 +538,14 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { UnsafeAppend(reinterpret_cast(value)); } - void UnsafeAppend(util::string_view value) { + void UnsafeAppend(std::string_view value) { #ifndef NDEBUG CheckValueSize(static_cast(value.size())); #endif UnsafeAppend(reinterpret_cast(value.data())); } - void UnsafeAppend(const Buffer& s) { UnsafeAppend(util::string_view(s)); } + void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view(s)); } void UnsafeAppend(const std::shared_ptr& s) { UnsafeAppend(*s); } @@ -590,7 +594,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { /// Temporary 
access to a value. /// /// This view becomes invalid on the next modifying operation. - util::string_view GetView(int64_t i) const; + std::string_view GetView(int64_t i) const; static constexpr int64_t memory_limit() { return std::numeric_limits::max() - 1; @@ -658,7 +662,7 @@ class ARROW_EXPORT ChunkedBinaryBuilder { return builder_->Append(value, length); } - Status Append(const util::string_view& value) { + Status Append(const std::string_view& value) { return Append(reinterpret_cast(value.data()), static_cast(value.size())); } diff --git a/cpp/src/arrow/array/builder_decimal.cc b/cpp/src/arrow/array/builder_decimal.cc index bd7615a7309..3b1262819df 100644 --- a/cpp/src/arrow/array/builder_decimal.cc +++ b/cpp/src/arrow/array/builder_decimal.cc @@ -36,8 +36,8 @@ class MemoryPool; // Decimal128Builder Decimal128Builder::Decimal128Builder(const std::shared_ptr& type, - MemoryPool* pool) - : FixedSizeBinaryBuilder(type, pool), + MemoryPool* pool, int64_t alignment) + : FixedSizeBinaryBuilder(type, pool, alignment), decimal_type_(internal::checked_pointer_cast(type)) {} Status Decimal128Builder::Append(Decimal128 value) { @@ -52,7 +52,7 @@ void Decimal128Builder::UnsafeAppend(Decimal128 value) { UnsafeAppendToBitmap(true); } -void Decimal128Builder::UnsafeAppend(util::string_view value) { +void Decimal128Builder::UnsafeAppend(std::string_view value) { FixedSizeBinaryBuilder::UnsafeAppend(value); } @@ -71,8 +71,8 @@ Status Decimal128Builder::FinishInternal(std::shared_ptr* out) { // Decimal256Builder Decimal256Builder::Decimal256Builder(const std::shared_ptr& type, - MemoryPool* pool) - : FixedSizeBinaryBuilder(type, pool), + MemoryPool* pool, int64_t alignment) + : FixedSizeBinaryBuilder(type, pool, alignment), decimal_type_(internal::checked_pointer_cast(type)) {} Status Decimal256Builder::Append(const Decimal256& value) { @@ -87,7 +87,7 @@ void Decimal256Builder::UnsafeAppend(const Decimal256& value) { UnsafeAppendToBitmap(true); } -void Decimal256Builder::UnsafeAppend(util::string_view value) { +void Decimal256Builder::UnsafeAppend(std::string_view value) { FixedSizeBinaryBuilder::UnsafeAppend(value); } diff --git a/cpp/src/arrow/array/builder_decimal.h b/cpp/src/arrow/array/builder_decimal.h index 3464203dd47..8094250aef8 100644 --- a/cpp/src/arrow/array/builder_decimal.h +++ b/cpp/src/arrow/array/builder_decimal.h @@ -39,7 +39,8 @@ class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder { using ValueType = Decimal128; explicit Decimal128Builder(const std::shared_ptr& type, - MemoryPool* pool = default_memory_pool()); + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment); using FixedSizeBinaryBuilder::Append; using FixedSizeBinaryBuilder::AppendValues; @@ -47,7 +48,7 @@ class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder { Status Append(Decimal128 val); void UnsafeAppend(Decimal128 val); - void UnsafeAppend(util::string_view val); + void UnsafeAppend(std::string_view val); Status FinishInternal(std::shared_ptr* out) override; @@ -69,7 +70,8 @@ class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder { using ValueType = Decimal256; explicit Decimal256Builder(const std::shared_ptr& type, - MemoryPool* pool = default_memory_pool()); + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment); using FixedSizeBinaryBuilder::Append; using FixedSizeBinaryBuilder::AppendValues; @@ -77,7 +79,7 @@ class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder { Status Append(const 
Decimal256& val); void UnsafeAppend(const Decimal256& val); - void UnsafeAppend(util::string_view val); + void UnsafeAppend(std::string_view val); Status FinishInternal(std::shared_ptr* out) override; diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index d51dd4c041a..061fb600412 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -188,12 +188,12 @@ GET_OR_INSERT(MonthIntervalType); #undef GET_OR_INSERT -Status DictionaryMemoTable::GetOrInsert(const BinaryType*, util::string_view value, +Status DictionaryMemoTable::GetOrInsert(const BinaryType*, std::string_view value, int32_t* out) { return impl_->GetOrInsert(value, out); } -Status DictionaryMemoTable::GetOrInsert(const LargeBinaryType*, util::string_view value, +Status DictionaryMemoTable::GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out) { return impl_->GetOrInsert(value, out); } diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index b720f73d7d2..cb0aaf30991 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -54,7 +54,7 @@ struct DictionaryValue { template struct DictionaryValue> { - using type = util::string_view; + using type = std::string_view; using PhysicalType = typename std::conditional::value, BinaryType, LargeBinaryType>::type; @@ -62,7 +62,7 @@ struct DictionaryValue> { template struct DictionaryValue> { - using type = util::string_view; + using type = std::string_view; using PhysicalType = BinaryType; }; @@ -112,8 +112,8 @@ class ARROW_EXPORT DictionaryMemoTable { Status GetOrInsert(const FloatType*, float value, int32_t* out); Status GetOrInsert(const DoubleType*, double value, int32_t* out); - Status GetOrInsert(const BinaryType*, util::string_view value, int32_t* out); - Status GetOrInsert(const LargeBinaryType*, util::string_view value, int32_t* out); + Status GetOrInsert(const BinaryType*, std::string_view value, int32_t* out); + Status GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out); class DictionaryMemoTableImpl; std::unique_ptr impl_; @@ -146,24 +146,26 @@ class DictionaryBuilderBase : public ArrayBuilder { !is_fixed_size_binary_type::value, const std::shared_ptr&> value_type, - MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(pool), + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), memo_table_(new internal::DictionaryMemoTable(pool, value_type)), delta_offset_(0), byte_width_(-1), - indices_builder_(start_int_size, pool), + indices_builder_(start_int_size, pool, alignment), value_type_(value_type) {} template explicit DictionaryBuilderBase( enable_if_t::value, const std::shared_ptr&> value_type, - MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(pool), + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), memo_table_(new internal::DictionaryMemoTable(pool, value_type)), delta_offset_(0), byte_width_(-1), - indices_builder_(pool), + indices_builder_(pool, alignment), value_type_(value_type) {} template @@ -171,12 +173,13 @@ class DictionaryBuilderBase : public ArrayBuilder { const std::shared_ptr& index_type, enable_if_t::value, const std::shared_ptr&> value_type, - MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(pool), + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, 
alignment), memo_table_(new internal::DictionaryMemoTable(pool, value_type)), delta_offset_(0), byte_width_(-1), - indices_builder_(index_type, pool), + indices_builder_(index_type, pool, alignment), value_type_(value_type) {} template @@ -185,35 +188,38 @@ class DictionaryBuilderBase : public ArrayBuilder { is_fixed_size_binary_type::value, const std::shared_ptr&> value_type, - MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(pool), + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), memo_table_(new internal::DictionaryMemoTable(pool, value_type)), delta_offset_(0), byte_width_(static_cast(*value_type).byte_width()), - indices_builder_(start_int_size, pool), + indices_builder_(start_int_size, pool, alignment), value_type_(value_type) {} template explicit DictionaryBuilderBase( enable_if_fixed_size_binary&> value_type, - MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(pool), + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), memo_table_(new internal::DictionaryMemoTable(pool, value_type)), delta_offset_(0), byte_width_(static_cast(*value_type).byte_width()), - indices_builder_(pool), + indices_builder_(pool, alignment), value_type_(value_type) {} template explicit DictionaryBuilderBase( const std::shared_ptr& index_type, enable_if_fixed_size_binary&> value_type, - MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(pool), + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), memo_table_(new internal::DictionaryMemoTable(pool, value_type)), delta_offset_(0), byte_width_(static_cast(*value_type).byte_width()), - indices_builder_(index_type, pool), + indices_builder_(index_type, pool, alignment), value_type_(value_type) {} template @@ -223,12 +229,13 @@ class DictionaryBuilderBase : public ArrayBuilder { // This constructor doesn't check for errors. Use InsertMemoValues instead. 
explicit DictionaryBuilderBase(const std::shared_ptr& dictionary, - MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(pool), + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), memo_table_(new internal::DictionaryMemoTable(pool, dictionary)), delta_offset_(0), byte_width_(-1), - indices_builder_(pool), + indices_builder_(pool, alignment), value_type_(dictionary->type()) {} ~DictionaryBuilderBase() override = default; @@ -257,13 +264,13 @@ class DictionaryBuilderBase : public ArrayBuilder { /// \brief Append a fixed-width string (only for FixedSizeBinaryType) template enable_if_fixed_size_binary Append(const uint8_t* value) { - return Append(util::string_view(reinterpret_cast(value), byte_width_)); + return Append(std::string_view(reinterpret_cast(value), byte_width_)); } /// \brief Append a fixed-width string (only for FixedSizeBinaryType) template enable_if_fixed_size_binary Append(const char* value) { - return Append(util::string_view(value, byte_width_)); + return Append(std::string_view(value, byte_width_)); } /// \brief Append a string (only for binary types) @@ -275,13 +282,13 @@ class DictionaryBuilderBase : public ArrayBuilder { /// \brief Append a string (only for binary types) template enable_if_binary_like Append(const char* value, int32_t length) { - return Append(util::string_view(value, length)); + return Append(std::string_view(value, length)); } /// \brief Append a string (only for string types) template enable_if_string_like Append(const char* value, int32_t length) { - return Append(util::string_view(value, length)); + return Append(std::string_view(value, length)); } /// \brief Append a decimal (only for Decimal128Type) diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 306d861b09f..3e9328bfdf0 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -51,14 +51,16 @@ class BaseListBuilder : public ArrayBuilder { /// Use this constructor to incrementally build the value array along with offsets and /// null bitmap. 
BaseListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, - const std::shared_ptr& type) - : ArrayBuilder(pool), - offsets_builder_(pool), + const std::shared_ptr& type, + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + offsets_builder_(pool, alignment), value_builder_(value_builder), value_field_(type->field(0)->WithType(NULLPTR)) {} - BaseListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder) - : BaseListBuilder(pool, value_builder, list(value_builder->type())) {} + BaseListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, + int64_t alignment = kDefaultBufferAlignment) + : BaseListBuilder(pool, value_builder, list(value_builder->type()), alignment) {} Status Resize(int64_t capacity) override { if (capacity > maximum_elements()) { diff --git a/cpp/src/arrow/array/builder_primitive.cc b/cpp/src/arrow/array/builder_primitive.cc index 769c2f7d07b..adff9c2acca 100644 --- a/cpp/src/arrow/array/builder_primitive.cc +++ b/cpp/src/arrow/array/builder_primitive.cc @@ -44,11 +44,12 @@ Status NullBuilder::FinishInternal(std::shared_ptr* out) { return Status::OK(); } -BooleanBuilder::BooleanBuilder(MemoryPool* pool) - : ArrayBuilder(pool), data_builder_(pool) {} +BooleanBuilder::BooleanBuilder(MemoryPool* pool, int64_t alignment) + : ArrayBuilder(pool, alignment), data_builder_(pool, alignment) {} -BooleanBuilder::BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool) - : BooleanBuilder(pool) { +BooleanBuilder::BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool, + int64_t alignment) + : BooleanBuilder(pool, alignment) { ARROW_CHECK_EQ(Type::BOOL, type->id()); } diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index 8f2dcc8b09b..4102aa99acf 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -31,10 +31,13 @@ namespace arrow { class ARROW_EXPORT NullBuilder : public ArrayBuilder { public: - explicit NullBuilder(MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool) {} + explicit NullBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool) {} explicit NullBuilder(const std::shared_ptr& type, - MemoryPool* pool = default_memory_pool()) - : NullBuilder(pool) {} + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : NullBuilder(pool, alignment) {} /// \brief Append the specified number of null elements Status AppendNulls(int64_t length) final { @@ -82,11 +85,15 @@ class NumericBuilder : public ArrayBuilder { template explicit NumericBuilder( - enable_if_parameter_free pool = default_memory_pool()) - : ArrayBuilder(pool), type_(TypeTraits::type_singleton()), data_builder_(pool) {} + enable_if_parameter_free pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + type_(TypeTraits::type_singleton()), + data_builder_(pool, alignment) {} - NumericBuilder(const std::shared_ptr& type, MemoryPool* pool) - : ArrayBuilder(pool), type_(type), data_builder_(pool) {} + NumericBuilder(const std::shared_ptr& type, MemoryPool* pool, + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), type_(type), data_builder_(pool, alignment) {} /// Append a single scalar and increase the size if necessary. 
Status Append(const value_type val) { @@ -131,7 +138,10 @@ class NumericBuilder : public ArrayBuilder { value_type GetValue(int64_t index) const { return data_builder_.data()[index]; } - void Reset() override { data_builder_.Reset(); } + void Reset() override { + data_builder_.Reset(); + ArrayBuilder::Reset(); + } Status Resize(int64_t capacity) override { ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); @@ -344,10 +354,12 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { using TypeClass = BooleanType; using value_type = bool; - explicit BooleanBuilder(MemoryPool* pool = default_memory_pool()); + explicit BooleanBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment); BooleanBuilder(const std::shared_ptr& type, - MemoryPool* pool = default_memory_pool()); + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment); /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory Status AppendNulls(int64_t length) final { diff --git a/cpp/src/arrow/array/builder_time.h b/cpp/src/arrow/array/builder_time.h index 97282a3fd86..da29ae3124b 100644 --- a/cpp/src/arrow/array/builder_time.h +++ b/cpp/src/arrow/array/builder_time.h @@ -36,12 +36,14 @@ class ARROW_EXPORT DayTimeIntervalBuilder : public NumericBuilder type, - MemoryPool* pool = default_memory_pool()) - : NumericBuilder(type, pool) {} + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : NumericBuilder(type, pool, alignment) {} }; class ARROW_EXPORT MonthDayNanoIntervalBuilder @@ -49,12 +51,14 @@ class ARROW_EXPORT MonthDayNanoIntervalBuilder public: using MonthDayNanos = MonthDayNanoIntervalType::MonthDayNanos; - explicit MonthDayNanoIntervalBuilder(MemoryPool* pool = default_memory_pool()) - : MonthDayNanoIntervalBuilder(month_day_nano_interval(), pool) {} + explicit MonthDayNanoIntervalBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : MonthDayNanoIntervalBuilder(month_day_nano_interval(), pool, alignment) {} explicit MonthDayNanoIntervalBuilder(std::shared_ptr type, - MemoryPool* pool = default_memory_pool()) - : NumericBuilder(type, pool) {} + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : NumericBuilder(type, pool, alignment) {} }; /// @} diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index 883cda3d8b7..d6200d9001a 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -68,9 +68,12 @@ Status DenseUnionBuilder::FinishInternal(std::shared_ptr* out) { } BasicUnionBuilder::BasicUnionBuilder( - MemoryPool* pool, const std::vector>& children, + MemoryPool* pool, int64_t alignment, + const std::vector>& children, const std::shared_ptr& type) - : ArrayBuilder(pool), child_fields_(children.size()), types_builder_(pool) { + : ArrayBuilder(pool, alignment), + child_fields_(children.size()), + types_builder_(pool, alignment) { const auto& union_type = checked_cast(*type); mode_ = union_type.mode(); diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index eb8c5d3af0e..718ef4c32ce 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -67,7 +67,7 @@ class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder { int64_t length() const override { return types_builder_.length(); } protected: - BasicUnionBuilder(MemoryPool* pool, + BasicUnionBuilder(MemoryPool* pool, 
int64_t alignment, const std::vector>& children, const std::shared_ptr& type); @@ -92,15 +92,19 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { /// Use this constructor to initialize the UnionBuilder with no child builders, /// allowing type to be inferred. You will need to call AppendChild for each of the /// children builders you want to use. - explicit DenseUnionBuilder(MemoryPool* pool) - : BasicUnionBuilder(pool, {}, dense_union(FieldVector{})), offsets_builder_(pool) {} + explicit DenseUnionBuilder(MemoryPool* pool, + int64_t alignment = kDefaultBufferAlignment) + : BasicUnionBuilder(pool, alignment, {}, dense_union(FieldVector{})), + offsets_builder_(pool, alignment) {} /// Use this constructor to specify the type explicitly. /// You can still add child builders to the union after using this constructor DenseUnionBuilder(MemoryPool* pool, const std::vector>& children, - const std::shared_ptr& type) - : BasicUnionBuilder(pool, children, type), offsets_builder_(pool) {} + const std::shared_ptr& type, + int64_t alignment = kDefaultBufferAlignment) + : BasicUnionBuilder(pool, alignment, children, type), + offsets_builder_(pool, alignment) {} Status AppendNull() final { const int8_t first_child_code = type_codes_[0]; @@ -177,15 +181,17 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { /// Use this constructor to initialize the UnionBuilder with no child builders, /// allowing type to be inferred. You will need to call AppendChild for each of the /// children builders you want to use. - explicit SparseUnionBuilder(MemoryPool* pool) - : BasicUnionBuilder(pool, {}, sparse_union(FieldVector{})) {} + explicit SparseUnionBuilder(MemoryPool* pool, + int64_t alignment = kDefaultBufferAlignment) + : BasicUnionBuilder(pool, alignment, {}, sparse_union(FieldVector{})) {} /// Use this constructor to specify the type explicitly. /// You can still add child builders to the union after using this constructor SparseUnionBuilder(MemoryPool* pool, const std::vector>& children, - const std::shared_ptr& type) - : BasicUnionBuilder(pool, children, type) {} + const std::shared_ptr& type, + int64_t alignment = kDefaultBufferAlignment) + : BasicUnionBuilder(pool, alignment, children, type) {} /// \brief Append a null value. 
/// diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 9f77fbb5f43..aab734284fa 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -311,8 +311,8 @@ class ConcatenateImpl { /*dest_offset=*/position, run.length, transpose_map)); } else { - std::fill(out_data + position, - out_data + position + (run.length * index_width), 0x00); + std::fill(out_data + (position * index_width), + out_data + (position + run.length) * index_width, 0x00); } position += run.length; diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index aacd7518928..bff5d7eec1e 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -539,4 +539,15 @@ TEST_F(ConcatenateTest, OffsetOverflow) { ASSERT_RAISES(Invalid, Concatenate({fake_long, fake_long}).status()); } +TEST_F(ConcatenateTest, DictionaryConcatenateWithEmptyUint16) { + // Regression test for ARROW-17733 + auto dict_type = dictionary(uint16(), utf8()); + auto dict_one = DictArrayFromJSON(dict_type, "[]", "[]"); + auto dict_two = + DictArrayFromJSON(dict_type, "[0, 1, null, null, null, null]", "[\"A0\", \"A1\"]"); + ASSERT_OK_AND_ASSIGN(auto concat_actual, Concatenate({dict_one, dict_two})); + + AssertArraysEqual(*dict_two, *concat_actual); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index dde66ac79c4..e024483f665 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -167,6 +167,11 @@ struct ARROW_EXPORT ArrayData { std::shared_ptr Copy() const { return std::make_shared(*this); } + bool IsNull(int64_t i) const { + return ((buffers[0] != NULLPTR) ? !bit_util::GetBit(buffers[0]->data(), i + offset) + : null_count.load() == length); + } + // Access a buffer's data as a typed C pointer template inline const T* GetValues(int i, int64_t absolute_offset) const { @@ -324,18 +329,14 @@ struct ARROW_EXPORT ArraySpan { return GetValues(i, this->offset); } - bool IsNull(int64_t i) const { - return ((this->buffers[0].data != NULLPTR) - ? !bit_util::GetBit(this->buffers[0].data, i + this->offset) - : this->null_count == this->length); - } - - bool IsValid(int64_t i) const { + inline bool IsValid(int64_t i) const { return ((this->buffers[0].data != NULLPTR) ? 
bit_util::GetBit(this->buffers[0].data, i + this->offset) : this->null_count != this->length); } + inline bool IsNull(int64_t i) const { return !IsValid(i); } + std::shared_ptr ToArrayData() const; std::shared_ptr ToArray() const; diff --git a/cpp/src/arrow/array/dict_internal.h b/cpp/src/arrow/array/dict_internal.h index a8b69133cfe..5245c8d0ff3 100644 --- a/cpp/src/arrow/array/dict_internal.h +++ b/cpp/src/arrow/array/dict_internal.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -34,7 +35,6 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/hashing.h" #include "arrow/util/logging.h" -#include "arrow/util/string_view.h" namespace arrow { namespace internal { diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index 16f4f9c7638..9fbb5df2c01 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -43,7 +44,6 @@ #include "arrow/util/logging.h" #include "arrow/util/range.h" #include "arrow/util/string.h" -#include "arrow/util/string_view.h" #include "arrow/vendored/datetime.h" #include "arrow/visit_type_inline.h" @@ -399,8 +399,8 @@ class MakeFormatterImpl { } private: - template - friend Status VisitTypeInline(const DataType&, VISITOR*); + template + friend Status VisitTypeInline(const DataType&, VISITOR*, ARGS&&... args); // factory implementation Status Visit(const BooleanType&) { diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 05155d64b6a..c1a37c4234e 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -54,7 +54,7 @@ struct UTF8DataValidator { int64_t i = 0; return VisitArraySpanInline( data, - [&](util::string_view v) { + [&](std::string_view v) { if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) { return Status::Invalid("Invalid UTF8 sequence at string index ", i); } @@ -459,14 +459,17 @@ struct ValidateArrayImpl { if (buffer == nullptr) { continue; } - int64_t min_buffer_size = -1; + int64_t min_buffer_size = 0; switch (spec.kind) { case DataTypeLayout::BITMAP: - min_buffer_size = bit_util::BytesForBits(length_plus_offset); + // If length == 0, buffer size can be 0 regardless of offset + if (data.length > 0) { + min_buffer_size = bit_util::BytesForBits(length_plus_offset); + } break; case DataTypeLayout::FIXED_WIDTH: - if (MultiplyWithOverflow(length_plus_offset, spec.byte_width, - &min_buffer_size)) { + if (data.length > 0 && MultiplyWithOverflow(length_plus_offset, spec.byte_width, + &min_buffer_size)) { return Status::Invalid("Array of type ", type.ToString(), " has impossibly large length and offset"); } @@ -675,7 +678,7 @@ struct ValidateArrayImpl { const int32_t precision = type.precision(); return VisitArraySpanInline( data, - [&](util::string_view bytes) { + [&](std::string_view bytes) { DCHECK_EQ(bytes.size(), DecimalType::kByteWidth); CType value(reinterpret_cast(bytes.data())); if (!value.FitsInPrecision(precision)) { diff --git a/cpp/src/arrow/arrow-testing.pc.in b/cpp/src/arrow/arrow-testing.pc.in index 5a991e796d8..9a452a5d113 100644 --- a/cpp/src/arrow/arrow-testing.pc.in +++ b/cpp/src/arrow/arrow-testing.pc.in @@ -27,3 +27,4 @@ Version: @ARROW_VERSION@ Requires: arrow Libs: -L${libdir} -larrow_testing Cflags: -I${gtest_includedir} +Cflags.private: -DARROW_TESTING_STATIC diff --git a/cpp/src/arrow/arrow.pc.in b/cpp/src/arrow/arrow.pc.in index 3a5710ab6b8..309789379a5 100644 --- a/cpp/src/arrow/arrow.pc.in +++ 
b/cpp/src/arrow/arrow.pc.in @@ -26,7 +26,9 @@ full_so_version=@ARROW_FULL_SO_VERSION@ Name: Apache Arrow Description: Arrow is a set of technologies that enable big-data systems to process and move data fast. Version: @ARROW_VERSION@ +Requires:@ARROW_PC_REQUIRES@ Requires.private:@ARROW_PC_REQUIRES_PRIVATE@ -Libs: -L${libdir} -larrow +Libs: -L${libdir} -larrow@ARROW_PC_LIBS@ Libs.private:@ARROW_PC_LIBS_PRIVATE@ -Cflags: -I${includedir} +Cflags: -I${includedir}@ARROW_PC_CFLAGS@ +Cflags.private:@ARROW_PC_CFLAGS_PRIVATE@ diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index e7566354d12..afe3d773594 100644 --- a/cpp/src/arrow/buffer.cc +++ b/cpp/src/arrow/buffer.cc @@ -185,7 +185,13 @@ Result> AllocateBitmap(int64_t length, MemoryPool* pool) } Result> AllocateEmptyBitmap(int64_t length, MemoryPool* pool) { - ARROW_ASSIGN_OR_RAISE(auto buf, AllocateBuffer(bit_util::BytesForBits(length), pool)); + return AllocateEmptyBitmap(length, kDefaultBufferAlignment, pool); +} + +Result> AllocateEmptyBitmap(int64_t length, int64_t alignment, + MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE(auto buf, + AllocateBuffer(bit_util::BytesForBits(length), alignment, pool)); memset(buf->mutable_data(), 0, static_cast(buf->size())); return std::move(buf); } diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 8be10d282b0..9270c4dea3f 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -21,14 +21,15 @@ #include #include #include +#include #include #include #include "arrow/device.h" #include "arrow/status.h" #include "arrow/type_fwd.h" +#include "arrow/util/bytes_view.h" #include "arrow/util/macros.h" -#include "arrow/util/string_view.h" #include "arrow/util/visibility.h" namespace arrow { @@ -77,7 +78,7 @@ class ARROW_EXPORT Buffer { /// /// \note The memory viewed by data must not be deallocated in the lifetime of the /// Buffer; temporary rvalue strings must be stored in an lvalue somewhere - explicit Buffer(util::string_view data) + explicit Buffer(std::string_view data) : Buffer(reinterpret_cast(data.data()), static_cast(data.size())) {} @@ -159,10 +160,10 @@ class ARROW_EXPORT Buffer { /// \note Can throw std::bad_alloc if buffer is large std::string ToString() const; - /// \brief View buffer contents as a util::string_view - /// \return util::string_view - explicit operator util::string_view() const { - return util::string_view(reinterpret_cast(data_), size_); + /// \brief View buffer contents as a std::string_view + /// \return std::string_view + explicit operator std::string_view() const { + return std::string_view(reinterpret_cast(data_), size_); } /// \brief View buffer contents as a util::bytes_view @@ -460,6 +461,9 @@ class ARROW_EXPORT ResizableBuffer : public MutableBuffer { ARROW_EXPORT Result> AllocateBuffer(const int64_t size, MemoryPool* pool = NULLPTR); +ARROW_EXPORT +Result> AllocateBuffer(const int64_t size, int64_t alignment, + MemoryPool* pool = NULLPTR); /// \brief Allocate a resizeable buffer from a memory pool, zero its padding. /// @@ -468,6 +472,9 @@ Result> AllocateBuffer(const int64_t size, ARROW_EXPORT Result> AllocateResizableBuffer( const int64_t size, MemoryPool* pool = NULLPTR); +ARROW_EXPORT +Result> AllocateResizableBuffer( + const int64_t size, const int64_t alignment, MemoryPool* pool = NULLPTR); /// \brief Allocate a bitmap buffer from a memory pool /// no guarantee on values is provided. 
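// A short sketch (not part of this patch) of the alignment-aware allocation
// overloads declared above; the 512-byte alignment and the helper name are
// assumptions chosen for illustration. Needs "arrow/buffer.h" and
// "arrow/result.h".
arrow::Status AllocateAlignedScratch() {
  // Allocate a data buffer and a zero-initialized bitmap whose starting
  // addresses are aligned to 512 bytes instead of the default alignment.
  ARROW_ASSIGN_OR_RAISE(std::unique_ptr<arrow::Buffer> data,
                        arrow::AllocateBuffer(4096, /*alignment=*/512));
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Buffer> bitmap,
                        arrow::AllocateEmptyBitmap(1024, /*alignment=*/512));
  // Both allocations honour the requested alignment.
  return (data->address() % 512 == 0 && bitmap->address() % 512 == 0)
             ? arrow::Status::OK()
             : arrow::Status::Invalid("unexpected buffer alignment");
}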
@@ -478,9 +485,6 @@ ARROW_EXPORT Result> AllocateBitmap(int64_t length, MemoryPool* pool = NULLPTR); -ARROW_EXPORT -Status AllocateBitmap(MemoryPool* pool, int64_t length, std::shared_ptr* out); - /// \brief Allocate a zero-initialized bitmap buffer from a memory pool /// /// \param[in] length size in bits of bitmap to allocate @@ -489,6 +493,10 @@ ARROW_EXPORT Result> AllocateEmptyBitmap(int64_t length, MemoryPool* pool = NULLPTR); +ARROW_EXPORT +Result> AllocateEmptyBitmap(int64_t length, int64_t alignment, + MemoryPool* pool = NULLPTR); + /// \brief Concatenate multiple buffers into a single buffer /// /// \param[in] buffers to be concatenated @@ -497,10 +505,6 @@ ARROW_EXPORT Result> ConcatenateBuffers(const BufferVector& buffers, MemoryPool* pool = NULLPTR); -ARROW_EXPORT -Status ConcatenateBuffers(const BufferVector& buffers, MemoryPool* pool, - std::shared_ptr* out); - /// @} } // namespace arrow diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index d92a01a16eb..5f37e552004 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -43,23 +43,27 @@ namespace arrow { /// data class ARROW_EXPORT BufferBuilder { public: - explicit BufferBuilder(MemoryPool* pool = default_memory_pool()) + explicit BufferBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) : pool_(pool), data_(/*ensure never null to make ubsan happy and avoid check penalties below*/ util::MakeNonNull()), capacity_(0), - size_(0) {} + size_(0), + alignment_(alignment) {} /// \brief Constructs new Builder that will start using /// the provided buffer until Finish/Reset are called. /// The buffer is not resized. explicit BufferBuilder(std::shared_ptr buffer, - MemoryPool* pool = default_memory_pool()) + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) : buffer_(std::move(buffer)), pool_(pool), data_(buffer_->mutable_data()), capacity_(buffer_->capacity()), - size_(buffer_->size()) {} + size_(buffer_->size()), + alignment_(alignment) {} /// \brief Resize the buffer to the nearest multiple of 64 bytes /// @@ -71,7 +75,8 @@ class ARROW_EXPORT BufferBuilder { /// \return Status Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { if (buffer_ == NULLPTR) { - ARROW_ASSIGN_OR_RAISE(buffer_, AllocateResizableBuffer(new_capacity, pool_)); + ARROW_ASSIGN_OR_RAISE(buffer_, + AllocateResizableBuffer(new_capacity, alignment_, pool_)); } else { ARROW_RETURN_NOT_OK(buffer_->Resize(new_capacity, shrink_to_fit)); } @@ -153,7 +158,7 @@ class ARROW_EXPORT BufferBuilder { if (size_ != 0) buffer_->ZeroPadding(); *out = buffer_; if (*out == NULLPTR) { - ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(0, pool_)); + ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(0, alignment_, pool_)); } Reset(); return Status::OK(); @@ -198,6 +203,7 @@ class ARROW_EXPORT BufferBuilder { uint8_t* data_; int64_t capacity_; int64_t size_; + int64_t alignment_; }; template @@ -209,8 +215,9 @@ class TypedBufferBuilder< T, typename std::enable_if::value || std::is_standard_layout::value>::type> { public: - explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool()) - : bytes_builder_(pool) {} + explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : bytes_builder_(pool, alignment) {} explicit TypedBufferBuilder(std::shared_ptr buffer, MemoryPool* pool = default_memory_pool()) @@ -306,8 +313,9 @@ class TypedBufferBuilder< template <> class 
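[Editor's sketch, not part of the patch] The new alignment-aware allocation overloads declared above can be used as follows; the helper name is illustrative only:

#include "arrow/buffer.h"
#include "arrow/result.h"
#include "arrow/status.h"

arrow::Status AlignedAllocExample() {
  constexpr int64_t kAlignment = 512;
  // Request a buffer whose address is a multiple of kAlignment.
  ARROW_ASSIGN_OR_RAISE(std::unique_ptr<arrow::Buffer> buf,
                        arrow::AllocateBuffer(/*size=*/4096, kAlignment));
  // buf->address() % kAlignment == 0
  // The same alignment argument is now accepted for zero-initialized bitmaps.
  ARROW_ASSIGN_OR_RAISE(auto bitmap,
                        arrow::AllocateEmptyBitmap(/*length=*/1000, kAlignment));
  return arrow::Status::OK();
}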
TypedBufferBuilder { public: - explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool()) - : bytes_builder_(pool) {} + explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : bytes_builder_(pool, alignment) {} explicit TypedBufferBuilder(BufferBuilder builder) : bytes_builder_(std::move(builder)) {} diff --git a/cpp/src/arrow/buffer_test.cc b/cpp/src/arrow/buffer_test.cc index 724db80eba7..ce8bab846d5 100644 --- a/cpp/src/arrow/buffer_test.cc +++ b/cpp/src/arrow/buffer_test.cc @@ -34,7 +34,6 @@ #include "arrow/status.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/make_unique.h" namespace arrow { @@ -162,7 +161,7 @@ Result> MyMemoryManager::CopyNonOwnedFrom( ARROW_ASSIGN_OR_RAISE(auto dest, MemoryManager::CopyNonOwned(buf, default_cpu_memory_manager())); // 2. Wrap CPU buffer result - return internal::make_unique(shared_from_this(), std::move(dest)); + return std::make_unique(shared_from_this(), std::move(dest)); } return nullptr; } @@ -204,8 +203,8 @@ Result> MyMemoryManager::ViewBufferTo( } // Like AssertBufferEqual, but doesn't call Buffer::data() -void AssertMyBufferEqual(const Buffer& buffer, util::string_view expected) { - ASSERT_EQ(util::string_view(buffer), expected); +void AssertMyBufferEqual(const Buffer& buffer, std::string_view expected) { + ASSERT_EQ(std::string_view(buffer), expected); } void AssertIsCPUBuffer(const Buffer& buf) { @@ -398,6 +397,15 @@ TEST(TestBuffer, FromStdString) { ASSERT_EQ(static_cast(val.size()), buf.size()); } +TEST(TestBuffer, Alignment) { + std::string val = "hello, world"; + + constexpr int64_t kAlignmentTest = 1024; + ASSERT_OK_AND_ASSIGN(std::unique_ptr buf, + AllocateBuffer(val.size(), kAlignmentTest)); + ASSERT_EQ(buf->address() % kAlignmentTest, 0); +} + TEST(TestBuffer, FromStdStringWithMemory) { std::string expected = "hello, world"; std::shared_ptr buf; @@ -709,6 +717,37 @@ TEST(TestBufferBuilder, ResizeReserve) { ASSERT_EQ(9, builder.length()); } +TEST(TestBufferBuilder, Alignment) { + const std::string data = "some data"; + auto data_ptr = data.c_str(); + + constexpr int kTestAlignment = 512; + BufferBuilder builder(default_memory_pool(), /*alignment=*/kTestAlignment); +#define TEST_ALIGNMENT() \ + ASSERT_EQ(reinterpret_cast(builder.data()) % kTestAlignment, 0) + + ASSERT_OK(builder.Append(data_ptr, 9)); + TEST_ALIGNMENT(); + + ASSERT_OK(builder.Resize(128)); + ASSERT_EQ(128, builder.capacity()); + ASSERT_EQ(9, builder.length()); + TEST_ALIGNMENT(); + + // Do not shrink to fit + ASSERT_OK(builder.Resize(64, false)); + TEST_ALIGNMENT(); + + // Shrink to fit + ASSERT_OK(builder.Resize(64)); + TEST_ALIGNMENT(); + + // Reserve elements + ASSERT_OK(builder.Reserve(60)); + TEST_ALIGNMENT(); +#undef TEST_ALIGNMENT +} + TEST(TestBufferBuilder, Finish) { const std::string data = "some data"; auto data_ptr = data.c_str(); diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 779722e0d1c..45ba4e8b700 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -17,6 +17,7 @@ #include "arrow/builder.h" +#include #include #include #include @@ -25,7 +26,6 @@ #include "arrow/type.h" #include "arrow/util/checked_cast.h" #include "arrow/util/hashing.h" -#include "arrow/util/make_unique.h" #include "arrow/visit_type_inline.h" namespace arrow { @@ -42,40 +42,42 @@ using arrow::internal::checked_cast; // exact_index_type case below, to reduce build time and memory usage. 
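[Editor's sketch, not part of the patch] The buffer_builder.h changes thread the same alignment parameter through BufferBuilder and TypedBufferBuilder; the alignment test above checks it survives Resize/Reserve. A short usage sketch:

#include "arrow/buffer_builder.h"
#include "arrow/status.h"

arrow::Status AlignedBuilderExample() {
  constexpr int64_t kAlignment = 512;
  // All allocations made by this builder use the requested alignment.
  arrow::BufferBuilder builder(arrow::default_memory_pool(), kAlignment);
  ARROW_RETURN_NOT_OK(builder.Append("some data", 9));
  std::shared_ptr<arrow::Buffer> out;
  ARROW_RETURN_NOT_OK(builder.Finish(&out));
  // out->address() % kAlignment == 0, including after intermediate Resize/Reserve calls.
  return arrow::Status::OK();
}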
class ARROW_EXPORT TypeErasedIntBuilder : public ArrayBuilder { public: - explicit TypeErasedIntBuilder(MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(pool) { + explicit TypeErasedIntBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment) { // Not intended to be used, but adding this is easier than adding a bunch of enable_if // magic to builder_dict.h DCHECK(false); } explicit TypeErasedIntBuilder(const std::shared_ptr& type, - MemoryPool* pool = default_memory_pool()) + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) : ArrayBuilder(pool), type_id_(type->id()) { DCHECK(is_integer(type_id_)); switch (type_id_) { case Type::UINT8: - builder_ = internal::make_unique(pool); + builder_ = std::make_unique(pool); break; case Type::INT8: - builder_ = internal::make_unique(pool); + builder_ = std::make_unique(pool); break; case Type::UINT16: - builder_ = internal::make_unique(pool); + builder_ = std::make_unique(pool); break; case Type::INT16: - builder_ = internal::make_unique(pool); + builder_ = std::make_unique(pool); break; case Type::UINT32: - builder_ = internal::make_unique(pool); + builder_ = std::make_unique(pool); break; case Type::INT32: - builder_ = internal::make_unique(pool); + builder_ = std::make_unique(pool); break; case Type::UINT64: - builder_ = internal::make_unique(pool); + builder_ = std::make_unique(pool); break; case Type::INT64: - builder_ = internal::make_unique(pool); + builder_ = std::make_unique(pool); break; default: DCHECK(false); diff --git a/cpp/src/arrow/builder_benchmark.cc b/cpp/src/arrow/builder_benchmark.cc index c131f813927..cf3e7f32d5e 100644 --- a/cpp/src/arrow/builder_benchmark.cc +++ b/cpp/src/arrow/builder_benchmark.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include "benchmark/benchmark.h" @@ -30,12 +31,12 @@ #include "arrow/testing/gtest_util.h" #include "arrow/util/bit_util.h" #include "arrow/util/decimal.h" -#include "arrow/util/string_view.h" namespace arrow { using ValueType = int64_t; using VectorType = std::vector; + constexpr int64_t kNumberOfElements = 256 * 512; static VectorType AlmostU8CompressibleVector() { @@ -54,7 +55,7 @@ constexpr int64_t kBytesProcessPerRound = kNumberOfElements * sizeof(ValueType); constexpr int64_t kBytesProcessed = kRounds * kBytesProcessPerRound; static const char* kBinaryString = "12345678"; -static arrow::util::string_view kBinaryView(kBinaryString); +static std::string_view kBinaryView(kBinaryString); static void BuildIntArrayNoNulls(benchmark::State& state) { // NOLINT non-const reference for (auto _ : state) { diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index de531dbc607..d6ea60f520e 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -30,6 +31,7 @@ #include "arrow/c/util_internal.h" #include "arrow/extension_type.h" #include "arrow/memory_pool.h" +#include "arrow/memory_pool_internal.h" // for kZeroSizeArea #include "arrow/record_batch.h" #include "arrow/result.h" #include "arrow/stl_allocator.h" @@ -40,7 +42,7 @@ #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/small_vector.h" -#include "arrow/util/string_view.h" +#include "arrow/util/string.h" #include "arrow/util/value_parsing.h" #include "arrow/visit_type_inline.h" @@ -57,6 +59,10 @@ using internal::ArrayExportTraits; using internal::SchemaExportGuard; 
using internal::SchemaExportTraits; +using internal::ToChars; + +using memory_pool::internal::kZeroSizeArea; + namespace { Status ExportingNotImplemented(const DataType& type) { @@ -334,18 +340,16 @@ struct SchemaExporter { Status Visit(const DoubleType& type) { return SetFormat("g"); } Status Visit(const FixedSizeBinaryType& type) { - return SetFormat("w:" + std::to_string(type.byte_width())); + return SetFormat("w:" + ToChars(type.byte_width())); } Status Visit(const DecimalType& type) { if (type.bit_width() == 128) { // 128 is the default bit-width - return SetFormat("d:" + std::to_string(type.precision()) + "," + - std::to_string(type.scale())); + return SetFormat("d:" + ToChars(type.precision()) + "," + ToChars(type.scale())); } else { - return SetFormat("d:" + std::to_string(type.precision()) + "," + - std::to_string(type.scale()) + "," + - std::to_string(type.bit_width())); + return SetFormat("d:" + ToChars(type.precision()) + "," + ToChars(type.scale()) + + "," + ToChars(type.bit_width())); } } @@ -441,7 +445,7 @@ struct SchemaExporter { Status Visit(const LargeListType& type) { return SetFormat("+L"); } Status Visit(const FixedSizeListType& type) { - return SetFormat("+w:" + std::to_string(type.list_size())); + return SetFormat("+w:" + ToChars(type.list_size())); } Status Visit(const StructType& type) { return SetFormat("+s"); } @@ -468,7 +472,7 @@ struct SchemaExporter { if (!first) { s += ","; } - s += std::to_string(code); + s += ToChars(code); first = false; } return Status::OK(); @@ -666,7 +670,7 @@ namespace { static constexpr int64_t kMaxImportRecursionLevel = 64; -Status InvalidFormatString(util::string_view v) { +Status InvalidFormatString(std::string_view v) { return Status::Invalid("Invalid or unsupported format string: '", v, "'"); } @@ -674,13 +678,13 @@ class FormatStringParser { public: FormatStringParser() {} - explicit FormatStringParser(util::string_view v) : view_(v), index_(0) {} + explicit FormatStringParser(std::string_view v) : view_(v), index_(0) {} bool AtEnd() const { return index_ >= view_.length(); } char Next() { return view_[index_++]; } - util::string_view Rest() { return view_.substr(index_); } + std::string_view Rest() { return view_.substr(index_); } Status CheckNext(char c) { if (AtEnd() || Next() != c) { @@ -704,7 +708,7 @@ class FormatStringParser { } template - Result ParseInt(util::string_view v) { + Result ParseInt(std::string_view v) { using ArrowIntType = typename CTypeTraits::ArrowType; IntType value; if (!internal::ParseValue(v.data(), v.size(), &value)) { @@ -729,13 +733,13 @@ class FormatStringParser { } } - SmallVector Split(util::string_view v, char delim = ',') { - SmallVector parts; + SmallVector Split(std::string_view v, char delim = ',') { + SmallVector parts; size_t start = 0, end; while (true) { end = v.find_first_of(delim, start); parts.push_back(v.substr(start, end - start)); - if (end == util::string_view::npos) { + if (end == std::string_view::npos) { break; } start = end + 1; @@ -744,9 +748,10 @@ class FormatStringParser { } template - Result> ParseInts(util::string_view v) { - auto parts = Split(v); + Result> ParseInts(std::string_view v) { std::vector result; + if (v.empty()) return result; + auto parts = Split(v); result.reserve(parts.size()); for (const auto& p : parts) { ARROW_ASSIGN_OR_RAISE(auto i, ParseInt(p)); @@ -758,7 +763,7 @@ class FormatStringParser { Status Invalid() { return InvalidFormatString(view_); } protected: - util::string_view view_; + std::string_view view_; size_t index_; }; @@ -1263,7 +1268,8 @@ 
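[Editor's sketch, not part of the patch] For reference, what the SchemaExporter changes above produce through the public C bridge (ExportType lives in arrow/c/bridge.h; the exact includes are assumptions of this sketch):

#include "arrow/c/abi.h"
#include "arrow/c/bridge.h"
#include "arrow/status.h"
#include "arrow/type.h"

arrow::Status FormatStringExamples() {
  struct ArrowSchema c_schema;
  ARROW_RETURN_NOT_OK(arrow::ExportType(*arrow::fixed_size_binary(3), &c_schema));
  // c_schema.format is "w:3", now built with the locale-independent ToChars.
  c_schema.release(&c_schema);

  ARROW_RETURN_NOT_OK(arrow::ExportType(*arrow::decimal(15, 4), &c_schema));
  // 128-bit decimals keep the short form "d:15,4".
  c_schema.release(&c_schema);
  return arrow::Status::OK();
}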
class ImportedBuffer : public Buffer { }; struct ArrayImporter { - explicit ArrayImporter(const std::shared_ptr& type) : type_(type) {} + explicit ArrayImporter(const std::shared_ptr& type) + : type_(type), zero_size_buffer_(std::make_shared(kZeroSizeArea, 0)) {} Status Import(struct ArrowArray* src) { if (ArrowArrayIsReleased(src)) { @@ -1527,7 +1533,7 @@ struct ArrayImporter { } Status ImportNullBitmap(int32_t buffer_id = 0) { - RETURN_NOT_OK(ImportBitsBuffer(buffer_id)); + RETURN_NOT_OK(ImportBitsBuffer(buffer_id, /*is_null_bitmap=*/true)); if (data_->null_count > 0 && data_->buffers[buffer_id] == nullptr) { return Status::Invalid( "ArrowArray struct has null bitmap buffer but non-zero null_count ", @@ -1536,15 +1542,20 @@ struct ArrayImporter { return Status::OK(); } - Status ImportBitsBuffer(int32_t buffer_id) { + Status ImportBitsBuffer(int32_t buffer_id, bool is_null_bitmap = false) { // Compute visible size of buffer - int64_t buffer_size = bit_util::BytesForBits(c_struct_->length + c_struct_->offset); - return ImportBuffer(buffer_id, buffer_size); + int64_t buffer_size = + (c_struct_->length > 0) + ? bit_util::BytesForBits(c_struct_->length + c_struct_->offset) + : 0; + return ImportBuffer(buffer_id, buffer_size, is_null_bitmap); } Status ImportFixedSizeBuffer(int32_t buffer_id, int64_t byte_width) { // Compute visible size of buffer - int64_t buffer_size = byte_width * (c_struct_->length + c_struct_->offset); + int64_t buffer_size = (c_struct_->length > 0) + ? byte_width * (c_struct_->length + c_struct_->offset) + : 0; return ImportBuffer(buffer_id, buffer_size); } @@ -1561,17 +1572,27 @@ struct ArrayImporter { int64_t byte_width = 1) { auto offsets = data_->GetValues(offsets_buffer_id); // Compute visible size of buffer - int64_t buffer_size = byte_width * offsets[c_struct_->length]; + int64_t buffer_size = + (c_struct_->length > 0) ? 
byte_width * offsets[c_struct_->length] : 0; return ImportBuffer(buffer_id, buffer_size); } - Status ImportBuffer(int32_t buffer_id, int64_t buffer_size) { + Status ImportBuffer(int32_t buffer_id, int64_t buffer_size, + bool is_null_bitmap = false) { std::shared_ptr* out = &data_->buffers[buffer_id]; auto data = reinterpret_cast(c_struct_->buffers[buffer_id]); if (data != nullptr) { *out = std::make_shared(data, buffer_size, import_); - } else { + } else if (is_null_bitmap) { out->reset(); + } else { + // Ensure that imported buffers are never null (except for the null bitmap) + if (buffer_size != 0) { + return Status::Invalid( + "ArrowArrayStruct contains null data pointer " + "for a buffer with non-zero computed size"); + } + *out = zero_size_buffer_; } return Status::OK(); } @@ -1583,6 +1604,9 @@ struct ArrayImporter { std::shared_ptr import_; std::shared_ptr data_; std::vector child_importers_; + + // For imported null buffer pointers + std::shared_ptr zero_size_buffer_; }; } // namespace @@ -1742,7 +1766,9 @@ namespace { class ArrayStreamBatchReader : public RecordBatchReader { public: - explicit ArrayStreamBatchReader(struct ArrowArrayStream* stream) { + explicit ArrayStreamBatchReader(std::shared_ptr schema, + struct ArrowArrayStream* stream) + : schema_(std::move(schema)) { ArrowArrayStreamMove(stream, &stream_); DCHECK(!ArrowArrayStreamIsReleased(&stream_)); } @@ -1754,7 +1780,7 @@ class ArrayStreamBatchReader : public RecordBatchReader { DCHECK(ArrowArrayStreamIsReleased(&stream_)); } - std::shared_ptr schema() const override { return CacheSchema(); } + std::shared_ptr schema() const override { return schema_; } Status ReadNext(std::shared_ptr* batch) override { struct ArrowArray c_array; @@ -1764,7 +1790,7 @@ class ArrayStreamBatchReader : public RecordBatchReader { batch->reset(); return Status::OK(); } else { - return ImportRecordBatch(&c_array, CacheSchema()).Value(batch); + return ImportRecordBatch(&c_array, schema_).Value(batch); } } @@ -1775,17 +1801,30 @@ class ArrayStreamBatchReader : public RecordBatchReader { return Status::OK(); } - private: - std::shared_ptr CacheSchema() const { - if (!schema_) { - struct ArrowSchema c_schema; - ARROW_CHECK_OK(StatusFromCError(stream_.get_schema(&stream_, &c_schema))); - schema_ = ImportSchema(&c_schema).ValueOrDie(); + static Result> Make( + struct ArrowArrayStream* stream) { + if (ArrowArrayStreamIsReleased(stream)) { + return Status::Invalid("Cannot import released ArrowArrayStream"); + } + std::shared_ptr schema; + struct ArrowSchema c_schema = {}; + auto status = StatusFromCError(stream, stream->get_schema(stream, &c_schema)); + if (status.ok()) { + status = ImportSchema(&c_schema).Value(&schema); + } + if (!status.ok()) { + ArrowArrayStreamRelease(stream); + return status; } - return schema_; + return std::make_shared(std::move(schema), stream); } + private: Status StatusFromCError(int errno_like) const { + return StatusFromCError(&stream_, errno_like); + } + + static Status StatusFromCError(struct ArrowArrayStream* stream, int errno_like) { if (ARROW_PREDICT_TRUE(errno_like == 0)) { return Status::OK(); } @@ -1805,23 +1844,19 @@ class ArrayStreamBatchReader : public RecordBatchReader { code = StatusCode::IOError; break; } - const char* last_error = stream_.get_last_error(&stream_); - return Status(code, last_error ? std::string(last_error) : ""); + const char* last_error = stream->get_last_error(stream); + return {code, last_error ? 
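[Editor's sketch, not part of the patch] The ImportBuffer change above establishes the new rule: for a zero-length array a producer may omit every buffer pointer, and the importer substitutes a shared zero-size buffer instead of leaving a null Buffer behind. A hedged illustration (ImportEmptyInt32 and NoopRelease are names invented for this sketch):

#include "arrow/c/abi.h"
#include "arrow/c/bridge.h"

static void NoopRelease(struct ArrowArray* array) { array->release = nullptr; }

arrow::Result<std::shared_ptr<arrow::Array>> ImportEmptyInt32() {
  static const void* kNoBuffers[2] = {nullptr, nullptr};  // validity and data both omitted
  struct ArrowArray c_array = {};
  c_array.length = 0;
  c_array.null_count = 0;
  c_array.n_buffers = 2;
  c_array.buffers = kNoBuffers;
  c_array.release = NoopRelease;
  // The importer now substitutes the shared zero-size buffer for the omitted
  // data pointer; with length > 0 the same input is rejected with
  // "null data pointer for a buffer with non-zero computed size".
  return arrow::ImportArray(&c_array, arrow::int32());
}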
std::string(last_error) : ""}; } mutable struct ArrowArrayStream stream_; - mutable std::shared_ptr schema_; + std::shared_ptr schema_; }; } // namespace Result> ImportRecordBatchReader( struct ArrowArrayStream* stream) { - if (ArrowArrayStreamIsReleased(stream)) { - return Status::Invalid("Cannot import released ArrowArrayStream"); - } - // XXX should we call get_schema() here to avoid crashing on error? - return std::make_shared(stream); + return ArrayStreamBatchReader::Make(stream); } } // namespace arrow diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index bb722c52b67..90fe9d59657 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -40,7 +41,6 @@ #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" -#include "arrow/util/string_view.h" namespace arrow { @@ -124,6 +124,24 @@ class ReleaseCallback { using SchemaReleaseCallback = ReleaseCallback; using ArrayReleaseCallback = ReleaseCallback; +// Whether c_struct or any of its descendents have non-null data pointers. +bool HasData(const ArrowArray* c_struct) { + for (int64_t i = 0; i < c_struct->n_buffers; ++i) { + if (c_struct->buffers[i] != nullptr) { + return true; + } + } + if (c_struct->dictionary && HasData(c_struct->dictionary)) { + return true; + } + for (int64_t i = 0; i < c_struct->n_children; ++i) { + if (HasData(c_struct->children[i])) { + return true; + } + } + return false; +} + static const std::vector kMetadataKeys1{"key1", "key2"}; static const std::vector kMetadataValues1{"", "bar"}; @@ -408,12 +426,16 @@ TEST_F(TestSchemaExport, Union) { auto type = dense_union({field_a, field_b}, {42, 43}); TestNested(type, {"+ud:42,43", "c", "b"}, {"", "a", "b"}, {ARROW_FLAG_NULLABLE, ARROW_FLAG_NULLABLE, 0}); + TestNested(dense_union(arrow::FieldVector{}, std::vector{}), {"+ud:"}, {""}, + {ARROW_FLAG_NULLABLE}); // Sparse field_a = field("a", int8(), /*nullable=*/false); field_b = field("b", boolean()); type = sparse_union({field_a, field_b}, {42, 43}); TestNested(type, {"+us:42,43", "c", "b"}, {"", "a", "b"}, {ARROW_FLAG_NULLABLE, 0, ARROW_FLAG_NULLABLE}); + TestNested(sparse_union(arrow::FieldVector{}, std::vector{}), {"+us:"}, {""}, + {ARROW_FLAG_NULLABLE}); } std::string GetIndexFormat(Type::type type_id) { @@ -1655,6 +1677,8 @@ static const uint8_t bits_buffer1[] = {0xed, 0xed}; static const void* buffers_no_nulls_no_data[1] = {nullptr}; static const void* buffers_nulls_no_data1[1] = {bits_buffer1}; +static const void* all_buffers_omitted[3] = {nullptr, nullptr, nullptr}; + static const uint8_t data_buffer1[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; static const uint8_t data_buffer2[] = "abcdefghijklmnopqrstuvwxyz"; @@ -1720,10 +1744,13 @@ static const uint8_t string_data_buffer1[] = "foobarquuxxyzzy"; static const int32_t string_offsets_buffer1[] = {0, 3, 3, 6, 10, 15}; static const void* string_buffers_no_nulls1[3] = {nullptr, string_offsets_buffer1, string_data_buffer1}; +static const void* string_buffers_omitted[3] = {nullptr, string_offsets_buffer1, nullptr}; static const int64_t large_string_offsets_buffer1[] = {0, 3, 3, 6, 10}; static const void* large_string_buffers_no_nulls1[3] = { nullptr, large_string_offsets_buffer1, string_data_buffer1}; +static const void* large_string_buffers_omitted[3] = { + nullptr, large_string_offsets_buffer1, nullptr}; static const int32_t list_offsets_buffer1[] = {0, 2, 2, 5, 6, 8}; 
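[Editor's sketch, not part of the patch] With ArrayStreamBatchReader::Make above, the stream's schema is fetched eagerly inside ImportRecordBatchReader, so a failing get_schema surfaces immediately and the stream is released. A short consumer sketch:

#include "arrow/c/bridge.h"
#include "arrow/record_batch.h"
#include "arrow/result.h"

arrow::Status ConsumeStream(struct ArrowArrayStream* stream) {
  // get_schema() is called here; on failure the stream is released and the
  // error is returned instead of surfacing later inside schema().
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatchReader> reader,
                        arrow::ImportRecordBatchReader(stream));
  std::shared_ptr<arrow::Schema> schema = reader->schema();  // no lazy C call anymore
  std::shared_ptr<arrow::RecordBatch> batch;
  ARROW_RETURN_NOT_OK(reader->ReadNext(&batch));
  return arrow::Status::OK();
}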
static const void* list_buffers_no_nulls1[2] = {nullptr, list_offsets_buffer1}; @@ -1897,9 +1924,9 @@ class TestArrayImport : public ::testing::Test { Reset(); // for further tests ASSERT_OK(array->ValidateFull()); - // Special case: Null array doesn't have any data, so it needn't - // keep the ArrowArray struct alive. - if (type->id() != Type::NA) { + // Special case: arrays without data (such as Null arrays) needn't keep + // the ArrowArray struct alive. + if (HasData(&c_struct_)) { cb.AssertNotCalled(); } AssertArraysEqual(*expected, *array, true); @@ -1986,6 +2013,10 @@ TEST_F(TestArrayImport, Primitive) { CheckImport(ArrayFromJSON(boolean(), "[true, null, false]")); FillPrimitive(3, 1, 0, primitive_buffers_nulls1_8); CheckImport(ArrayFromJSON(boolean(), "[true, null, false]")); + + // Empty array with null data pointers + FillPrimitive(0, 0, 0, all_buffers_omitted); + CheckImport(ArrayFromJSON(int32(), "[]")); } TEST_F(TestArrayImport, Temporal) { @@ -2066,6 +2097,12 @@ TEST_F(TestArrayImport, PrimitiveWithOffset) { FillPrimitive(4, 0, 7, primitive_buffers_no_nulls1_8); CheckImport(ArrayFromJSON(boolean(), "[false, false, true, false]")); + + // Empty array with null data pointers + FillPrimitive(0, 0, 2, all_buffers_omitted); + CheckImport(ArrayFromJSON(int32(), "[]")); + FillPrimitive(0, 0, 3, all_buffers_omitted); + CheckImport(ArrayFromJSON(boolean(), "[]")); } TEST_F(TestArrayImport, NullWithOffset) { @@ -2088,10 +2125,48 @@ TEST_F(TestArrayImport, String) { FillStringLike(4, 0, 0, large_string_buffers_no_nulls1); CheckImport(ArrayFromJSON(large_binary(), R"(["foo", "", "bar", "quux"])")); + // Empty array with null data pointers + FillStringLike(0, 0, 0, string_buffers_omitted); + CheckImport(ArrayFromJSON(utf8(), "[]")); + FillStringLike(0, 0, 0, large_string_buffers_omitted); + CheckImport(ArrayFromJSON(large_binary(), "[]")); +} + +TEST_F(TestArrayImport, StringWithOffset) { + FillStringLike(3, 0, 1, string_buffers_no_nulls1); + CheckImport(ArrayFromJSON(utf8(), R"(["", "bar", "quux"])")); + FillStringLike(2, 0, 2, large_string_buffers_no_nulls1); + CheckImport(ArrayFromJSON(large_utf8(), R"(["bar", "quux"])")); + + // Empty array with null data pointers + FillStringLike(0, 0, 1, string_buffers_omitted); + CheckImport(ArrayFromJSON(utf8(), "[]")); +} + +TEST_F(TestArrayImport, FixedSizeBinary) { FillPrimitive(2, 0, 0, primitive_buffers_no_nulls2); CheckImport(ArrayFromJSON(fixed_size_binary(3), R"(["abc", "def"])")); FillPrimitive(2, 0, 0, primitive_buffers_no_nulls3); CheckImport(ArrayFromJSON(decimal(15, 4), R"(["12345.6789", "98765.4321"])")); + + // Empty array with null data pointers + FillPrimitive(0, 0, 0, all_buffers_omitted); + CheckImport(ArrayFromJSON(fixed_size_binary(3), "[]")); + FillPrimitive(0, 0, 0, all_buffers_omitted); + CheckImport(ArrayFromJSON(decimal(15, 4), "[]")); +} + +TEST_F(TestArrayImport, FixedSizeBinaryWithOffset) { + FillPrimitive(1, 0, 1, primitive_buffers_no_nulls2); + CheckImport(ArrayFromJSON(fixed_size_binary(3), R"(["def"])")); + FillPrimitive(1, 0, 1, primitive_buffers_no_nulls3); + CheckImport(ArrayFromJSON(decimal(15, 4), R"(["98765.4321"])")); + + // Empty array with null data pointers + FillPrimitive(0, 0, 1, all_buffers_omitted); + CheckImport(ArrayFromJSON(fixed_size_binary(3), "[]")); + FillPrimitive(0, 0, 1, all_buffers_omitted); + CheckImport(ArrayFromJSON(decimal(15, 4), "[]")); } TEST_F(TestArrayImport, List) { @@ -2113,6 +2188,11 @@ TEST_F(TestArrayImport, List) { FillFixedSizeListLike(3, 0, 0, buffers_no_nulls_no_data); 
CheckImport( ArrayFromJSON(fixed_size_list(int8(), 3), "[[1, 2, 3], [4, 5, 6], [7, 8, 9]]")); + + // Empty child array with null data pointers + FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted); + FillFixedSizeListLike(0, 0, 0, buffers_no_nulls_no_data); + CheckImport(ArrayFromJSON(fixed_size_list(int8(), 3), "[]")); } TEST_F(TestArrayImport, NestedList) { @@ -2201,6 +2281,15 @@ TEST_F(TestArrayImport, SparseUnion) { FillUnionLike(UnionMode::SPARSE, 4, 0, 0, 2, sparse_union_buffers1_legacy, /*legacy=*/true); CheckImport(expected); + + // Empty array with null data pointers + expected = ArrayFromJSON(type, "[]"); + FillStringLike(AddChild(), 0, 0, 0, string_buffers_omitted); + FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted); + FillUnionLike(UnionMode::SPARSE, 0, 0, 0, 2, all_buffers_omitted, /*legacy=*/false); + FillStringLike(AddChild(), 0, 0, 0, string_buffers_omitted); + FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted); + FillUnionLike(UnionMode::SPARSE, 0, 0, 3, 2, all_buffers_omitted, /*legacy=*/false); } TEST_F(TestArrayImport, DenseUnion) { @@ -2219,6 +2308,15 @@ TEST_F(TestArrayImport, DenseUnion) { FillUnionLike(UnionMode::DENSE, 5, 0, 0, 2, dense_union_buffers1_legacy, /*legacy=*/true); CheckImport(expected); + + // Empty array with null data pointers + expected = ArrayFromJSON(type, "[]"); + FillStringLike(AddChild(), 0, 0, 0, string_buffers_omitted); + FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted); + FillUnionLike(UnionMode::DENSE, 0, 0, 0, 2, all_buffers_omitted, /*legacy=*/false); + FillStringLike(AddChild(), 0, 0, 0, string_buffers_omitted); + FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted); + FillUnionLike(UnionMode::DENSE, 0, 0, 3, 2, all_buffers_omitted, /*legacy=*/false); } TEST_F(TestArrayImport, StructWithOffset) { @@ -2355,6 +2453,29 @@ TEST_F(TestArrayImport, PrimitiveError) { // Zero null bitmap but non-zero null_count FillPrimitive(3, 1, 0, primitive_buffers_no_nulls1_8); CheckImportError(int8()); + + // Null data pointers with non-zero length + FillPrimitive(1, 0, 0, all_buffers_omitted); + CheckImportError(int8()); + FillPrimitive(1, 0, 0, all_buffers_omitted); + CheckImportError(boolean()); + FillPrimitive(1, 0, 0, all_buffers_omitted); + CheckImportError(fixed_size_binary(3)); +} + +TEST_F(TestArrayImport, StringError) { + // Bad number of buffers + FillStringLike(4, 0, 0, string_buffers_no_nulls1); + c_struct_.n_buffers = 2; + CheckImportError(utf8()); + + // Null data pointers with non-zero length + FillStringLike(4, 0, 0, string_buffers_omitted); + CheckImportError(utf8()); + + // Null offsets pointer + FillStringLike(0, 0, 0, all_buffers_omitted); + CheckImportError(utf8()); } TEST_F(TestArrayImport, StructError) { @@ -2365,6 +2486,13 @@ TEST_F(TestArrayImport, StructError) { CheckImportError(struct_({field("strs", utf8())})); } +TEST_F(TestArrayImport, ListError) { + // Null offsets pointer + FillPrimitive(AddChild(), 0, 0, 0, primitive_buffers_no_nulls1_8); + FillListLike(0, 0, 0, all_buffers_omitted); + CheckImportError(list(int8())); +} + TEST_F(TestArrayImport, MapError) { // Bad number of (struct) children in map child FillStringLike(AddChild(), 5, 0, 0, string_buffers_no_nulls1); @@ -2625,6 +2753,7 @@ TEST_F(TestSchemaRoundtrip, Struct) { TestWithTypeFactory([&]() { return struct_({f1, f2}); }); f2 = f2->WithMetadata(key_value_metadata(kMetadataKeys2, kMetadataValues2)); TestWithTypeFactory([&]() { return struct_({f1, f2}); }); + TestWithTypeFactory([&]() { return struct_(arrow::FieldVector{}); }); } 
TEST_F(TestSchemaRoundtrip, Union) { @@ -2632,6 +2761,10 @@ TEST_F(TestSchemaRoundtrip, Union) { auto f2 = field("f2", list(decimal(19, 4))); auto type_codes = std::vector{42, 43}; + TestWithTypeFactory( + [&]() { return dense_union(arrow::FieldVector{}, std::vector{}); }); + TestWithTypeFactory( + [&]() { return sparse_union(arrow::FieldVector{}, std::vector{}); }); TestWithTypeFactory([&]() { return sparse_union({f1, f2}, type_codes); }); f2 = f2->WithMetadata(key_value_metadata(kMetadataKeys2, kMetadataValues2)); TestWithTypeFactory([&]() { return dense_union({f1, f2}, type_codes); }); @@ -2850,8 +2983,10 @@ TEST_F(TestArrayRoundtrip, UnknownNullCount) { TEST_F(TestArrayRoundtrip, List) { TestWithJSON(list(int32()), "[]"); TestWithJSON(list(int32()), "[[4, 5], [6, null], null]"); + TestWithJSON(fixed_size_list(int32(), 3), "[[4, 5, 6], null, [7, 8, null]]"); TestWithJSONSliced(list(int32()), "[[4, 5], [6, null], null]"); + TestWithJSONSliced(fixed_size_list(int32(), 3), "[[4, 5, 6], null, [7, 8, null]]"); } TEST_F(TestArrayRoundtrip, Struct) { @@ -2871,6 +3006,12 @@ TEST_F(TestArrayRoundtrip, Struct) { TestWithJSON(type, "[[4, true], [5, null]]"); TestWithJSONSliced(type, "[[4, true], [5, null], [6, false]]"); + + // With no fields + type = struct_({}); + TestWithJSON(type, "[]"); + TestWithJSON(type, "[[], null, [], null, []]"); + TestWithJSONSliced(type, "[[], null, [], null, []]"); } TEST_F(TestArrayRoundtrip, Map) { @@ -2898,6 +3039,15 @@ TEST_F(TestArrayRoundtrip, Union) { TestWithJSON(type, json); TestWithJSONSliced(type, json); } + + // With no fields + fields = {}; + type_codes = {}; + union_types = {sparse_union(fields, type_codes), dense_union(fields, type_codes)}; + + for (const auto& type : union_types) { + TestWithJSON(type, "[]"); + } } TEST_F(TestArrayRoundtrip, Dictionary) { @@ -3228,4 +3378,37 @@ TEST_F(TestArrayStreamRoundtrip, Errors) { }); } +TEST_F(TestArrayStreamRoundtrip, SchemaError) { + struct StreamState { + bool released = false; + + static const char* GetLastError(struct ArrowArrayStream* stream) { + return "Expected error"; + } + + static int GetSchema(struct ArrowArrayStream* stream, struct ArrowSchema* schema) { + return EIO; + } + + static int GetNext(struct ArrowArrayStream* stream, struct ArrowArray* array) { + return EINVAL; + } + + static void Release(struct ArrowArrayStream* stream) { + reinterpret_cast(stream->private_data)->released = true; + std::memset(stream, 0, sizeof(*stream)); + } + } state; + struct ArrowArrayStream stream = {}; + stream.get_last_error = &StreamState::GetLastError; + stream.get_schema = &StreamState::GetSchema; + stream.get_next = &StreamState::GetNext; + stream.release = &StreamState::Release; + stream.private_data = &state; + + EXPECT_RAISES_WITH_MESSAGE_THAT(IOError, ::testing::HasSubstr("Expected error"), + ImportRecordBatchReader(&stream)); + ASSERT_TRUE(state.released); +} + } // namespace arrow diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index 1a63d26c24d..818070ffe35 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -32,7 +32,7 @@ struct ChunkLocation { }; // An object that resolves an array chunk depending on a logical index -struct ChunkResolver { +struct ARROW_EXPORT ChunkResolver { explicit ChunkResolver(const ArrayVector& chunks); explicit ChunkResolver(const std::vector& chunks); diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index 840dd04a5ad..c5e6d7fa4bd 100644 --- a/cpp/src/arrow/chunked_array.cc +++ 
b/cpp/src/arrow/chunked_array.cc @@ -72,7 +72,7 @@ Result> ChunkedArray::Make(ArrayVector chunks, } for (const auto& chunk : chunks) { if (!chunk->type()->Equals(*type)) { - return Status::Invalid("Array chunks must all be same type"); + return Status::TypeError("Array chunks must all be same type"); } } return std::make_shared(std::move(chunks), std::move(type)); diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc index d1dc69de274..08410b4cd53 100644 --- a/cpp/src/arrow/chunked_array_test.cc +++ b/cpp/src/arrow/chunked_array_test.cc @@ -65,8 +65,8 @@ TEST_F(TestChunkedArray, Make) { ASSERT_OK_AND_ASSIGN(auto result2, ChunkedArray::Make({chunk0, chunk0}, int8())); AssertChunkedEqual(*result, *result2); - ASSERT_RAISES(Invalid, ChunkedArray::Make({chunk0, chunk1})); - ASSERT_RAISES(Invalid, ChunkedArray::Make({chunk0}, int16())); + ASSERT_RAISES(TypeError, ChunkedArray::Make({chunk0, chunk1})); + ASSERT_RAISES(TypeError, ChunkedArray::Make({chunk0}, int16())); } TEST_F(TestChunkedArray, MakeEmpty) { diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index c5406ee583f..fa83426ab7f 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -43,6 +43,7 @@ #include "arrow/util/bitmap_ops.h" #include "arrow/util/bitmap_reader.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/memory.h" @@ -394,26 +395,6 @@ class RangeDataEqualsImpl { } protected: - // For CompareFloating (templated local classes or lambdas not supported in C++11) - template - struct ComparatorVisitor { - RangeDataEqualsImpl* impl; - const CType* left_values; - const CType* right_values; - - template - void operator()(CompareFunction&& compare) { - impl->VisitValues([&](int64_t i) { - const CType x = left_values[i + impl->left_start_idx_]; - const CType y = right_values[i + impl->right_start_idx_]; - return compare(x, y); - }); - } - }; - - template - friend struct ComparatorVisitor; - template Status ComparePrimitive(const TypeClass&) { const CType* left_values = left_.GetValues(1); @@ -431,8 +412,14 @@ class RangeDataEqualsImpl { const CType* left_values = left_.GetValues(1); const CType* right_values = right_.GetValues(1); - ComparatorVisitor visitor{this, left_values, right_values}; - VisitFloatingEquality(options_, floating_approximate_, visitor); + auto visitor = [&](auto&& compare_func) { + VisitValues([&](int64_t i) { + const CType x = left_values[i + left_start_idx_]; + const CType y = right_values[i + right_start_idx_]; + return compare_func(x, y); + }); + }; + VisitFloatingEquality(options_, floating_approximate_, std::move(visitor)); return Status::OK(); } @@ -573,6 +560,14 @@ class TypeEqualsVisitor { explicit TypeEqualsVisitor(const DataType& right, bool check_metadata) : right_(right), check_metadata_(check_metadata), result_(false) {} + bool MetadataEqual(const Field& left, const Field& right) { + if (left.HasMetadata() && right.HasMetadata()) { + return left.metadata()->Equals(*right.metadata()); + } else { + return !left.HasMetadata() && !right.HasMetadata(); + } + } + Status VisitChildren(const DataType& left) { if (left.num_fields() != right_.num_fields()) { result_ = false; @@ -640,8 +635,21 @@ class TypeEqualsVisitor { } template - enable_if_t::value || is_struct_type::value, Status> Visit( - const T& left) { + enable_if_t::value, Status> Visit(const T& left) { + std::shared_ptr left_field = left.field(0); + std::shared_ptr 
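[Editor's sketch, not part of the patch] The ChunkedArray::Make change above is a user-visible status-code change, mirrored by the test update; a quick sketch:

#include "arrow/chunked_array.h"
#include "arrow/testing/gtest_util.h"

TEST(ChunkedArrayExample, MismatchedChunkTypes) {
  auto chunk0 = arrow::ArrayFromJSON(arrow::int8(), "[1, 2, 3]");
  auto chunk1 = arrow::ArrayFromJSON(arrow::int16(), "[4]");
  // Mixing chunk types now reports TypeError rather than Invalid.
  ASSERT_RAISES(TypeError, arrow::ChunkedArray::Make({chunk0, chunk1}));
  ASSERT_OK(arrow::ChunkedArray::Make({chunk0, chunk0}));
}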
right_field = checked_cast(right_).field(0); + bool equal_names = !check_metadata_ || (left_field->name() == right_field->name()); + bool equal_metadata = !check_metadata_ || MetadataEqual(*left_field, *right_field); + + result_ = equal_names && equal_metadata && + (left_field->nullable() == right_field->nullable()) && + left_field->type()->Equals(*right_field->type(), check_metadata_); + + return Status::OK(); + } + + template + enable_if_t::value, Status> Visit(const T& left) { return VisitChildren(left); } @@ -651,6 +659,18 @@ class TypeEqualsVisitor { result_ = false; return Status::OK(); } + if (check_metadata_ && (left.item_field()->name() != right.item_field()->name() || + left.key_field()->name() != right.key_field()->name() || + left.value_field()->name() != right.value_field()->name())) { + result_ = false; + return Status::OK(); + } + if (check_metadata_ && !(MetadataEqual(*left.item_field(), *right.item_field()) && + MetadataEqual(*left.key_field(), *right.key_field()) && + MetadataEqual(*left.value_field(), *right.value_field()))) { + result_ = false; + return Status::OK(); + } result_ = left.key_type()->Equals(*right.key_type(), check_metadata_) && left.item_type()->Equals(*right.item_type(), check_metadata_); return Status::OK(); @@ -827,26 +847,15 @@ class ScalarEqualsVisitor { bool result() const { return result_; } protected: - // For CompareFloating (templated local classes or lambdas not supported in C++11) - template - struct ComparatorVisitor { - const ScalarType& left; - const ScalarType& right; - bool* result; - - template - void operator()(CompareFunction&& compare) { - *result = compare(left.value, right.value); - } - }; - template Status CompareFloating(const ScalarType& left) { using CType = decltype(left.value); + const auto& right = checked_cast(right_); - ComparatorVisitor visitor{left, checked_cast(right_), - &result_}; - VisitFloatingEquality(options_, floating_approximate_, visitor); + auto visitor = [&](auto&& compare_func) { + result_ = compare_func(left.value, right.value); + }; + VisitFloatingEquality(options_, floating_approximate_, std::move(visitor)); return Status::OK(); } @@ -1045,33 +1054,6 @@ bool IntegerTensorEquals(const Tensor& left, const Tensor& right) { return are_equal; } -template -struct StridedFloatTensorLastDimEquality { - int64_t n_values; - const uint8_t* left_data; - const uint8_t* right_data; - int64_t left_offset; - int64_t right_offset; - int64_t left_stride; - int64_t right_stride; - bool result; - - template - void operator()(EqualityFunc&& eq) { - for (int64_t i = 0; i < n_values; ++i) { - T left_value = - *reinterpret_cast(left_data + left_offset + i * left_stride); - T right_value = - *reinterpret_cast(right_data + right_offset + i * right_stride); - if (!eq(left_value, right_value)) { - result = false; - return; - } - } - result = true; - } -}; - template bool StridedFloatTensorContentEquals(const int dim_index, int64_t left_offset, int64_t right_offset, const Tensor& left, @@ -1085,11 +1067,26 @@ bool StridedFloatTensorContentEquals(const int dim_index, int64_t left_offset, const auto right_stride = right.strides()[dim_index]; if (dim_index == left.ndim() - 1) { // Leaf dimension, compare values - StridedFloatTensorLastDimEquality visitor{ - n, left.raw_data(), right.raw_data(), left_offset, right_offset, - left_stride, right_stride, /*result=*/false}; - VisitFloatingEquality(opts, /*floating_approximate=*/false, visitor); - return visitor.result; + auto left_data = left.raw_data(); + auto right_data = right.raw_data(); + 
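[Editor's sketch, not part of the patch] A sketch of what the new MetadataEqual checks mean for nested types (a list is shown; map types get the same treatment for their key/item/value fields):

#include "arrow/type.h"
#include "arrow/util/key_value_metadata.h"

void NestedMetadataEquality() {
  auto item = arrow::field("item", arrow::int32());
  auto item_md = item->WithMetadata(arrow::key_value_metadata({"k"}, {"v"}));
  auto a = arrow::list(item);
  auto b = arrow::list(item_md);
  // Without metadata checking the two list types still compare equal...
  bool loose = a->Equals(*b, /*check_metadata=*/false);  // true
  // ...but with check_metadata=true the child field's metadata now participates.
  bool strict = a->Equals(*b, /*check_metadata=*/true);  // false
  (void)loose;
  (void)strict;
}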
bool result = true; + + auto visitor = [&](auto&& compare_func) { + for (int64_t i = 0; i < n; ++i) { + c_type left_value = + *reinterpret_cast(left_data + left_offset + i * left_stride); + c_type right_value = *reinterpret_cast(right_data + right_offset + + i * right_stride); + if (!compare_func(left_value, right_value)) { + result = false; + return; + } + } + }; + + VisitFloatingEquality(opts, /*floating_approximate=*/false, + std::move(visitor)); + return result; } // Outer dimension, recurse into inner diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index 13179952326..6dbacfa86af 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -92,31 +92,31 @@ class EqualOptions { }; /// Returns true if the arrays are exactly equal -bool ARROW_EXPORT ArrayEquals(const Array& left, const Array& right, +ARROW_EXPORT bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& = EqualOptions::Defaults()); /// Returns true if the arrays are approximately equal. For non-floating point /// types, this is equivalent to ArrayEquals(left, right) -bool ARROW_EXPORT ArrayApproxEquals(const Array& left, const Array& right, +ARROW_EXPORT bool ArrayApproxEquals(const Array& left, const Array& right, const EqualOptions& = EqualOptions::Defaults()); /// Returns true if indicated equal-length segment of arrays are exactly equal -bool ARROW_EXPORT ArrayRangeEquals(const Array& left, const Array& right, +ARROW_EXPORT bool ArrayRangeEquals(const Array& left, const Array& right, int64_t start_idx, int64_t end_idx, int64_t other_start_idx, const EqualOptions& = EqualOptions::Defaults()); /// Returns true if indicated equal-length segment of arrays are approximately equal -bool ARROW_EXPORT ArrayRangeApproxEquals(const Array& left, const Array& right, +ARROW_EXPORT bool ArrayRangeApproxEquals(const Array& left, const Array& right, int64_t start_idx, int64_t end_idx, int64_t other_start_idx, const EqualOptions& = EqualOptions::Defaults()); -bool ARROW_EXPORT TensorEquals(const Tensor& left, const Tensor& right, +ARROW_EXPORT bool TensorEquals(const Tensor& left, const Tensor& right, const EqualOptions& = EqualOptions::Defaults()); /// EXPERIMENTAL: Returns true if the given sparse tensors are exactly equal -bool ARROW_EXPORT SparseTensorEquals(const SparseTensor& left, const SparseTensor& right, +ARROW_EXPORT bool SparseTensorEquals(const SparseTensor& left, const SparseTensor& right, const EqualOptions& = EqualOptions::Defaults()); /// Returns true if the type metadata are exactly equal @@ -124,22 +124,22 @@ bool ARROW_EXPORT SparseTensorEquals(const SparseTensor& left, const SparseTenso /// \param[in] right a DataType /// \param[in] check_metadata whether to compare KeyValueMetadata for child /// fields -bool ARROW_EXPORT TypeEquals(const DataType& left, const DataType& right, +ARROW_EXPORT bool TypeEquals(const DataType& left, const DataType& right, bool check_metadata = true); /// Returns true if scalars are equal /// \param[in] left a Scalar /// \param[in] right a Scalar /// \param[in] options comparison options -bool ARROW_EXPORT ScalarEquals(const Scalar& left, const Scalar& right, +ARROW_EXPORT bool ScalarEquals(const Scalar& left, const Scalar& right, const EqualOptions& options = EqualOptions::Defaults()); /// Returns true if scalars are approximately equal /// \param[in] left a Scalar /// \param[in] right a Scalar /// \param[in] options comparison options -bool ARROW_EXPORT -ScalarApproxEquals(const Scalar& left, const Scalar& right, - const EqualOptions& options 
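[Editor's sketch, not part of the patch] The compare.cc refactoring above replaces the hand-written ComparatorVisitor structs with C++17 generic lambdas. The pattern in isolation, where VisitEquality is a stand-in invented for this sketch rather than the internal VisitFloatingEquality helper:

#include <cmath>

// Stand-in helper: pick a comparator, then hand it to the caller-supplied visitor.
template <typename Visitor>
void VisitEquality(bool approximate, Visitor&& visit) {
  if (approximate) {
    visit([](double x, double y) {
      return std::fabs(x - y) < 1e-5 || (std::isnan(x) && std::isnan(y));
    });
  } else {
    visit([](double x, double y) { return x == y; });
  }
}

bool CompareValues(double x, double y, bool approximate) {
  bool result = false;
  // A generic lambda (auto&& parameter) replaces the old visitor struct.
  VisitEquality(approximate, [&](auto&& compare) { result = compare(x, y); });
  return result;
}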
= EqualOptions::Defaults()); +ARROW_EXPORT bool ScalarApproxEquals( + const Scalar& left, const Scalar& right, + const EqualOptions& options = EqualOptions::Defaults()); } // namespace arrow diff --git a/cpp/src/arrow/compute/api.h b/cpp/src/arrow/compute/api.h index 80582e47b74..ba8d26da4d5 100644 --- a/cpp/src/arrow/compute/api.h +++ b/cpp/src/arrow/compute/api.h @@ -28,7 +28,6 @@ #include "arrow/compute/api_scalar.h" // IWYU pragma: export #include "arrow/compute/api_vector.h" // IWYU pragma: export #include "arrow/compute/cast.h" // IWYU pragma: export -#include "arrow/compute/exec.h" // IWYU pragma: export #include "arrow/compute/function.h" // IWYU pragma: export #include "arrow/compute/kernel.h" // IWYU pragma: export #include "arrow/compute/registry.h" // IWYU pragma: export @@ -52,3 +51,10 @@ /// @} #include "arrow/compute/row/grouper.h" // IWYU pragma: export + +/// \defgroup execnode-components Components associated with ExecNode +/// @{ +/// @} + +#include "arrow/compute/exec.h" // IWYU pragma: export +#include "arrow/compute/exec/exec_plan.h" // IWYU pragma: export diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 3bdff691778..425274043ed 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -18,6 +18,7 @@ #include "arrow/compute/api_scalar.h" #include +#include #include #include @@ -345,6 +346,11 @@ static auto kSetLookupOptionsType = GetFunctionOptionsType( static auto kSliceOptionsType = GetFunctionOptionsType( DataMember("start", &SliceOptions::start), DataMember("stop", &SliceOptions::stop), DataMember("step", &SliceOptions::step)); +static auto kListSliceOptionsType = GetFunctionOptionsType( + DataMember("start", &ListSliceOptions::start), + DataMember("stop", &ListSliceOptions::stop), + DataMember("step", &ListSliceOptions::step), + DataMember("return_fixed_size_list", &ListSliceOptions::return_fixed_size_list)); static auto kSplitPatternOptionsType = GetFunctionOptionsType( DataMember("pattern", &SplitPatternOptions::pattern), DataMember("max_splits", &SplitPatternOptions::max_splits), @@ -359,7 +365,7 @@ static auto kStrptimeOptionsType = GetFunctionOptionsType( DataMember("unit", &StrptimeOptions::unit), DataMember("error_is_null", &StrptimeOptions::error_is_null)); static auto kStructFieldOptionsType = GetFunctionOptionsType( - DataMember("indices", &StructFieldOptions::indices)); + DataMember("field_ref", &StructFieldOptions::field_ref)); static auto kTrimOptionsType = GetFunctionOptionsType( DataMember("characters", &TrimOptions::characters)); static auto kUtf8NormalizeOptionsType = GetFunctionOptionsType( @@ -528,6 +534,17 @@ SliceOptions::SliceOptions(int64_t start, int64_t stop, int64_t step) SliceOptions::SliceOptions() : SliceOptions(0, 0, 1) {} constexpr char SliceOptions::kTypeName[]; +ListSliceOptions::ListSliceOptions(int64_t start, std::optional stop, + int64_t step, + std::optional return_fixed_size_list) + : FunctionOptions(internal::kListSliceOptionsType), + start(start), + stop(stop), + step(step), + return_fixed_size_list(return_fixed_size_list) {} +ListSliceOptions::ListSliceOptions() : ListSliceOptions(0) {} +constexpr char ListSliceOptions::kTypeName[]; + SplitOptions::SplitOptions(int64_t max_splits, bool reverse) : FunctionOptions(internal::kSplitOptionsType), max_splits(max_splits), @@ -561,8 +578,13 @@ StrptimeOptions::StrptimeOptions() : StrptimeOptions("", TimeUnit::MICRO, false) constexpr char StrptimeOptions::kTypeName[]; 
StructFieldOptions::StructFieldOptions(std::vector indices) - : FunctionOptions(internal::kStructFieldOptionsType), indices(std::move(indices)) {} -StructFieldOptions::StructFieldOptions() : StructFieldOptions(std::vector()) {} + : FunctionOptions(internal::kStructFieldOptionsType), field_ref(std::move(indices)) {} +StructFieldOptions::StructFieldOptions(std::initializer_list indices) + : FunctionOptions(internal::kStructFieldOptionsType), field_ref(std::move(indices)) {} +StructFieldOptions::StructFieldOptions(FieldRef ref) + : FunctionOptions(internal::kStructFieldOptionsType), field_ref(std::move(ref)) {} +StructFieldOptions::StructFieldOptions() + : FunctionOptions(internal::kStructFieldOptionsType) {} constexpr char StructFieldOptions::kTypeName[]; TrimOptions::TrimOptions(std::string characters) @@ -597,6 +619,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kElementWiseAggregateOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kExtractRegexOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kJoinOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kListSliceOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kMakeStructOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kMapLookupOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kMatchSubstringOptionsType)); @@ -653,6 +676,7 @@ SCALAR_ARITHMETIC_UNARY(Negate, "negate", "negate_checked") SCALAR_ARITHMETIC_UNARY(Sin, "sin", "sin_checked") SCALAR_ARITHMETIC_UNARY(Tan, "tan", "tan_checked") SCALAR_EAGER_UNARY(Atan, "atan") +SCALAR_EAGER_UNARY(Exp, "exp") SCALAR_EAGER_UNARY(Sign, "sign") Result Round(const Datum& arg, RoundOptions options, ExecContext* ctx) { diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 7d86a555ec8..1c27757fcfc 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -20,6 +20,7 @@ #pragma once +#include #include #include @@ -277,12 +278,13 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions { class ARROW_EXPORT StructFieldOptions : public FunctionOptions { public: explicit StructFieldOptions(std::vector indices); + explicit StructFieldOptions(std::initializer_list); + explicit StructFieldOptions(FieldRef field_ref); StructFieldOptions(); static constexpr char const kTypeName[] = "StructFieldOptions"; - /// The child indices to extract. For instance, to get the 2nd child - /// of the 1st child of a struct or union, this would be {0, 1}. - std::vector indices; + /// The FieldRef specifying what to extract from struct or union. + FieldRef field_ref; }; class ARROW_EXPORT StrptimeOptions : public FunctionOptions { @@ -346,6 +348,25 @@ class ARROW_EXPORT SliceOptions : public FunctionOptions { int64_t start, stop, step; }; +class ARROW_EXPORT ListSliceOptions : public FunctionOptions { + public: + explicit ListSliceOptions(int64_t start, std::optional stop = std::nullopt, + int64_t step = 1, + std::optional return_fixed_size_list = std::nullopt); + ListSliceOptions(); + static constexpr char const kTypeName[] = "ListSliceOptions"; + /// The start of list slicing. + int64_t start; + /// Optional stop of list slicing. If not set, then slice to end. (NotImplemented) + std::optional stop; + /// Slicing step + int64_t step; + // Whether to return a FixedSizeListArray. If true _and_ stop is after + // a list element's length, nulls will be appended to create the requested slice size. 
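[Editor's sketch, not part of the patch] The widened StructFieldOptions above keeps index paths working while letting a FieldRef address children by name. A hedged sketch against the existing "struct_field" compute function (the dot-path and helper name are illustrative):

#include "arrow/compute/api_scalar.h"
#include "arrow/compute/exec.h"
#include "arrow/type.h"

arrow::Result<arrow::Datum> ExtractChild(const arrow::Datum& structs) {
  // Equivalent index-based form: StructFieldOptions({0, 1})
  ARROW_ASSIGN_OR_RAISE(arrow::FieldRef ref, arrow::FieldRef::FromDotPath(".a.b"));
  arrow::compute::StructFieldOptions options(std::move(ref));
  return arrow::compute::CallFunction("struct_field", {structs}, &options);
}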
+ // Default of `nullopt` will return whatever type it got in. + std::optional return_fixed_size_list; +}; + class ARROW_EXPORT NullOptions : public FunctionOptions { public: explicit NullOptions(bool nan_is_null = false); @@ -610,6 +631,15 @@ Result Power(const Datum& left, const Datum& right, ArithmeticOptions options = ArithmeticOptions(), ExecContext* ctx = NULLPTR); +/// \brief Raise Euler's number to the power of specified exponent, element-wise. +/// If the exponent value is null the result will be null. +/// +/// \param[in] arg the exponent +/// \param[in] ctx the function execution context, optional +/// \return the element-wise Euler's number raised to the power of exponent +ARROW_EXPORT +Result Exp(const Datum& arg, ExecContext* ctx = NULLPTR); + /// \brief Left shift the left array by the right array. Array values must be the /// same length. If either operand is null, the result will be null. /// diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/cast.cc index 52aecf3e45a..13bf6f85a48 100644 --- a/cpp/src/arrow/compute/cast.cc +++ b/cpp/src/arrow/compute/cast.cc @@ -62,6 +62,7 @@ void InitCastTable() { AddCastFunctions(GetNumericCasts()); AddCastFunctions(GetTemporalCasts()); AddCastFunctions(GetDictionaryCasts()); + AddCastFunctions(GetExtensionCasts()); } void EnsureInitCastTable() { std::call_once(cast_table_initialized, InitCastTable); } @@ -94,9 +95,26 @@ class CastMetaFunction : public MetaFunction { const FunctionOptions* options, ExecContext* ctx) const override { ARROW_ASSIGN_OR_RAISE(auto cast_options, ValidateOptions(options)); - if (args[0].type()->Equals(*cast_options->to_type)) { - return args[0]; + // args[0].type() could be a nullptr so check for that before + // we do anything with it. + if (args[0].type() && args[0].type()->Equals(*cast_options->to_type)) { + // Nested types might differ in field names but still be considered equal, + // so we can only return non-nested types as-is. 
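[Editor's sketch, not part of the patch] Quick sketches of the two new scalar entry points above. Exp is registered as "exp"; ListSliceOptions feeds the accompanying list-slicing kernel, whose registration is not shown in this hunk:

#include <optional>
#include "arrow/compute/api_scalar.h"

arrow::Result<arrow::Datum> ExpExample() {
  // exp(1.0) == Euler's number; null inputs stay null.
  return arrow::compute::Exp(arrow::Datum(1.0));
}

// Slice each list element to its first two values, keeping the input list type
// (return_fixed_size_list left at nullopt).
arrow::compute::ListSliceOptions FirstTwo() {
  return arrow::compute::ListSliceOptions(/*start=*/0, /*stop=*/2, /*step=*/1,
                                          /*return_fixed_size_list=*/std::nullopt);
}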
+ if (!is_nested(args[0].type()->id())) { + return args[0]; + } else if (args[0].is_array()) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr array, + ::arrow::internal::GetArrayView( + args[0].array(), cast_options->to_type.owned_type)); + return Datum(array); + } else if (args[0].is_chunked_array()) { + ARROW_ASSIGN_OR_RAISE( + std::shared_ptr array, + args[0].chunked_array()->View(cast_options->to_type.owned_type)); + return Datum(array); + } } + Result> result = GetCastFunction(*cast_options->to_type); if (!result.ok()) { diff --git a/cpp/src/arrow/compute/cast_internal.h b/cpp/src/arrow/compute/cast_internal.h index f00a6cdbf4d..423b791e6a7 100644 --- a/cpp/src/arrow/compute/cast_internal.h +++ b/cpp/src/arrow/compute/cast_internal.h @@ -63,6 +63,7 @@ std::vector> GetTemporalCasts(); std::vector> GetBinaryLikeCasts(); std::vector> GetNestedCasts(); std::vector> GetDictionaryCasts(); +std::vector> GetExtensionCasts(); ARROW_EXPORT Result> GetCastFunction(const DataType& to_type); diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index cf91bada6c6..ee02b26845b 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -33,6 +33,7 @@ #include "arrow/chunked_array.h" #include "arrow/compute/exec_internal.h" #include "arrow/compute/function.h" +#include "arrow/compute/function_internal.h" #include "arrow/compute/kernel.h" #include "arrow/compute/registry.h" #include "arrow/datum.h" @@ -47,7 +48,7 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/cpu_info.h" #include "arrow/util/logging.h" -#include "arrow/util/make_unique.h" +#include "arrow/util/thread_pool.h" #include "arrow/util/vector.h" namespace arrow { @@ -56,6 +57,7 @@ using internal::BitmapAnd; using internal::checked_cast; using internal::CopyBitmap; using internal::CpuInfo; +using internal::GetCpuThreadPool; namespace compute { @@ -64,6 +66,11 @@ ExecContext* default_exec_context() { return &default_ctx; } +ExecContext* threaded_exec_context() { + static ExecContext threaded_ctx(default_memory_pool(), GetCpuThreadPool()); + return &threaded_ctx; +} + ExecBatch::ExecBatch(const RecordBatch& batch) : values(batch.num_columns()), length(batch.num_rows()) { auto columns = batch.column_data(); @@ -90,15 +97,22 @@ void PrintTo(const ExecBatch& batch, std::ostream* os) { if (value.is_scalar()) { *os << "Scalar[" << value.scalar()->ToString() << "]\n"; - continue; + } else if (value.is_array() || value.is_chunked_array()) { + PrettyPrintOptions options; + options.skip_new_lines = true; + if (value.is_array()) { + auto array = value.make_array(); + *os << "Array"; + ARROW_CHECK_OK(PrettyPrint(*array, options, os)); + } else { + auto array = value.chunked_array(); + *os << "Chunked Array"; + ARROW_CHECK_OK(PrettyPrint(*array, options, os)); + } + *os << "\n"; + } else { + ARROW_DCHECK(false); } - - auto array = value.make_array(); - PrettyPrintOptions options; - options.skip_new_lines = true; - *os << "Array"; - ARROW_CHECK_OK(PrettyPrint(*array, options, os)); - *os << "\n"; } } @@ -119,8 +133,15 @@ std::string ExecBatch::ToString() const { ExecBatch ExecBatch::Slice(int64_t offset, int64_t length) const { ExecBatch out = *this; for (auto& value : out.values) { - if (value.is_scalar()) continue; - value = value.array()->Slice(offset, length); + if (value.is_scalar()) { + // keep value as is + } else if (value.is_array()) { + value = value.array()->Slice(offset, length); + } else if (value.is_chunked_array()) { + value = value.chunked_array()->Slice(offset, length); + } else { + 
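[Editor's sketch, not part of the patch] The CastMetaFunction change above means that casting to an "equal" nested type with different field names now returns a zero-copy view carrying the requested names instead of handing the input back unchanged. A hedged sketch (helper name invented here):

#include "arrow/compute/cast.h"
#include "arrow/type.h"

arrow::Result<arrow::Datum> RenameListField(const std::shared_ptr<arrow::Array>& list_of_ints) {
  // Same storage type, different child field name: previously a no-op, now a view.
  arrow::compute::CastOptions options =
      arrow::compute::CastOptions::Safe(arrow::list(arrow::field("renamed", arrow::int32())));
  return arrow::compute::Cast(arrow::Datum(list_of_ints), options);
}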
ARROW_DCHECK(false); + } } out.length = std::min(length, this->length - offset); return out; @@ -157,6 +178,9 @@ Result ExecBatch::Make(std::vector values) { Result> ExecBatch::ToRecordBatch( std::shared_ptr schema, MemoryPool* pool) const { + if (static_cast(schema->num_fields()) > values.size()) { + return Status::Invalid("ExecBatch::ToTRecordBatch mismatching schema size"); + } ArrayVector columns(schema->num_fields()); for (size_t i = 0; i < columns.size(); ++i) { @@ -164,8 +188,13 @@ Result> ExecBatch::ToRecordBatch( if (value.is_array()) { columns[i] = value.make_array(); continue; + } else if (value.is_scalar()) { + ARROW_ASSIGN_OR_RAISE(columns[i], + MakeArrayFromScalar(*value.scalar(), length, pool)); + } else { + return Status::TypeError("ExecBatch::ToRecordBatch value ", i, " with unsupported ", + "value kind ", ::arrow::ToString(value.kind())); } - ARROW_ASSIGN_OR_RAISE(columns[i], MakeArrayFromScalar(*value.scalar(), length, pool)); } return RecordBatch::Make(std::move(schema), length, std::move(columns)); @@ -862,6 +891,7 @@ class ScalarExecutor : public KernelExecutorImpl { } } if (kernel_->mem_allocation == MemAllocation::PREALLOCATE) { + data_preallocated_.clear(); ComputeDataPreallocate(*output_type_.type, &data_preallocated_); } @@ -945,6 +975,7 @@ class VectorExecutor : public KernelExecutorImpl { (kernel_->null_handling != NullHandling::COMPUTED_NO_PREALLOCATE && kernel_->null_handling != NullHandling::OUTPUT_NOT_NULL); if (kernel_->mem_allocation == MemAllocation::PREALLOCATE) { + data_preallocated_.clear(); ComputeDataPreallocate(*output_type_.type, &data_preallocated_); } @@ -1098,7 +1129,7 @@ Result> MakeExecutor(ExecContext* ctx, const FunctionOptions* options) { DCHECK_EQ(ExecutorType::function_kind, func->kind()); auto typed_func = checked_cast(func); - return std::unique_ptr(new ExecutorType(ctx, typed_func, options)); + return std::make_unique(ctx, typed_func, options); } } // namespace @@ -1187,15 +1218,15 @@ void PropagateNullsSpans(const ExecSpan& batch, ArraySpan* out) { } std::unique_ptr KernelExecutor::MakeScalar() { - return ::arrow::internal::make_unique(); + return std::make_unique(); } std::unique_ptr KernelExecutor::MakeVector() { - return ::arrow::internal::make_unique(); + return std::make_unique(); } std::unique_ptr KernelExecutor::MakeScalarAggregate() { - return ::arrow::internal::make_unique(); + return std::make_unique(); } int64_t InferBatchLength(const std::vector& values, bool* all_same) { @@ -1295,5 +1326,25 @@ Result CallFunction(const std::string& func_name, const ExecBatch& batch, return CallFunction(func_name, batch, /*options=*/nullptr, ctx); } +Result> GetFunctionExecutor( + const std::string& func_name, std::vector in_types, + const FunctionOptions* options, FunctionRegistry* func_registry) { + if (func_registry == NULLPTR) { + func_registry = GetFunctionRegistry(); + } + ARROW_ASSIGN_OR_RAISE(std::shared_ptr func, + func_registry->GetFunction(func_name)); + ARROW_ASSIGN_OR_RAISE(auto func_exec, func->GetBestExecutor(std::move(in_types))); + ARROW_RETURN_NOT_OK(func_exec->Init(options)); + return func_exec; +} + +Result> GetFunctionExecutor( + const std::string& func_name, const std::vector& args, + const FunctionOptions* options, FunctionRegistry* func_registry) { + ARROW_ASSIGN_OR_RAISE(auto in_types, internal::GetFunctionArgumentTypes(args)); + return GetFunctionExecutor(func_name, std::move(in_types), options, func_registry); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec.h 
b/cpp/src/arrow/compute/exec.h index cdd3daf7f74..30f4113f6c4 100644 --- a/cpp/src/arrow/compute/exec.h +++ b/cpp/src/arrow/compute/exec.h @@ -30,8 +30,8 @@ #include "arrow/array/data.h" #include "arrow/compute/exec/expression.h" +#include "arrow/compute/type_fwd.h" #include "arrow/datum.h" -#include "arrow/memory_pool.h" #include "arrow/result.h" #include "arrow/type_fwd.h" #include "arrow/util/macros.h" @@ -39,17 +39,8 @@ #include "arrow/util/visibility.h" namespace arrow { -namespace internal { - -class CpuInfo; - -} // namespace internal - namespace compute { -class FunctionOptions; -class FunctionRegistry; - // It seems like 64K might be a good default chunksize to use for execution // based on the experience of other query processing systems. The current // default is not to chunk contiguous arrays, though, but this may change in @@ -127,8 +118,6 @@ class ARROW_EXPORT ExecContext { bool use_threads_ = true; }; -ARROW_EXPORT ExecContext* default_exec_context(); - // TODO: Consider standardizing on uint16 selection vectors and only use them // when we can ensure that each value is 64K length or smaller @@ -174,6 +163,10 @@ class ARROW_EXPORT SelectionVector { /// TODO: Datum uses arrow/util/variant.h which may be a bit heavier-weight /// than is desirable for this class. Microbenchmarks would help determine for /// sure. See ARROW-8928. + +/// \addtogroup execnode-components +/// @{ + struct ARROW_EXPORT ExecBatch { ExecBatch() = default; ExecBatch(std::vector values, int64_t length) @@ -244,13 +237,13 @@ struct ARROW_EXPORT ExecBatch { } std::string ToString() const; - - ARROW_EXPORT friend void PrintTo(const ExecBatch&, std::ostream*); }; inline bool operator==(const ExecBatch& l, const ExecBatch& r) { return l.Equals(r); } inline bool operator!=(const ExecBatch& l, const ExecBatch& r) { return !l.Equals(r); } +ARROW_EXPORT void PrintTo(const ExecBatch&, std::ostream*); + struct ExecValue { ArraySpan array = {}; const Scalar* scalar = NULLPTR; @@ -309,7 +302,7 @@ struct ExecValue { struct ARROW_EXPORT ExecResult { // The default value of the variant is ArraySpan - util::Variant> value; + std::variant> value; int64_t length() const { if (this->is_array_span()) { @@ -328,12 +321,12 @@ struct ARROW_EXPORT ExecResult { } ArraySpan* array_span() const { - return const_cast(&util::get(this->value)); + return const_cast(&std::get(this->value)); } bool is_array_span() const { return this->value.index() == 0; } const std::shared_ptr& array_data() const { - return util::get>(this->value); + return std::get>(this->value); } bool is_array_data() const { return this->value.index() == 1; } @@ -400,6 +393,8 @@ struct ARROW_EXPORT ExecSpan { std::vector values; }; +/// @} + /// \defgroup compute-call-function One-shot calls to compute functions /// /// @{ @@ -436,5 +431,30 @@ Result CallFunction(const std::string& func_name, const ExecBatch& batch, /// @} +/// \defgroup compute-function-executor One-shot calls to obtain function executors +/// +/// @{ + +/// \brief One-shot executor provider for all types of functions. +/// +/// This function creates and initializes a `FunctionExecutor` appropriate +/// for the given function name, input types and function options. +ARROW_EXPORT +Result> GetFunctionExecutor( + const std::string& func_name, std::vector in_types, + const FunctionOptions* options = NULLPTR, FunctionRegistry* func_registry = NULLPTR); + +/// \brief One-shot executor provider for all types of functions. 
+/// +/// This function creates and initializes a `FunctionExecutor` appropriate +/// for the given function name, input types (taken from the Datum arguments) +/// and function options. +ARROW_EXPORT +Result> GetFunctionExecutor( + const std::string& func_name, const std::vector& args, + const FunctionOptions* options = NULLPTR, FunctionRegistry* func_registry = NULLPTR); + +/// @} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/aggregate.cc b/cpp/src/arrow/compute/exec/aggregate.cc index cc2c464d42b..d5f347f34ae 100644 --- a/cpp/src/arrow/compute/exec/aggregate.cc +++ b/cpp/src/arrow/compute/exec/aggregate.cc @@ -18,14 +18,21 @@ #include "arrow/compute/exec/aggregate.h" #include +#include +#include #include "arrow/compute/exec_internal.h" #include "arrow/compute/registry.h" #include "arrow/compute/row/grouper.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" +#include "arrow/util/string.h" #include "arrow/util/task_group.h" namespace arrow { + +using internal::ToChars; + namespace compute { namespace internal { @@ -147,7 +154,7 @@ Result GroupBy(const std::vector& arguments, const std::vectoroutput_schema(); - auto exec_ctx = plan->exec_context(); + auto exec_ctx = plan->query_context()->exec_context(); std::vector kernels(aggregates.size()); std::vector>> states(kernels.size()); @@ -113,7 +114,7 @@ class ScalarAggregateNode : public ExecNode { } KernelContext kernel_ctx{exec_ctx}; - states[i].resize(plan->max_concurrency()); + states[i].resize(plan->query_context()->max_concurrency()); RETURN_NOT_OK(Kernel::InitAll(&kernel_ctx, KernelInitArgs{kernels[i], { @@ -150,7 +151,7 @@ class ScalarAggregateNode : public ExecNode { {"function.options", aggs_[i].options ? aggs_[i].options->ToString() : ""}, {"function.kind", std::string(kind_name()) + "::Consume"}}); - KernelContext batch_ctx{plan()->exec_context()}; + KernelContext batch_ctx{plan()->query_context()->exec_context()}; batch_ctx.SetState(states_[i][thread_index].get()); ExecSpan single_column_batch{{batch.values[target_field_ids_[i]]}, batch.length}; @@ -168,7 +169,7 @@ class ScalarAggregateNode : public ExecNode { {"batch.length", batch.length}}); DCHECK_EQ(input, inputs_[0]); - auto thread_index = plan_->GetThreadIndex(); + auto thread_index = plan_->query_context()->GetThreadIndex(); if (ErrorIfNotOk(DoConsume(ExecSpan(batch), thread_index))) return; @@ -245,7 +246,7 @@ class ScalarAggregateNode : public ExecNode { {"function.options", aggs_[i].options ? 
aggs_[i].options->ToString() : ""}, {"function.kind", std::string(kind_name()) + "::Finalize"}}); - KernelContext ctx{plan()->exec_context()}; + KernelContext ctx{plan()->query_context()->exec_context()}; ARROW_ASSIGN_OR_RAISE(auto merged, ScalarAggregateKernel::MergeAll( kernels_[i], &ctx, std::move(states_[i]))); RETURN_NOT_OK(kernels_[i]->finalize(&ctx, &batch.values[i])); @@ -267,20 +268,19 @@ class ScalarAggregateNode : public ExecNode { class GroupByNode : public ExecNode { public: - GroupByNode(ExecNode* input, std::shared_ptr output_schema, ExecContext* ctx, + GroupByNode(ExecNode* input, std::shared_ptr output_schema, std::vector key_field_ids, std::vector agg_src_field_ids, std::vector aggs, std::vector agg_kernels) : ExecNode(input->plan(), {input}, {"groupby"}, std::move(output_schema), /*num_outputs=*/1), - ctx_(ctx), key_field_ids_(std::move(key_field_ids)), agg_src_field_ids_(std::move(agg_src_field_ids)), aggs_(std::move(aggs)), agg_kernels_(std::move(agg_kernels)) {} Status Init() override { - output_task_group_id_ = plan_->RegisterTaskGroup( + output_task_group_id_ = plan_->query_context()->RegisterTaskGroup( [this](size_t, int64_t task_id) { OutputNthBatch(task_id); return Status::OK(); @@ -326,7 +326,7 @@ class GroupByNode : public ExecNode { agg_src_types[i] = input_schema->field(agg_src_field_id)->type().get(); } - auto ctx = input->plan()->exec_context(); + auto ctx = plan->query_context()->exec_context(); // Construct aggregates ARROW_ASSIGN_OR_RAISE(auto agg_kernels, @@ -354,7 +354,7 @@ class GroupByNode : public ExecNode { } return input->plan()->EmplaceNode( - input, schema(std::move(output_fields)), ctx, std::move(key_field_ids), + input, schema(std::move(output_fields)), std::move(key_field_ids), std::move(agg_src_field_ids), std::move(aggs), std::move(agg_kernels)); } @@ -366,7 +366,7 @@ class GroupByNode : public ExecNode { {{"group_by", ToStringExtra()}, {"node.label", label()}, {"batch.length", batch.length}}); - size_t thread_index = plan_->GetThreadIndex(); + size_t thread_index = plan_->query_context()->GetThreadIndex(); if (thread_index >= local_states_.size()) { return Status::IndexError("thread index ", thread_index, " is out of range [0, ", local_states_.size(), ")"); @@ -393,7 +393,8 @@ class GroupByNode : public ExecNode { {"function.options", aggs_[i].options ? aggs_[i].options->ToString() : ""}, {"function.kind", std::string(kind_name()) + "::Consume"}}); - KernelContext kernel_ctx{ctx_}; + auto ctx = plan_->query_context()->exec_context(); + KernelContext kernel_ctx{ctx}; kernel_ctx.SetState(state->agg_states[i].get()); ExecSpan agg_batch({batch[agg_src_field_ids_[i]], ExecValue(*id_batch.array())}, @@ -429,7 +430,9 @@ class GroupByNode : public ExecNode { {"function.options", aggs_[i].options ? aggs_[i].options->ToString() : ""}, {"function.kind", std::string(kind_name()) + "::Merge"}}); - KernelContext batch_ctx{ctx_}; + + auto ctx = plan_->query_context()->exec_context(); + KernelContext batch_ctx{ctx}; DCHECK(state0->agg_states[i]); batch_ctx.SetState(state0->agg_states[i].get()); @@ -462,7 +465,7 @@ class GroupByNode : public ExecNode { {"function.options", aggs_[i].options ? 
aggs_[i].options->ToString() : ""}, {"function.kind", std::string(kind_name()) + "::Finalize"}}); - KernelContext batch_ctx{ctx_}; + KernelContext batch_ctx{plan_->query_context()->exec_context()}; batch_ctx.SetState(state->agg_states[i].get()); RETURN_NOT_OK(agg_kernels_[i]->finalize(&batch_ctx, &out_data.values[i])); state->agg_states[i].reset(); @@ -483,7 +486,7 @@ class GroupByNode : public ExecNode { outputs_[0]->InputReceived(this, out_data_.Slice(batch_size * n, batch_size)); } - Status OutputResult() { + Status DoOutputResult() { // To simplify merging, ensure that the first grouper is nonempty for (size_t i = 0; i < local_states_.size(); i++) { if (local_states_[i].grouper) { @@ -497,10 +500,28 @@ class GroupByNode : public ExecNode { int64_t num_output_batches = bit_util::CeilDiv(out_data_.length, output_batch_size()); outputs_[0]->InputFinished(this, static_cast(num_output_batches)); - RETURN_NOT_OK(plan_->StartTaskGroup(output_task_group_id_, num_output_batches)); + Status st = + plan_->query_context()->StartTaskGroup(output_task_group_id_, num_output_batches); + if (st.IsCancelled()) { + // This means the user has cancelled/aborted the plan. We will not send any batches + // and end immediately. + finished_.MarkFinished(); + return Status::OK(); + } else { + return st; + } return Status::OK(); } + void OutputResult() { + // If something goes wrong outputting the result we need to make sure + // we still mark finished. + Status st = DoOutputResult(); + if (!st.ok()) { + finished_.MarkFinished(st); + } + } + void InputReceived(ExecNode* input, ExecBatch batch) override { EVENT(span_, "InputReceived", {{"batch.length", batch.length}}); util::tracing::Span span; @@ -517,7 +538,7 @@ class GroupByNode : public ExecNode { if (ErrorIfNotOk(Consume(ExecSpan(batch)))) return; if (input_counter_.Increment()) { - ErrorIfNotOk(OutputResult()); + OutputResult(); } } @@ -538,7 +559,7 @@ class GroupByNode : public ExecNode { DCHECK_EQ(input, inputs_[0]); if (input_counter_.SetTotal(total_batches)) { - ErrorIfNotOk(OutputResult()); + OutputResult(); } } @@ -547,8 +568,7 @@ class GroupByNode : public ExecNode { {{"node.label", label()}, {"node.detail", ToString()}, {"node.kind", kind_name()}}); - - local_states_.resize(plan_->max_concurrency()); + local_states_.resize(plan_->query_context()->max_concurrency()); return Status::OK(); } @@ -566,7 +586,9 @@ class GroupByNode : public ExecNode { EVENT(span_, "StopProducing"); DCHECK_EQ(output, outputs_[0]); - if (input_counter_.Cancel()) finished_.MarkFinished(); + if (input_counter_.Cancel()) { + finished_.MarkFinished(); + } inputs_[0]->StopProducing(this); } @@ -593,7 +615,7 @@ class GroupByNode : public ExecNode { }; ThreadLocalState* GetLocalState() { - size_t thread_index = plan_->GetThreadIndex(); + size_t thread_index = plan_->query_context()->GetThreadIndex(); return &local_states_[thread_index]; } @@ -611,7 +633,8 @@ class GroupByNode : public ExecNode { } // Construct grouper - ARROW_ASSIGN_OR_RAISE(state->grouper, Grouper::Make(key_types, ctx_)); + ARROW_ASSIGN_OR_RAISE( + state->grouper, Grouper::Make(key_types, plan_->query_context()->exec_context())); // Build vector of aggregate source field data types std::vector agg_src_types(agg_kernels_.size()); @@ -620,21 +643,23 @@ class GroupByNode : public ExecNode { agg_src_types[i] = input_schema->field(agg_src_field_id)->type().get(); } - ARROW_ASSIGN_OR_RAISE(state->agg_states, internal::InitKernels(agg_kernels_, ctx_, - aggs_, agg_src_types)); + ARROW_ASSIGN_OR_RAISE( + 
state->agg_states, + internal::InitKernels(agg_kernels_, plan_->query_context()->exec_context(), aggs_, + agg_src_types)); return Status::OK(); } int output_batch_size() const { - int result = static_cast(ctx_->exec_chunksize()); + int result = + static_cast(plan_->query_context()->exec_context()->exec_chunksize()); if (result < 0) { result = 32 * 1024; } return result; } - ExecContext* ctx_; int output_task_group_id_; const std::vector key_field_ids_; diff --git a/cpp/src/arrow/compute/exec/asof_join_benchmark.cc b/cpp/src/arrow/compute/exec/asof_join_benchmark.cc index af471a50132..7ab4ee7a7ff 100644 --- a/cpp/src/arrow/compute/exec/asof_join_benchmark.cc +++ b/cpp/src/arrow/compute/exec/asof_join_benchmark.cc @@ -53,8 +53,8 @@ static void TableJoinOverhead(benchmark::State& state, TableGenerationProperties left_table_properties, TableGenerationProperties right_table_properties, int batch_size, int num_right_tables, - std::string factory_name, ExecNodeOptions& options) { - ExecContext ctx(default_memory_pool(), nullptr); + std::string factory_name, + std::shared_ptr options) { left_table_properties.column_prefix = "lt"; left_table_properties.seed = 0; ASSERT_OK_AND_ASSIGN(TableStats left_table_stats, MakeTable(left_table_properties)); @@ -75,23 +75,20 @@ static void TableJoinOverhead(benchmark::State& state, for (auto _ : state) { state.PauseTiming(); - ASSERT_OK_AND_ASSIGN(std::shared_ptr plan, - ExecPlan::Make(&ctx)); - std::vector input_nodes = {*arrow::compute::MakeExecNode( - "table_source", plan.get(), {}, + std::vector input_nodes = {Declaration( + "table_source", arrow::compute::TableSourceNodeOptions(left_table_stats.table, batch_size))}; input_nodes.reserve(right_input_tables.size() + 1); for (TableStats table_stats : right_input_tables) { - input_nodes.push_back(*arrow::compute::MakeExecNode( - "table_source", plan.get(), {}, + input_nodes.push_back(Declaration( + "table_source", arrow::compute::TableSourceNodeOptions(table_stats.table, batch_size))); } - ASSERT_OK_AND_ASSIGN(arrow::compute::ExecNode * join_node, - MakeExecNode(factory_name, plan.get(), input_nodes, options)); - AsyncGenerator> sink_gen; - MakeExecNode("sink", plan.get(), {join_node}, SinkNodeOptions{&sink_gen}); + Declaration join_node{factory_name, {input_nodes}, options}; state.ResumeTiming(); - ASSERT_FINISHES_OK(StartAndCollect(plan.get(), sink_gen)); + // asof-join must currently be run synchronously as it relies on data arriving + // in-order + ASSERT_OK(DeclarationToStatus(std::move(join_node), /*use_threads=*/false)); } state.counters["input_rows_per_second"] = benchmark::Counter( @@ -104,12 +101,22 @@ static void TableJoinOverhead(benchmark::State& state, benchmark::Counter::kIsRate); state.counters["maximum_peak_memory"] = - benchmark::Counter(static_cast(ctx.memory_pool()->max_memory())); + benchmark::Counter(static_cast(default_memory_pool()->max_memory())); +} + +AsofJoinNodeOptions GetRepeatedOptions(size_t repeat, FieldRef on_key, + std::vector by_key, int64_t tolerance) { + std::vector input_keys(repeat); + for (size_t i = 0; i < repeat; i++) { + input_keys[i] = {on_key, by_key}; + } + return AsofJoinNodeOptions(input_keys, tolerance); } static void AsOfJoinOverhead(benchmark::State& state) { int64_t tolerance = 0; - AsofJoinNodeOptions options = AsofJoinNodeOptions(kTimeCol, kKeyCol, tolerance); + auto options = std::make_shared( + GetRepeatedOptions(int(state.range(4) + 1), kTimeCol, {kKeyCol}, tolerance)); TableJoinOverhead( state, TableGenerationProperties{int(state.range(0)), 
int(state.range(1)), @@ -118,7 +125,7 @@ static void AsOfJoinOverhead(benchmark::State& state) { TableGenerationProperties{int(state.range(5)), int(state.range(6)), int(state.range(7)), "", kDefaultMinColumnVal, kDefaultMaxColumnVal, 0, kDefaultStart, kDefaultEnd}, - int(state.range(3)), int(state.range(4)), "asofjoin", options); + int(state.range(3)), int(state.range(4)), "asofjoin", std::move(options)); } // this generates the set of right hand tables to test on. diff --git a/cpp/src/arrow/compute/exec/asof_join_node.cc b/cpp/src/arrow/compute/exec/asof_join_node.cc index 3da612aa03e..a752cf800d9 100644 --- a/cpp/src/arrow/compute/exec/asof_join_node.cc +++ b/cpp/src/arrow/compute/exec/asof_join_node.cc @@ -15,36 +15,105 @@ // specific language governing permissions and limitations // under the License. +#include "arrow/compute/exec/asof_join_node.h" + #include +#include +#include #include -#include +#include +#include #include #include +#include +#include "arrow/array/builder_binary.h" #include "arrow/array/builder_primitive.h" #include "arrow/compute/exec/exec_plan.h" +#include "arrow/compute/exec/key_hash.h" #include "arrow/compute/exec/options.h" +#include "arrow/compute/exec/query_context.h" #include "arrow/compute/exec/schema_util.h" #include "arrow/compute/exec/util.h" +#include "arrow/compute/light_array.h" #include "arrow/record_batch.h" #include "arrow/result.h" #include "arrow/status.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/future.h" -#include "arrow/util/make_unique.h" -#include "arrow/util/optional.h" +#include "arrow/util/string.h" namespace arrow { + +using internal::ToChars; + namespace compute { -// Remove this when multiple keys and/or types is supported -typedef int32_t KeyType; +template +inline typename T::const_iterator std_find(const T& container, const V& val) { + return std::find(container.begin(), container.end(), val); +} + +template +inline bool std_has(const T& container, const V& val) { + return container.end() != std_find(container, val); +} + +typedef uint64_t ByType; +typedef uint64_t OnType; +typedef uint64_t HashType; + +/// A tolerance type with overflow-avoiding operations +struct TolType { + constexpr static OnType kMinValue = std::numeric_limits::lowest(); + constexpr static OnType kMaxValue = std::numeric_limits::max(); + + explicit TolType(int64_t tol) + : value(static_cast(tol < 0 ? -tol : tol)), negative(tol < 0) {} + + OnType value; + bool negative; + + // an entry with a time below this threshold expires + inline OnType Expiry(OnType left_value) { + return negative ? left_value + : (left_value < kMinValue + value ? kMinValue : left_value - value); + } + + // an entry with a time after this threshold is distant + inline OnType Horizon(OnType left_value) { + return negative ? (left_value > kMaxValue - value ? kMaxValue : left_value + value) + : left_value; + } + + // true when the tolerance accepts the RHS time given the LHS one + inline bool Accepts(OnType left_value, OnType right_value) { + return negative + ? (left_value > right_value ? false : right_value - left_value <= value) + : (left_value < right_value ? 
false : left_value - right_value <= value); + } +}; // Maximum number of tables that can be joined #define MAX_JOIN_TABLES 64 typedef uint64_t row_index_t; typedef int col_index_t; +// normalize the value to 64-bits while preserving ordering of values +template ::value, bool> = true> +static inline uint64_t time_value(T t) { + uint64_t bias = std::is_signed::value ? (uint64_t)1 << (8 * sizeof(T) - 1) : 0; + return t < 0 ? static_cast(t + bias) : static_cast(t); +} + +// indicates normalization of a key value +template ::value, bool> = true> +static inline uint64_t key_value(T t) { + return static_cast(t); +} + /** * Simple implementation for an unbound concurrent queue */ @@ -65,11 +134,16 @@ class ConcurrentQueue { cond_.notify_one(); } - util::optional TryPop() { + void Clear() { + std::unique_lock lock(mutex_); + queue_ = std::queue(); + } + + std::optional TryPop() { // Try to pop the oldest value from the queue (or return nullopt if none) std::unique_lock lock(mutex_); if (queue_.empty()) { - return util::nullopt; + return std::nullopt; } else { auto item = queue_.front(); queue_.pop(); @@ -98,8 +172,19 @@ struct MemoStore { // Stores last known values for all the keys struct Entry { + Entry() = default; + + Entry(OnType time, std::shared_ptr batch, row_index_t row) + : time(time), batch(batch), row(row) {} + + void swap(Entry& other) { + std::swap(time, other.time); + std::swap(batch, other.batch); + std::swap(row, other.row); + } + // Timestamp associated with the entry - int64_t time; + OnType time; // Batch associated with the entry (perf is probably OK for this; batches change // rarely) @@ -109,46 +194,191 @@ struct MemoStore { row_index_t row; }; - std::unordered_map entries_; + explicit MemoStore(bool no_future) + : no_future_(no_future), current_time_(std::numeric_limits::lowest()) {} + + // true when there are no future entries, which is the case for the LHS table and the + // case for when the tolerance is positive. A regular non-negative-tolerance as-of-join + // operation requires memorizing only the most recently observed entry per key. OTOH, a + // negative-tolerance (future) as-of-join operation requires memorizing per-key queues + // of entries up to the tolerance's horizon and in particular distinguishes between the + // current (front-of-queue) and latest (back-of-queue) entries per key. + bool no_future_; + // the time of the current entry, defaulting to 0. + // when entries with a time less than T are removed, the current time is updated to the + // time of the next (by-time) and now-current entry or to T if no such entry exists. 
+ OnType current_time_; + // current entry per key + std::unordered_map entries_; + // future entries per key + std::unordered_map> future_entries_; + // current and future (distinct) times of existing entries + std::deque times_; + + void swap(MemoStore& memo) { + std::swap(no_future_, memo.no_future_); + std::swap(current_time_, memo.current_time_); + entries_.swap(memo.entries_); + future_entries_.swap(memo.future_entries_); + times_.swap(memo.times_); + } - void Store(const std::shared_ptr& batch, row_index_t row, int64_t time, - KeyType key) { - auto& e = entries_[key]; - // that we can do this assignment optionally, is why we - // can get array with using shared_ptr above (the batch - // shouldn't change that often) - if (e.batch != batch) e.batch = batch; - e.row = row; - e.time = time; + void Store(OnType for_time, const std::shared_ptr& batch, row_index_t row, + OnType time, ByType key) { + if (no_future_ || entries_.count(key) == 0) { + auto& e = entries_[key]; + // that we can do this assignment optionally, is why we + // can get away with using shared_ptr above (the batch + // shouldn't change that often) + if (e.batch != batch) e.batch = batch; + e.row = row; + e.time = time; + } else { + future_entries_[key].emplace(time, batch, row); + } + if (!no_future_ || times_.empty() || times_.front() != time) { + times_.push_back(time); + } else { + times_.front() = time; + } } - util::optional GetEntryForKey(KeyType key) const { + std::optional GetEntryForKey(ByType key) const { auto e = entries_.find(key); - if (entries_.end() == e) return util::nullopt; - return util::optional(&e->second); + return entries_.end() == e ? std::nullopt : std::optional(&e->second); } - void RemoveEntriesWithLesserTime(int64_t ts) { - for (auto e = entries_.begin(); e != entries_.end();) - if (e->second.time < ts) - e = entries_.erase(e); - else + bool RemoveEntriesWithLesserTime(OnType ts) { + for (auto fe = future_entries_.begin(); fe != future_entries_.end();) { + auto& queue = fe->second; + while (!queue.empty() && queue.front().time < ts) queue.pop(); + if (queue.empty()) { + fe = future_entries_.erase(fe); + } else { + ++fe; + } + } + for (auto e = entries_.begin(); e != entries_.end();) { + if (e->second.time < ts) { + auto fe = future_entries_.find(e->first); + if (fe != future_entries_.end() && !fe->second.empty()) { + auto& queue = fe->second; + e->second.swap(queue.front()); + queue.pop(); + ++e; + } else { + e = entries_.erase(e); + } + } else { ++e; + } + } + bool updated = false; + while (!times_.empty() && times_.front() < ts) { + current_time_ = times_.front(); + times_.pop_front(); + updated = true; + } + for (auto times_it = times_.begin(); times_it != times_.end(); times_it++) { + if (current_time_ < *times_it) { + current_time_ = *times_it; + updated = true; + } + if (*times_it > ts) break; + } + if (current_time_ < ts) { + current_time_ = ts; + updated = true; + } + return updated; } }; +// a specialized higher-performance variation of Hashing64 logic from hash_join_node +// the code here avoids recreating objects that are independent of each batch processed +class KeyHasher { + static constexpr int kMiniBatchLength = util::MiniBatch::kMiniBatchLength; + + public: + explicit KeyHasher(const std::vector& indices) + : indices_(indices), + metadata_(indices.size()), + batch_(NULLPTR), + hashes_(), + ctx_(), + column_arrays_(), + stack_() { + ctx_.stack = &stack_; + column_arrays_.resize(indices.size()); + } + + Status Init(ExecContext* exec_context, const std::shared_ptr& schema) { + 
ctx_.hardware_flags = exec_context->cpu_info()->hardware_flags(); + const auto& fields = schema->fields(); + for (size_t k = 0; k < metadata_.size(); k++) { + ARROW_ASSIGN_OR_RAISE(metadata_[k], + ColumnMetadataFromDataType(fields[indices_[k]]->type())); + } + return stack_.Init(exec_context->memory_pool(), + 4 * kMiniBatchLength * sizeof(uint32_t)); + } + + const std::vector& HashesFor(const RecordBatch* batch) { + if (batch_ == batch) { + return hashes_; + } + batch_ = NULLPTR; // invalidate cached hashes for batch + size_t batch_length = batch->num_rows(); + hashes_.resize(batch_length); + for (int64_t i = 0; i < static_cast(batch_length); i += kMiniBatchLength) { + int64_t length = std::min(static_cast(batch_length - i), + static_cast(kMiniBatchLength)); + for (size_t k = 0; k < indices_.size(); k++) { + auto array_data = batch->column_data(indices_[k]); + column_arrays_[k] = + ColumnArrayFromArrayDataAndMetadata(array_data, metadata_[k], i, length); + } + Hashing64::HashMultiColumn(column_arrays_, &ctx_, hashes_.data() + i); + } + batch_ = batch; + return hashes_; + } + + private: + std::vector indices_; + std::vector metadata_; + const RecordBatch* batch_; + std::vector hashes_; + LightContext ctx_; + std::vector column_arrays_; + util::TempVectorStack stack_; +}; + class InputState { // InputState correponds to an input // Input record batches are queued up in InputState until processed and // turned into output record batches. public: - InputState(const std::shared_ptr& schema, - const std::string& time_col_name, const std::string& key_col_name) + InputState(size_t index, TolType tolerance, bool must_hash, bool may_rehash, + KeyHasher* key_hasher, const std::shared_ptr& schema, + const col_index_t time_col_index, + const std::vector& key_col_index) : queue_(), schema_(schema), - time_col_index_(schema->GetFieldIndex(time_col_name)), - key_col_index_(schema->GetFieldIndex(key_col_name)) {} + time_col_index_(time_col_index), + key_col_index_(key_col_index), + time_type_id_(schema_->fields()[time_col_index_]->type()->id()), + key_type_id_(key_col_index.size()), + key_hasher_(key_hasher), + must_hash_(must_hash), + may_rehash_(may_rehash), + tolerance_(tolerance), + memo_(/*no_future=*/index == 0 || !tolerance.negative) { + for (size_t k = 0; k < key_col_index_.size(); k++) { + key_type_id_[k] = schema_->fields()[key_col_index_[k]]->type()->id(); + } + } col_index_t InitSrcToDstMapping(col_index_t dst_offset, bool skip_time_and_key_fields) { src_to_dst_.resize(schema_->num_fields()); @@ -158,13 +388,13 @@ class InputState { return dst_offset; } - const util::optional& MapSrcToDst(col_index_t src) const { + const std::optional& MapSrcToDst(col_index_t src) const { return src_to_dst_[src]; } bool IsTimeOrKeyColumn(col_index_t i) const { DCHECK_LT(i, schema_->num_fields()); - return (i == time_col_index_) || (i == key_col_index_); + return (i == time_col_index_) || std_has(key_col_index_, i); } // Gets the latest row index, assuming the queue isn't empty @@ -177,6 +407,20 @@ class InputState { return queue_.Empty(); } + // true when the queue is empty and, when memo may have future entries (the case of a + // negative tolerance), when the memo is empty. + // used when checking whether RHS is up to date with LHS. + bool CurrentEmpty() const { + return memo_.no_future_ ? Empty() : memo_.times_.empty() && Empty(); + } + + // in case memo may not have future entries (the case of a non-negative tolerance), + // returns the latest time (which is current); otherwise, returns the current time. 
+ // used when checking whether RHS is up to date with LHS. + OnType GetCurrentTime() const { + return memo_.no_future_ ? GetLatestTime() : memo_.current_time_; + } + int total_batches() const { return total_batches_; } // Gets latest batch (precondition: must not be empty) @@ -184,27 +428,87 @@ class InputState { return queue_.UnsyncFront(); } - KeyType GetLatestKey() const { - return queue_.UnsyncFront() - ->column_data(key_col_index_) - ->GetValues(1)[latest_ref_row_]; +#define LATEST_VAL_CASE(id, val) \ + case Type::id: { \ + using T = typename TypeIdTraits::Type; \ + using CType = typename TypeTraits::CType; \ + return val(data->GetValues(1)[row]); \ } - int64_t GetLatestTime() const { - return queue_.UnsyncFront() - ->column_data(time_col_index_) - ->GetValues(1)[latest_ref_row_]; + inline ByType GetLatestKey() const { + return GetKey(GetLatestBatch().get(), latest_ref_row_); } + inline ByType GetKey(const RecordBatch* batch, row_index_t row) const { + if (must_hash_) { + return key_hasher_->HashesFor(batch)[row]; + } + if (key_col_index_.size() == 0) { + return 0; + } + auto data = batch->column_data(key_col_index_[0]); + switch (key_type_id_[0]) { + LATEST_VAL_CASE(INT8, key_value) + LATEST_VAL_CASE(INT16, key_value) + LATEST_VAL_CASE(INT32, key_value) + LATEST_VAL_CASE(INT64, key_value) + LATEST_VAL_CASE(UINT8, key_value) + LATEST_VAL_CASE(UINT16, key_value) + LATEST_VAL_CASE(UINT32, key_value) + LATEST_VAL_CASE(UINT64, key_value) + LATEST_VAL_CASE(DATE32, key_value) + LATEST_VAL_CASE(DATE64, key_value) + LATEST_VAL_CASE(TIME32, key_value) + LATEST_VAL_CASE(TIME64, key_value) + LATEST_VAL_CASE(TIMESTAMP, key_value) + default: + DCHECK(false); + return 0; // cannot happen + } + } + + inline OnType GetLatestTime() const { + return GetTime(GetLatestBatch().get(), latest_ref_row_); + } + + inline ByType GetTime(const RecordBatch* batch, row_index_t row) const { + auto data = batch->column_data(time_col_index_); + switch (time_type_id_) { + LATEST_VAL_CASE(INT8, time_value) + LATEST_VAL_CASE(INT16, time_value) + LATEST_VAL_CASE(INT32, time_value) + LATEST_VAL_CASE(INT64, time_value) + LATEST_VAL_CASE(UINT8, time_value) + LATEST_VAL_CASE(UINT16, time_value) + LATEST_VAL_CASE(UINT32, time_value) + LATEST_VAL_CASE(UINT64, time_value) + LATEST_VAL_CASE(DATE32, time_value) + LATEST_VAL_CASE(DATE64, time_value) + LATEST_VAL_CASE(TIME32, time_value) + LATEST_VAL_CASE(TIME64, time_value) + LATEST_VAL_CASE(TIMESTAMP, time_value) + default: + DCHECK(false); + return 0; // cannot happen + } + } + +#undef LATEST_VAL_CASE + bool Finished() const { return batches_processed_ == total_batches_; } - bool Advance() { + Result Advance() { // Try advancing to the next row and update latest_ref_row_ // Returns true if able to advance, false if not. bool have_active_batch = (latest_ref_row_ > 0 /*short circuit the lock on the queue*/) || !queue_.Empty(); if (have_active_batch) { + OnType next_time = GetLatestTime(); + if (latest_time_ > next_time) { + return Status::Invalid("AsofJoin does not allow out-of-order on-key values"); + } + latest_time_ = next_time; // If we have an active batch if (++latest_ref_row_ >= (row_index_t)queue_.UnsyncFront()->num_rows()) { // hit the end of the batch, need to get the next batch if possible. @@ -218,59 +522,85 @@ class InputState { return have_active_batch; } - // Advance the data to be immediately past the specified timestamp, update - // latest_time and latest_ref_row to the value that immediately pass the - // specified timestamp. 
+ // Advance the data to be immediately past the tolerance's horizon for the specified + // timestamp, update latest_time and latest_ref_row to the value that immediately pass + // the horizon. Update the memo-store with any entries or future entries so observed. // Returns true if updates were made, false if not. - bool AdvanceAndMemoize(int64_t ts) { + Result AdvanceAndMemoize(OnType ts) { // Advance the right side row index until we reach the latest right row (for each key) // for the given left timestamp. // Check if already updated for TS (or if there is no latest) - if (Empty()) return false; // can't advance if empty - auto latest_time = GetLatestTime(); - if (latest_time > ts) return false; // already advanced + if (Empty()) { // can't advance if empty and no future entries + return memo_.no_future_ ? false : memo_.RemoveEntriesWithLesserTime(ts); + } // Not updated. Try to update and possibly advance. - bool updated = false; + bool advanced, updated = false; do { - latest_time = GetLatestTime(); + auto latest_time = GetLatestTime(); // if Advance() returns true, then the latest_ts must also be valid // Keep advancing right table until we hit the latest row that has // timestamp <= ts. This is because we only need the latest row for the // match given a left ts. - if (latest_time <= ts) { - memo_.Store(GetLatestBatch(), latest_ref_row_, latest_time, GetLatestKey()); - } else { - break; // hit a future timestamp -- done updating for now + if (latest_time > tolerance_.Horizon(ts)) { // hit a distant timestamp + if (memo_.no_future_ || !memo_.times_.empty()) break; // no future entries } - updated = true; - } while (Advance()); + auto rb = GetLatestBatch(); + if (may_rehash_ && rb->column_data(key_col_index_[0])->GetNullCount() > 0) { + must_hash_ = true; + may_rehash_ = false; + Rehash(); + } + memo_.Store(ts, rb, latest_ref_row_, latest_time, GetLatestKey()); + updated = memo_.no_future_; + ARROW_ASSIGN_OR_RAISE(advanced, Advance()); + } while (advanced); + if (!memo_.no_future_) { // "updated" was not modified in the loop; set it here + updated = memo_.RemoveEntriesWithLesserTime(ts); + } return updated; } - void Push(const std::shared_ptr& rb) { + void Rehash() { + MemoStore new_memo(memo_.no_future_); + new_memo.current_time_ = memo_.current_time_; + for (auto e = memo_.entries_.begin(); e != memo_.entries_.end(); ++e) { + auto& entry = e->second; + auto new_key = GetKey(entry.batch.get(), entry.row); + new_memo.entries_[new_key].swap(entry); + auto fe = memo_.future_entries_.find(e->first); + if (fe != memo_.future_entries_.end()) { + new_memo.future_entries_[new_key].swap(fe->second); + } + } + memo_.times_.swap(new_memo.times_); + memo_.swap(new_memo); + } + + Status Push(const std::shared_ptr& rb) { if (rb->num_rows() > 0) { queue_.Push(rb); } else { ++batches_processed_; // don't enqueue empty batches, just record as processed } + return Status::OK(); } - util::optional GetMemoEntryForKey(KeyType key) { + std::optional GetMemoEntryForKey(ByType key) { return memo_.GetEntryForKey(key); } - util::optional GetMemoTimeForKey(KeyType key) { + std::optional GetMemoTimeForKey(ByType key) { auto r = GetMemoEntryForKey(key); if (r.has_value()) { return (*r)->time; } else { - return util::nullopt; + return std::nullopt; } } - void RemoveMemoEntriesWithLesserTime(int64_t ts) { + void RemoveMemoEntriesWithLesserTime(OnType ts) { memo_.RemoveEntriesWithLesserTime(ts); } @@ -294,14 +624,28 @@ class InputState { // Index of the time col col_index_t time_col_index_; // Index of the key col 
- col_index_t key_col_index_; + std::vector key_col_index_; + // Type id of the time column + Type::type time_type_id_; + // Type id of the key column + std::vector key_type_id_; + // Hasher for key elements + mutable KeyHasher* key_hasher_; + // True if hashing is mandatory + bool must_hash_; + // True if by-key values may be rehashed + bool may_rehash_; + // Tolerance + TolType tolerance_; // Index of the latest row reference within; if >0 then queue_ cannot be empty // Must be < queue_.front()->num_rows() if queue_ is non-empty row_index_t latest_ref_row_ = 0; + // Time of latest row + OnType latest_time_ = std::numeric_limits::lowest(); // Stores latest known values for the various keys MemoStore memo_; // Mapping of source columns to destination columns - std::vector> src_to_dst_; + std::vector> src_to_dst_; }; template @@ -336,18 +680,18 @@ class CompositeReferenceTable { // Adds the latest row from the input state as a new composite reference row // - LHS must have a valid key,timestep,and latest rows // - RHS must have valid data memo'ed for the key - void Emplace(std::vector>& in, int64_t tolerance) { + void Emplace(std::vector>& in, TolType tolerance) { DCHECK_EQ(in.size(), n_tables_); // Get the LHS key - KeyType key = in[0]->GetLatestKey(); + ByType key = in[0]->GetLatestKey(); // Add row and setup LHS // (the LHS state comes just from the latest row of the LHS table) DCHECK(!in[0]->Empty()); const std::shared_ptr& lhs_latest_batch = in[0]->GetLatestBatch(); row_index_t lhs_latest_row = in[0]->GetLatestRow(); - int64_t lhs_latest_time = in[0]->GetLatestTime(); + OnType lhs_latest_time = in[0]->GetLatestTime(); if (0 == lhs_latest_row) { // On the first row of the batch, we resize the destination. // The destination size is dictated by the size of the LHS batch. 
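// A minimal standalone sketch of the overflow-avoiding tolerance arithmetic used by
// TolType above; the names here (SimpleTol, the sample values) are illustrative only.
// For a non-negative tolerance tol, a right-hand row at time rt matches a left-hand row
// at time lt when lt - tol <= rt <= lt; for a negative ("future") tolerance it matches
// when lt <= rt <= lt + |tol|. Expiry() clamps at 0 so the subtraction cannot wrap
// around, and Horizon() clamps at the maximum so the addition cannot overflow.
#include <cassert>
#include <cstdint>
#include <limits>

struct SimpleTol {
  uint64_t value;
  bool negative;
  explicit SimpleTol(int64_t tol)
      : value(static_cast<uint64_t>(tol < 0 ? -tol : tol)), negative(tol < 0) {}

  // Entries with a time below this threshold can be pruned from the memo.
  uint64_t Expiry(uint64_t lt) const {
    return negative ? lt : (lt < value ? 0 : lt - value);
  }
  // Entries with a time above this threshold are "distant" and need not be inspected yet.
  uint64_t Horizon(uint64_t lt) const {
    return negative ? (lt > std::numeric_limits<uint64_t>::max() - value
                           ? std::numeric_limits<uint64_t>::max()
                           : lt + value)
                    : lt;
  }
  // True when the right-hand time rt is matchable with the left-hand time lt.
  bool Accepts(uint64_t lt, uint64_t rt) const {
    return negative ? (lt <= rt && rt - lt <= value) : (rt <= lt && lt - rt <= value);
  }
};

int main() {
  SimpleTol past(5);  // matches right rows in [lt - 5, lt]
  assert(past.Accepts(100, 96) && !past.Accepts(100, 94));
  assert(past.Expiry(100) == 95 && past.Expiry(3) == 0);  // clamped, no wrap-around

  SimpleTol future(-5);  // matches right rows in [lt, lt + 5]
  assert(future.Accepts(100, 104) && !future.Accepts(100, 99));
  assert(future.Horizon(100) == 105);
  return 0;
}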
@@ -364,10 +708,10 @@ class CompositeReferenceTable { // Get the state for that key from all on the RHS -- assumes it's up to date // (the RHS state comes from the memoized row references) for (size_t i = 1; i < in.size(); ++i) { - util::optional opt_entry = in[i]->GetMemoEntryForKey(key); + std::optional opt_entry = in[i]->GetMemoEntryForKey(key); if (opt_entry.has_value()) { DCHECK(*opt_entry); - if ((*opt_entry)->time + tolerance >= lhs_latest_time) { + if (tolerance.Accepts(lhs_latest_time, (*opt_entry)->time)) { // Have a valid entry const MemoStore::Entry* entry = *opt_entry; row.refs[i].batch = entry->batch.get(); @@ -397,7 +741,7 @@ class CompositeReferenceTable { int n_src_cols = state.at(i_table)->get_schema()->num_fields(); { for (col_index_t i_src_col = 0; i_src_col < n_src_cols; ++i_src_col) { - util::optional i_dst_col_opt = + std::optional i_dst_col_opt = state[i_table]->MapSrcToDst(i_src_col); if (!i_dst_col_opt) continue; col_index_t i_dst_col = *i_dst_col_opt; @@ -407,29 +751,43 @@ class CompositeReferenceTable { DCHECK_EQ(src_field->name(), dst_field->name()); const auto& field_type = src_field->type(); - if (field_type->Equals(arrow::int32())) { - ARROW_ASSIGN_OR_RAISE( - arrays.at(i_dst_col), - (MaterializePrimitiveColumn( - memory_pool, i_table, i_src_col))); - } else if (field_type->Equals(arrow::int64())) { - ARROW_ASSIGN_OR_RAISE( - arrays.at(i_dst_col), - (MaterializePrimitiveColumn( - memory_pool, i_table, i_src_col))); - } else if (field_type->Equals(arrow::float32())) { - ARROW_ASSIGN_OR_RAISE(arrays.at(i_dst_col), - (MaterializePrimitiveColumn( - memory_pool, i_table, i_src_col))); - } else if (field_type->Equals(arrow::float64())) { - ARROW_ASSIGN_OR_RAISE( - arrays.at(i_dst_col), - (MaterializePrimitiveColumn( - memory_pool, i_table, i_src_col))); - } else { - ARROW_RETURN_NOT_OK( - Status::Invalid("Unsupported data type: ", src_field->name())); +#define ASOFJOIN_MATERIALIZE_CASE(id) \ + case Type::id: { \ + using T = typename TypeIdTraits::Type; \ + ARROW_ASSIGN_OR_RAISE( \ + arrays.at(i_dst_col), \ + MaterializeColumn(memory_pool, field_type, i_table, i_src_col)); \ + break; \ + } + + switch (field_type->id()) { + ASOFJOIN_MATERIALIZE_CASE(BOOL) + ASOFJOIN_MATERIALIZE_CASE(INT8) + ASOFJOIN_MATERIALIZE_CASE(INT16) + ASOFJOIN_MATERIALIZE_CASE(INT32) + ASOFJOIN_MATERIALIZE_CASE(INT64) + ASOFJOIN_MATERIALIZE_CASE(UINT8) + ASOFJOIN_MATERIALIZE_CASE(UINT16) + ASOFJOIN_MATERIALIZE_CASE(UINT32) + ASOFJOIN_MATERIALIZE_CASE(UINT64) + ASOFJOIN_MATERIALIZE_CASE(FLOAT) + ASOFJOIN_MATERIALIZE_CASE(DOUBLE) + ASOFJOIN_MATERIALIZE_CASE(DATE32) + ASOFJOIN_MATERIALIZE_CASE(DATE64) + ASOFJOIN_MATERIALIZE_CASE(TIME32) + ASOFJOIN_MATERIALIZE_CASE(TIME64) + ASOFJOIN_MATERIALIZE_CASE(TIMESTAMP) + ASOFJOIN_MATERIALIZE_CASE(STRING) + ASOFJOIN_MATERIALIZE_CASE(LARGE_STRING) + ASOFJOIN_MATERIALIZE_CASE(BINARY) + ASOFJOIN_MATERIALIZE_CASE(LARGE_BINARY) + default: + return Status::Invalid("Unsupported data type ", + src_field->type()->ToString(), " for field ", + src_field->name()); } + +#undef ASOFJOIN_MATERIALIZE_CASE } } } @@ -458,18 +816,50 @@ class CompositeReferenceTable { void AddRecordBatchRef(const std::shared_ptr& ref) { if (!_ptr2ref.count((uintptr_t)ref.get())) _ptr2ref[(uintptr_t)ref.get()] = ref; } + template ::BuilderType> + enable_if_fixed_width_type static BuilderAppend( + Builder& builder, const std::shared_ptr& source, row_index_t row) { + if (source->IsNull(row)) { + builder.UnsafeAppendNull(); + return Status::OK(); + } + + if constexpr (is_boolean_type::value) { + 
builder.UnsafeAppend(bit_util::GetBit(source->template GetValues(1), row)); + } else { + using CType = typename TypeTraits::CType; + builder.UnsafeAppend(source->template GetValues(1)[row]); + } + return Status::OK(); + } + + template ::BuilderType> + enable_if_base_binary static BuilderAppend( + Builder& builder, const std::shared_ptr& source, row_index_t row) { + if (source->IsNull(row)) { + return builder.AppendNull(); + } + using offset_type = typename Type::offset_type; + const uint8_t* data = source->buffers[2]->data(); + const offset_type* offsets = source->GetValues(1); + const offset_type offset0 = offsets[row]; + const offset_type offset1 = offsets[row + 1]; + return builder.Append(data + offset0, offset1 - offset0); + } - template - Result> MaterializePrimitiveColumn(MemoryPool* memory_pool, - size_t i_table, - col_index_t i_col) { - Builder builder(memory_pool); + template ::BuilderType> + Result> MaterializeColumn(MemoryPool* memory_pool, + const std::shared_ptr& type, + size_t i_table, col_index_t i_col) { + ARROW_ASSIGN_OR_RAISE(auto a_builder, MakeBuilder(type, memory_pool)); + Builder& builder = *checked_cast(a_builder.get()); ARROW_RETURN_NOT_OK(builder.Reserve(rows_.size())); for (row_index_t i_row = 0; i_row < rows_.size(); ++i_row) { const auto& ref = rows_[i_row].refs[i_table]; if (ref.batch) { - builder.UnsafeAppend( - ref.batch->column_data(i_col)->template GetValues(1)[ref.row]); + Status st = + BuilderAppend(builder, ref.batch->column_data(i_col), ref.row); + ARROW_RETURN_NOT_OK(st); } else { builder.UnsafeAppendNull(); } @@ -480,14 +870,21 @@ class CompositeReferenceTable { } }; +// TODO: Currently, AsofJoinNode uses 64-bit hashing which leads to a non-negligible +// probability of collision, which can cause incorrect results when many different by-key +// values are processed. Thus, AsofJoinNode is currently limited to about 100k by-keys for +// guaranteeing this probability is below 1 in a billion. The fix is 128-bit hashing. 
+// See ARROW-17653 class AsofJoinNode : public ExecNode { // Advances the RHS as far as possible to be up to date for the current LHS timestamp - bool UpdateRhs() { + Result UpdateRhs() { auto& lhs = *state_.at(0); auto lhs_latest_time = lhs.GetLatestTime(); bool any_updated = false; - for (size_t i = 1; i < state_.size(); ++i) - any_updated |= state_[i]->AdvanceAndMemoize(lhs_latest_time); + for (size_t i = 1; i < state_.size(); ++i) { + ARROW_ASSIGN_OR_RAISE(bool advanced, state_[i]->AdvanceAndMemoize(lhs_latest_time)); + any_updated |= advanced; + } return any_updated; } @@ -495,14 +892,14 @@ class AsofJoinNode : public ExecNode { bool IsUpToDateWithLhsRow() const { auto& lhs = *state_[0]; if (lhs.Empty()) return false; // can't proceed if nothing on the LHS - int64_t lhs_ts = lhs.GetLatestTime(); + OnType lhs_ts = lhs.GetLatestTime(); for (size_t i = 1; i < state_.size(); ++i) { auto& rhs = *state_[i]; if (!rhs.Finished()) { // If RHS is finished, then we know it's up to date - if (rhs.Empty()) + if (rhs.CurrentEmpty()) return false; // RHS isn't finished, but is empty --> not up to date - if (lhs_ts >= rhs.GetLatestTime()) + if (lhs_ts >= rhs.GetCurrentTime()) return false; // RHS isn't up to date (and not finished) } } @@ -523,7 +920,7 @@ class AsofJoinNode : public ExecNode { if (lhs.Finished() || lhs.Empty()) break; // Advance each of the RHS as far as possible to be up to date for the LHS timestamp - bool any_rhs_advanced = UpdateRhs(); + ARROW_ASSIGN_OR_RAISE(bool any_rhs_advanced, UpdateRhs()); // If we have received enough inputs to produce the next output batch // (decided by IsUpToDateWithLhsRow), we will perform the join and @@ -531,8 +928,9 @@ class AsofJoinNode : public ExecNode { // the LHS and adding joined row to rows_ (done by Emplace). Finally, // input batches that are no longer needed are removed to free up memory. if (IsUpToDateWithLhsRow()) { - dst.Emplace(state_, options_.tolerance); - if (!lhs.Advance()) break; // if we can't advance LHS, we're done for this batch + dst.Emplace(state_, tolerance_); + ARROW_ASSIGN_OR_RAISE(bool advanced, lhs.Advance()); + if (!advanced) break; // if we can't advance LHS, we're done for this batch } else { if (!any_rhs_advanced) break; // need to wait for new data } @@ -541,8 +939,10 @@ class AsofJoinNode : public ExecNode { // Prune memo entries that have expired (to bound memory consumption) if (!lhs.Empty()) { for (size_t i = 1; i < state_.size(); ++i) { - state_[i]->RemoveMemoEntriesWithLesserTime(lhs.GetLatestTime() - - options_.tolerance); + OnType ts = tolerance_.Expiry(lhs.GetLatestTime()); + if (ts != TolType::kMinValue) { + state_[i]->RemoveMemoEntriesWithLesserTime(ts); + } } } @@ -550,15 +950,41 @@ class AsofJoinNode : public ExecNode { if (dst.empty()) { return NULLPTR; } else { - return dst.Materialize(plan()->exec_context()->memory_pool(), output_schema(), + return dst.Materialize(plan()->query_context()->memory_pool(), output_schema(), state_); } } - void Process() { + template + struct Defer { + Callable callable; + explicit Defer(Callable callable) : callable(std::move(callable)) {} + ~Defer() noexcept { callable(); } + }; + + void EndFromProcessThread() { + // We must spawn a new task to transfer off the process thread when + // marking this finished. Otherwise there is a chance that doing so could + // mark the plan finished which may destroy the plan which will destroy this + // node which will cause us to join on ourselves. 
+ ErrorIfNotOk(plan_->query_context()->executor()->Spawn([this] { + Defer cleanup([this]() { finished_.MarkFinished(); }); + outputs_[0]->InputFinished(this, batches_produced_); + })); + } + + bool CheckEnded() { + if (state_.at(0)->Finished()) { + EndFromProcessThread(); + return false; + } + return true; + } + + bool Process() { std::lock_guard guard(gate_); - if (finished_.is_finished()) { - return; + if (!CheckEnded()) { + return false; } // Process batches while we have data @@ -572,9 +998,9 @@ class AsofJoinNode : public ExecNode { ExecBatch out_b(*out_rb); outputs_[0]->InputReceived(this, std::move(out_b)); } else { - StopProducing(); ErrorIfNotOk(result.status()); - return; + EndFromProcessThread(); + return false; } } @@ -583,18 +1009,24 @@ class AsofJoinNode : public ExecNode { // // It may happen here in cases where InputFinished was called before we were finished // producing results (so we didn't know the output size at that time) - if (state_.at(0)->Finished()) { - StopProducing(); - outputs_[0]->InputFinished(this, batches_produced_); + if (!CheckEnded()) { + return false; } + + // There is no more we can do now but there is still work remaining for later when + // more data arrives. + return true; } void ProcessThread() { for (;;) { if (!process_.Pop()) { + EndFromProcessThread(); + return; + } + if (!Process()) { return; } - Process(); } } @@ -602,54 +1034,173 @@ class AsofJoinNode : public ExecNode { public: AsofJoinNode(ExecPlan* plan, NodeVector inputs, std::vector input_labels, - const AsofJoinNodeOptions& join_options, - std::shared_ptr output_schema); + const std::vector& indices_of_on_key, + const std::vector>& indices_of_by_key, + TolType tolerance, std::shared_ptr output_schema, + std::vector> key_hashers, bool must_hash, + bool may_rehash); + + Status Init() override { + auto inputs = this->inputs(); + for (size_t i = 0; i < inputs.size(); i++) { + RETURN_NOT_OK(key_hashers_[i]->Init(plan()->query_context()->exec_context(), + output_schema())); + state_.push_back(std::make_unique( + i, tolerance_, must_hash_, may_rehash_, key_hashers_[i].get(), + inputs[i]->output_schema(), indices_of_on_key_[i], indices_of_by_key_[i])); + } + + col_index_t dst_offset = 0; + for (auto& state : state_) + dst_offset = state->InitSrcToDstMapping(dst_offset, !!dst_offset); + + return Status::OK(); + } virtual ~AsofJoinNode() { process_.Push(false); // poison pill process_thread_.join(); } + const std::vector& indices_of_on_key() { return indices_of_on_key_; } + const std::vector>& indices_of_by_key() { + return indices_of_by_key_; + } + + static Status is_valid_on_field(const std::shared_ptr& field) { + switch (field->type()->id()) { + case Type::INT8: + case Type::INT16: + case Type::INT32: + case Type::INT64: + case Type::UINT8: + case Type::UINT16: + case Type::UINT32: + case Type::UINT64: + case Type::DATE32: + case Type::DATE64: + case Type::TIME32: + case Type::TIME64: + case Type::TIMESTAMP: + return Status::OK(); + default: + return Status::Invalid("Unsupported type for on-key ", field->name(), " : ", + field->type()->ToString()); + } + } + + static Status is_valid_by_field(const std::shared_ptr& field) { + switch (field->type()->id()) { + case Type::INT8: + case Type::INT16: + case Type::INT32: + case Type::INT64: + case Type::UINT8: + case Type::UINT16: + case Type::UINT32: + case Type::UINT64: + case Type::DATE32: + case Type::DATE64: + case Type::TIME32: + case Type::TIME64: + case Type::TIMESTAMP: + case Type::STRING: + case Type::LARGE_STRING: + case Type::BINARY: + 
case Type::LARGE_BINARY: + return Status::OK(); + default: + return Status::Invalid("Unsupported type for by-key ", field->name(), " : ", + field->type()->ToString()); + } + } + + static Status is_valid_data_field(const std::shared_ptr& field) { + switch (field->type()->id()) { + case Type::BOOL: + case Type::INT8: + case Type::INT16: + case Type::INT32: + case Type::INT64: + case Type::UINT8: + case Type::UINT16: + case Type::UINT32: + case Type::UINT64: + case Type::FLOAT: + case Type::DOUBLE: + case Type::DATE32: + case Type::DATE64: + case Type::TIME32: + case Type::TIME64: + case Type::TIMESTAMP: + case Type::STRING: + case Type::LARGE_STRING: + case Type::BINARY: + case Type::LARGE_BINARY: + return Status::OK(); + default: + return Status::Invalid("Unsupported type for data field ", field->name(), " : ", + field->type()->ToString()); + } + } + static arrow::Result> MakeOutputSchema( - const std::vector& inputs, const AsofJoinNodeOptions& options) { + const std::vector> input_schema, + const std::vector& indices_of_on_key, + const std::vector>& indices_of_by_key) { std::vector> fields; - const auto& on_field_name = *options.on_key.name(); - const auto& by_field_name = *options.by_key.name(); - + size_t n_by = indices_of_by_key.size() == 0 ? 0 : indices_of_by_key[0].size(); + const DataType* on_key_type = NULLPTR; + std::vector by_key_type(n_by, NULLPTR); // Take all non-key, non-time RHS fields - for (size_t j = 0; j < inputs.size(); ++j) { - const auto& input_schema = inputs[j]->output_schema(); - const auto& on_field_ix = input_schema->GetFieldIndex(on_field_name); - const auto& by_field_ix = input_schema->GetFieldIndex(by_field_name); + for (size_t j = 0; j < input_schema.size(); ++j) { + const auto& on_field_ix = indices_of_on_key[j]; + const auto& by_field_ix = indices_of_by_key[j]; - if ((on_field_ix == -1) | (by_field_ix == -1)) { + if ((on_field_ix == -1) || std_has(by_field_ix, -1)) { return Status::Invalid("Missing join key on table ", j); } - for (int i = 0; i < input_schema->num_fields(); ++i) { - const auto field = input_schema->field(i); - if (field->name() == on_field_name) { - if (kSupportedOnTypes_.find(field->type()) == kSupportedOnTypes_.end()) { - return Status::Invalid("Unsupported type for on key: ", field->name()); - } + const auto& on_field = input_schema[j]->fields()[on_field_ix]; + std::vector by_field(n_by); + for (size_t k = 0; k < n_by; k++) { + by_field[k] = input_schema[j]->fields()[by_field_ix[k]].get(); + } + + if (on_key_type == NULLPTR) { + on_key_type = on_field->type().get(); + } else if (*on_key_type != *on_field->type()) { + return Status::Invalid("Expected on-key type ", *on_key_type, " but got ", + *on_field->type(), " for field ", on_field->name(), + " in input ", j); + } + for (size_t k = 0; k < n_by; k++) { + if (by_key_type[k] == NULLPTR) { + by_key_type[k] = by_field[k]->type().get(); + } else if (*by_key_type[k] != *by_field[k]->type()) { + return Status::Invalid("Expected on-key type ", *by_key_type[k], " but got ", + *by_field[k]->type(), " for field ", by_field[k]->name(), + " in input ", j); + } + } + + for (int i = 0; i < input_schema[j]->num_fields(); ++i) { + const auto field = input_schema[j]->field(i); + if (i == on_field_ix) { + ARROW_RETURN_NOT_OK(is_valid_on_field(field)); // Only add on field from the left table if (j == 0) { fields.push_back(field); } - } else if (field->name() == by_field_name) { - if (kSupportedByTypes_.find(field->type()) == kSupportedByTypes_.end()) { - return Status::Invalid("Unsupported type for by 
key: ", field->name()); - } + } else if (std_has(by_field_ix, i)) { + ARROW_RETURN_NOT_OK(is_valid_by_field(field)); // Only add by field from the left table if (j == 0) { fields.push_back(field); } } else { - if (kSupportedDataTypes_.find(field->type()) == kSupportedDataTypes_.end()) { - return Status::Invalid("Unsupported data type: ", field->name()); - } - + ARROW_RETURN_NOT_OK(is_valid_data_field(field)); fields.push_back(field); } } @@ -657,45 +1208,131 @@ class AsofJoinNode : public ExecNode { return std::make_shared(fields); } + static inline Result FindColIndex(const Schema& schema, + const FieldRef& field_ref, + std::string_view key_kind) { + auto match_res = field_ref.FindOne(schema); + if (!match_res.ok()) { + return Status::Invalid("Bad join key on table : ", match_res.status().message()); + } + ARROW_ASSIGN_OR_RAISE(auto match, match_res); + if (match.indices().size() != 1) { + return Status::Invalid("AsOfJoinNode does not support a nested ", key_kind, "-key ", + field_ref.ToString()); + } + return match.indices()[0]; + } + + static Result GetByKeySize( + const std::vector& input_keys) { + size_t n_by = 0; + for (size_t i = 0; i < input_keys.size(); ++i) { + const auto& by_key = input_keys[i].by_key; + if (i == 0) { + n_by = by_key.size(); + } else if (n_by != by_key.size()) { + return Status::Invalid("inconsistent size of by-key across inputs"); + } + } + return n_by; + } + + static Result> GetIndicesOfOnKey( + const std::vector>& input_schema, + const std::vector& input_keys) { + if (input_schema.size() != input_keys.size()) { + return Status::Invalid("mismatching number of input schema and keys"); + } + size_t n_input = input_schema.size(); + std::vector indices_of_on_key(n_input); + for (size_t i = 0; i < n_input; ++i) { + const auto& on_key = input_keys[i].on_key; + ARROW_ASSIGN_OR_RAISE(indices_of_on_key[i], + FindColIndex(*input_schema[i], on_key, "on")); + } + return indices_of_on_key; + } + + static Result>> GetIndicesOfByKey( + const std::vector>& input_schema, + const std::vector& input_keys) { + if (input_schema.size() != input_keys.size()) { + return Status::Invalid("mismatching number of input schema and keys"); + } + ARROW_ASSIGN_OR_RAISE(size_t n_by, GetByKeySize(input_keys)); + size_t n_input = input_schema.size(); + std::vector> indices_of_by_key( + n_input, std::vector(n_by)); + for (size_t i = 0; i < n_input; ++i) { + for (size_t k = 0; k < n_by; k++) { + const auto& by_key = input_keys[i].by_key; + ARROW_ASSIGN_OR_RAISE(indices_of_by_key[i][k], + FindColIndex(*input_schema[i], by_key[k], "by")); + } + } + return indices_of_by_key; + } + static arrow::Result Make(ExecPlan* plan, std::vector inputs, const ExecNodeOptions& options) { DCHECK_GE(inputs.size(), 2) << "Must have at least two inputs"; - const auto& join_options = checked_cast(options); - ARROW_ASSIGN_OR_RAISE(std::shared_ptr output_schema, - MakeOutputSchema(inputs, join_options)); - - std::vector input_labels(inputs.size()); - input_labels[0] = "left"; - for (size_t i = 1; i < inputs.size(); ++i) { - input_labels[i] = "right_" + std::to_string(i); + ARROW_ASSIGN_OR_RAISE(size_t n_by, GetByKeySize(join_options.input_keys)); + size_t n_input = inputs.size(); + std::vector input_labels(n_input); + std::vector> input_schema(n_input); + for (size_t i = 0; i < n_input; ++i) { + input_labels[i] = i == 0 ? 
"left" : "right_" + ToChars(i); + input_schema[i] = inputs[i]->output_schema(); } - - return plan->EmplaceNode(plan, inputs, std::move(input_labels), - join_options, std::move(output_schema)); + ARROW_ASSIGN_OR_RAISE(std::vector indices_of_on_key, + GetIndicesOfOnKey(input_schema, join_options.input_keys)); + ARROW_ASSIGN_OR_RAISE(std::vector> indices_of_by_key, + GetIndicesOfByKey(input_schema, join_options.input_keys)); + ARROW_ASSIGN_OR_RAISE( + std::shared_ptr output_schema, + MakeOutputSchema(input_schema, indices_of_on_key, indices_of_by_key)); + + std::vector> key_hashers; + for (size_t i = 0; i < n_input; i++) { + key_hashers.push_back(std::make_unique(indices_of_by_key[i])); + } + bool must_hash = + n_by > 1 || + (n_by == 1 && + !is_primitive( + inputs[0]->output_schema()->field(indices_of_by_key[0][0])->type()->id())); + bool may_rehash = n_by == 1 && !must_hash; + return plan->EmplaceNode( + plan, inputs, std::move(input_labels), std::move(indices_of_on_key), + std::move(indices_of_by_key), TolType(join_options.tolerance), + std::move(output_schema), std::move(key_hashers), must_hash, may_rehash); } const char* kind_name() const override { return "AsofJoinNode"; } void InputReceived(ExecNode* input, ExecBatch batch) override { // Get the input - ARROW_DCHECK(std::find(inputs_.begin(), inputs_.end(), input) != inputs_.end()); - size_t k = std::find(inputs_.begin(), inputs_.end(), input) - inputs_.begin(); + ARROW_DCHECK(std_has(inputs_, input)); + size_t k = std_find(inputs_, input) - inputs_.begin(); // Put into the queue auto rb = *batch.ToRecordBatch(input->output_schema()); - state_.at(k)->Push(rb); + Status st = state_.at(k)->Push(rb); + if (!st.ok()) { + ErrorReceived(input, st); + return; + } process_.Push(true); } void ErrorReceived(ExecNode* input, Status error) override { outputs_[0]->ErrorReceived(this, std::move(error)); - StopProducing(); } void InputFinished(ExecNode* input, int total_batches) override { { std::lock_guard guard(gate_); - ARROW_DCHECK(std::find(inputs_.begin(), inputs_.end(), input) != inputs_.end()); - size_t k = std::find(inputs_.begin(), inputs_.end(), input) - inputs_.begin(); + ARROW_DCHECK(std_has(inputs_, input)); + size_t k = std_find(inputs_, input) - inputs_.begin(); state_.at(k)->set_total_batches(total_batches); } // Trigger a process call @@ -704,30 +1341,30 @@ class AsofJoinNode : public ExecNode { // finished. 
process_.Push(true); } - Status StartProducing() override { - finished_ = arrow::Future<>::Make(); - return Status::OK(); - } + Status StartProducing() override { return Status::OK(); } void PauseProducing(ExecNode* output, int32_t counter) override {} void ResumeProducing(ExecNode* output, int32_t counter) override {} void StopProducing(ExecNode* output) override { DCHECK_EQ(output, outputs_[0]); StopProducing(); } - void StopProducing() override { finished_.MarkFinished(); } + void StopProducing() override { + process_.Clear(); + process_.Push(false); + } arrow::Future<> finished() override { return finished_; } private: - static const std::set> kSupportedOnTypes_; - static const std::set> kSupportedByTypes_; - static const std::set> kSupportedDataTypes_; - - arrow::Future<> finished_; + std::vector indices_of_on_key_; + std::vector> indices_of_by_key_; + std::vector> key_hashers_; + bool must_hash_; + bool may_rehash_; // InputStates // Each input state correponds to an input table std::vector> state_; std::mutex gate_; - AsofJoinNodeOptions options_; + TolType tolerance_; // Queue for triggering processing of a given input // (a false value is a poison pill) @@ -741,29 +1378,22 @@ class AsofJoinNode : public ExecNode { AsofJoinNode::AsofJoinNode(ExecPlan* plan, NodeVector inputs, std::vector input_labels, - const AsofJoinNodeOptions& join_options, - std::shared_ptr output_schema) + const std::vector& indices_of_on_key, + const std::vector>& indices_of_by_key, + TolType tolerance, std::shared_ptr output_schema, + std::vector> key_hashers, + bool must_hash, bool may_rehash) : ExecNode(plan, inputs, input_labels, /*output_schema=*/std::move(output_schema), /*num_outputs=*/1), - options_(join_options), + indices_of_on_key_(std::move(indices_of_on_key)), + indices_of_by_key_(std::move(indices_of_by_key)), + key_hashers_(std::move(key_hashers)), + must_hash_(must_hash), + may_rehash_(may_rehash), + tolerance_(tolerance), process_(), - process_thread_(&AsofJoinNode::ProcessThreadWrapper, this) { - for (size_t i = 0; i < inputs.size(); ++i) - state_.push_back(::arrow::internal::make_unique( - inputs[i]->output_schema(), *options_.on_key.name(), *options_.by_key.name())); - col_index_t dst_offset = 0; - for (auto& state : state_) - dst_offset = state->InitSrcToDstMapping(dst_offset, !!dst_offset); - - finished_ = arrow::Future<>::MakeFinished(); -} - -// Currently supported types -const std::set> AsofJoinNode::kSupportedOnTypes_ = {int64()}; -const std::set> AsofJoinNode::kSupportedByTypes_ = {int32()}; -const std::set> AsofJoinNode::kSupportedDataTypes_ = { - int32(), int64(), float32(), float64()}; + process_thread_(&AsofJoinNode::ProcessThreadWrapper, this) {} namespace internal { void RegisterAsofJoinNode(ExecFactoryRegistry* registry) { @@ -771,5 +1401,20 @@ void RegisterAsofJoinNode(ExecFactoryRegistry* registry) { } } // namespace internal +namespace asofjoin { + +Result> MakeOutputSchema( + const std::vector>& input_schema, + const std::vector& input_keys) { + ARROW_ASSIGN_OR_RAISE(std::vector indices_of_on_key, + AsofJoinNode::GetIndicesOfOnKey(input_schema, input_keys)); + ARROW_ASSIGN_OR_RAISE(std::vector> indices_of_by_key, + AsofJoinNode::GetIndicesOfByKey(input_schema, input_keys)); + return AsofJoinNode::MakeOutputSchema(input_schema, indices_of_on_key, + indices_of_by_key); +} + +} // namespace asofjoin + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/asof_join_node.h b/cpp/src/arrow/compute/exec/asof_join_node.h new file mode 100644 index 
00000000000..27777090d3d --- /dev/null +++ b/cpp/src/arrow/compute/exec/asof_join_node.h @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/compute/exec.h" +#include "arrow/compute/exec/options.h" +#include "arrow/type.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { +namespace asofjoin { + +using AsofJoinKeys = AsofJoinNodeOptions::Keys; + +ARROW_EXPORT Result> MakeOutputSchema( + const std::vector>& input_schema, + const std::vector& input_keys); + +} // namespace asofjoin +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/asof_join_node_test.cc b/cpp/src/arrow/compute/exec/asof_join_node_test.cc index 8b993764abe..6968aa03c9d 100644 --- a/cpp/src/arrow/compute/exec/asof_join_node_test.cc +++ b/cpp/src/arrow/compute/exec/asof_join_node_test.cc @@ -17,11 +17,15 @@ #include +#include +#include #include #include +#include #include #include "arrow/api.h" +#include "arrow/compute/api_scalar.h" #include "arrow/compute/exec/options.h" #include "arrow/compute/exec/test_util.h" #include "arrow/compute/exec/util.h" @@ -31,24 +35,203 @@ #include "arrow/testing/matchers.h" #include "arrow/testing/random.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/make_unique.h" #include "arrow/util/thread_pool.h" +#define TRACED_TEST(t_class, t_name, t_body) \ + TEST(t_class, t_name) { \ + ARROW_SCOPED_TRACE(#t_class "_" #t_name); \ + t_body; \ + } + +#define TRACED_TEST_P(t_class, t_name, t_body) \ + TEST_P(t_class, t_name) { \ + ARROW_SCOPED_TRACE(#t_class "_" #t_name "_" + std::get<1>(GetParam())); \ + t_body; \ + } + using testing::UnorderedElementsAreArray; namespace arrow { namespace compute { +bool is_temporal_primitive(Type::type type_id) { + switch (type_id) { + case Type::TIME32: + case Type::TIME64: + case Type::DATE32: + case Type::DATE64: + case Type::TIMESTAMP: + return true; + default: + return false; + } +} + +Result MakeBatchesFromNumString( + const std::shared_ptr& schema, + const std::vector& json_strings, int multiplicity = 1) { + FieldVector num_fields; + for (auto field : schema->fields()) { + auto id = field->type()->id(); + bool adjust = id == Type::BOOL || is_base_binary_like(id); + num_fields.push_back(adjust ? 
field->WithType(int64()) : field); + } + auto num_schema = + std::make_shared(num_fields, schema->endianness(), schema->metadata()); + BatchesWithSchema num_batches = + MakeBatchesFromString(num_schema, json_strings, multiplicity); + BatchesWithSchema batches; + batches.schema = schema; + int n_fields = schema->num_fields(); + for (auto num_batch : num_batches.batches) { + Datum two(Int32Scalar(2)); + std::vector values; + for (int i = 0; i < n_fields; i++) { + auto type = schema->field(i)->type(); + if (is_base_binary_like(type->id())) { + // casting to string first enables casting to binary + ARROW_ASSIGN_OR_RAISE(Datum as_string, Cast(num_batch.values[i], utf8())); + ARROW_ASSIGN_OR_RAISE(Datum as_type, Cast(as_string, type)); + values.push_back(as_type); + } else if (Type::BOOL == type->id()) { + // the next 4 lines compute `as_bool` as `(bool)(x - 2*(x/2))`, i.e., the low bit + // of `x`. Here, `x` stands for `num_batch.values[i]`, which is an `int64` value. + // Taking the low bit is a somewhat arbitrary way of obtaining both `true` and + // `false` values from the `int64` values in the test data, in order to get good + // testing coverage. A simple cast to a Boolean value would not get good coverage + // because all positive values would be cast to `true`. + ARROW_ASSIGN_OR_RAISE(Datum div_two, Divide(num_batch.values[i], two)); + ARROW_ASSIGN_OR_RAISE(Datum rounded, Multiply(div_two, two)); + ARROW_ASSIGN_OR_RAISE(Datum low_bit, Subtract(num_batch.values[i], rounded)); + ARROW_ASSIGN_OR_RAISE(Datum as_bool, Cast(low_bit, type)); + values.push_back(as_bool); + } else { + values.push_back(num_batch.values[i]); + } + } + ExecBatch batch(values, num_batch.length); + batches.batches.push_back(batch); + } + return batches; +} + +void BuildNullArray(std::shared_ptr& empty, const std::shared_ptr& type, + int64_t length) { + ASSERT_OK_AND_ASSIGN(auto builder, MakeBuilder(type, default_memory_pool())); + ASSERT_OK(builder->Reserve(length)); + ASSERT_OK(builder->AppendNulls(length)); + ASSERT_OK(builder->Finish(&empty)); +} + +void BuildZeroPrimitiveArray(std::shared_ptr& empty, + const std::shared_ptr& type, int64_t length) { + ASSERT_OK_AND_ASSIGN(auto builder, MakeBuilder(type, default_memory_pool())); + ASSERT_OK(builder->Reserve(length)); + ASSERT_OK_AND_ASSIGN(auto scalar, MakeScalar(type, 0)); + ASSERT_OK(builder->AppendScalar(*scalar, length)); + ASSERT_OK(builder->Finish(&empty)); +} + +template +void BuildZeroBaseBinaryArray(std::shared_ptr& empty, int64_t length) { + Builder builder(default_memory_pool()); + ASSERT_OK(builder.Reserve(length)); + for (int64_t i = 0; i < length; i++) { + ASSERT_OK(builder.Append("0", /*length=*/1)); + } + ASSERT_OK(builder.Finish(&empty)); +} + +AsofJoinNodeOptions GetRepeatedOptions(size_t repeat, FieldRef on_key, + std::vector by_key, int64_t tolerance) { + std::vector input_keys(repeat); + for (size_t i = 0; i < repeat; i++) { + input_keys[i] = {on_key, by_key}; + } + return AsofJoinNodeOptions(input_keys, tolerance); +} + +// mutates by copying from_key into to_key and changing from_key to zero +Result MutateByKey(BatchesWithSchema& batches, std::string from_key, + std::string to_key, bool replace_key = false, + bool null_key = false, bool remove_key = false) { + int from_index = batches.schema->GetFieldIndex(from_key); + int n_fields = batches.schema->num_fields(); + auto fields = batches.schema->fields(); + BatchesWithSchema new_batches; + if (remove_key) { + ARROW_ASSIGN_OR_RAISE(new_batches.schema, batches.schema->RemoveField(from_index)); + 
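+    // removing the key column altogether is used to exercise the empty-by-key case
+    // (see RunMutateEmptyKey below, which joins with an empty by_key list)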
} else { + auto new_field = batches.schema->field(from_index)->WithName(to_key); + ARROW_ASSIGN_OR_RAISE(new_batches.schema, + replace_key ? batches.schema->SetField(from_index, new_field) + : batches.schema->AddField(from_index, new_field)); + } + for (const ExecBatch& batch : batches.batches) { + std::vector new_values; + for (int i = 0; i < n_fields; i++) { + const Datum& value = batch.values[i]; + if (i == from_index) { + if (remove_key) { + continue; + } + auto type = fields[i]->type(); + if (null_key) { + std::shared_ptr empty; + BuildNullArray(empty, type, batch.length); + new_values.push_back(empty); + } else if (is_primitive(type->id())) { + std::shared_ptr empty; + BuildZeroPrimitiveArray(empty, type, batch.length); + new_values.push_back(empty); + } else if (is_base_binary_like(type->id())) { + std::shared_ptr empty; + switch (type->id()) { + case Type::STRING: + BuildZeroBaseBinaryArray(empty, batch.length); + break; + case Type::LARGE_STRING: + BuildZeroBaseBinaryArray(empty, batch.length); + break; + case Type::BINARY: + BuildZeroBaseBinaryArray(empty, batch.length); + break; + case Type::LARGE_BINARY: + BuildZeroBaseBinaryArray(empty, batch.length); + break; + default: + DCHECK(false); + break; + } + new_values.push_back(empty); + } else { + ARROW_ASSIGN_OR_RAISE(auto sub, Subtract(value, value)); + new_values.push_back(sub); + } + if (replace_key) { + continue; + } + } + new_values.push_back(value); + } + new_batches.batches.emplace_back(new_values, batch.length); + } + return new_batches; +} + +// code generation for the by_key types supported by AsofJoinNodeOptions constructors +// which cannot be directly done using templates because of failure to deduce the template +// argument for an invocation with a string- or initializer_list-typed keys-argument +#define EXPAND_BY_KEY_TYPE(macro) \ + macro(const FieldRef); \ + macro(std::vector); \ + macro(std::initializer_list); + void CheckRunOutput(const BatchesWithSchema& l_batches, const BatchesWithSchema& r0_batches, const BatchesWithSchema& r1_batches, - const BatchesWithSchema& exp_batches, const FieldRef time, - const FieldRef keys, const int64_t tolerance) { - auto exec_ctx = - arrow::internal::make_unique(default_memory_pool(), nullptr); - ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(exec_ctx.get())); - - AsofJoinNodeOptions join_options(time, keys, tolerance); + const BatchesWithSchema& exp_batches, + const AsofJoinNodeOptions join_options) { Declaration join{"asofjoin", join_options}; join.inputs.emplace_back(Declaration{ @@ -58,253 +241,1027 @@ void CheckRunOutput(const BatchesWithSchema& l_batches, join.inputs.emplace_back(Declaration{ "source", SourceNodeOptions{r1_batches.schema, r1_batches.gen(false, false)}}); - AsyncGenerator> sink_gen; - - ASSERT_OK(Declaration::Sequence({join, {"sink", SinkNodeOptions{&sink_gen}}}) - .AddToPlan(plan.get())); - - ASSERT_FINISHES_OK_AND_ASSIGN(auto res, StartAndCollect(plan.get(), sink_gen)); + ASSERT_OK_AND_ASSIGN(auto res_table, + DeclarationToTable(std::move(join), /*use_threads=*/false)); ASSERT_OK_AND_ASSIGN(auto exp_table, TableFromExecBatches(exp_batches.schema, exp_batches.batches)); - ASSERT_OK_AND_ASSIGN(auto res_table, TableFromExecBatches(exp_batches.schema, res)); - AssertTablesEqual(*exp_table, *res_table, /*same_chunk_layout=*/true, /*flatten=*/true); } -void DoRunBasicTest(const std::vector& l_data, - const std::vector& r0_data, - const std::vector& r1_data, - const std::vector& exp_data, int64_t tolerance) { - auto l_schema = - schema({field("time", int64()), 
field("key", int32()), field("l_v0", float64())}); - auto r0_schema = - schema({field("time", int64()), field("key", int32()), field("r0_v0", float64())}); - auto r1_schema = - schema({field("time", int64()), field("key", int32()), field("r1_v0", float32())}); - - auto exp_schema = schema({ - field("time", int64()), - field("key", int32()), - field("l_v0", float64()), - field("r0_v0", float64()), - field("r1_v0", float32()), - }); - - // Test three table join - BatchesWithSchema l_batches, r0_batches, r1_batches, exp_batches; - l_batches = MakeBatchesFromString(l_schema, l_data); - r0_batches = MakeBatchesFromString(r0_schema, r0_data); - r1_batches = MakeBatchesFromString(r1_schema, r1_data); - exp_batches = MakeBatchesFromString(exp_schema, exp_data); - CheckRunOutput(l_batches, r0_batches, r1_batches, exp_batches, "time", "key", - tolerance); -} +#define CHECK_RUN_OUTPUT(by_key_type) \ + void CheckRunOutput( \ + const BatchesWithSchema& l_batches, const BatchesWithSchema& r0_batches, \ + const BatchesWithSchema& r1_batches, const BatchesWithSchema& exp_batches, \ + const FieldRef time, by_key_type key, const int64_t tolerance) { \ + CheckRunOutput(l_batches, r0_batches, r1_batches, exp_batches, \ + GetRepeatedOptions(3, time, {key}, tolerance)); \ + } -void DoRunInvalidTypeTest(const std::shared_ptr& l_schema, - const std::shared_ptr& r_schema) { - BatchesWithSchema l_batches = MakeBatchesFromString(l_schema, {R"([])"}); - BatchesWithSchema r_batches = MakeBatchesFromString(r_schema, {R"([])"}); +EXPAND_BY_KEY_TYPE(CHECK_RUN_OUTPUT) - ExecContext exec_ctx; - ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(&exec_ctx)); +void DoInvalidPlanTest(const BatchesWithSchema& l_batches, + const BatchesWithSchema& r_batches, + const AsofJoinNodeOptions& join_options, + const std::string& expected_error_str, + bool fail_on_plan_creation = false) { + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(*threaded_exec_context())); - AsofJoinNodeOptions join_options("time", "key", 0); Declaration join{"asofjoin", join_options}; join.inputs.emplace_back(Declaration{ "source", SourceNodeOptions{l_batches.schema, l_batches.gen(false, false)}}); join.inputs.emplace_back(Declaration{ "source", SourceNodeOptions{r_batches.schema, r_batches.gen(false, false)}}); - ASSERT_RAISES(Invalid, join.AddToPlan(plan.get())); + if (fail_on_plan_creation) { + AsyncGenerator> sink_gen; + ASSERT_OK(Declaration::Sequence({join, {"sink", SinkNodeOptions{&sink_gen}}}) + .AddToPlan(plan.get())); + EXPECT_FINISHES_AND_RAISES_WITH_MESSAGE_THAT(Invalid, + ::testing::HasSubstr(expected_error_str), + StartAndCollect(plan.get(), sink_gen)); + } else { + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr(expected_error_str), + join.AddToPlan(plan.get())); + } +} + +void DoRunInvalidPlanTest(const BatchesWithSchema& l_batches, + const BatchesWithSchema& r_batches, + const AsofJoinNodeOptions& join_options, + const std::string& expected_error_str) { + DoInvalidPlanTest(l_batches, r_batches, join_options, expected_error_str); +} + +void DoRunInvalidPlanTest(const std::shared_ptr& l_schema, + const std::shared_ptr& r_schema, + const AsofJoinNodeOptions& join_options, + const std::string& expected_error_str) { + ASSERT_OK_AND_ASSIGN(auto l_batches, MakeBatchesFromNumString(l_schema, {R"([])"})); + ASSERT_OK_AND_ASSIGN(auto r_batches, MakeBatchesFromNumString(r_schema, {R"([])"})); + + return DoRunInvalidPlanTest(l_batches, r_batches, join_options, expected_error_str); +} + +void DoRunInvalidPlanTest(const std::shared_ptr& l_schema, + 
const std::shared_ptr& r_schema, int64_t tolerance, + const std::string& expected_error_str) { + DoRunInvalidPlanTest(l_schema, r_schema, + GetRepeatedOptions(2, "time", {"key"}, tolerance), + expected_error_str); +} + +void DoRunInvalidTypeTest(const std::shared_ptr& l_schema, + const std::shared_ptr& r_schema) { + DoRunInvalidPlanTest(l_schema, r_schema, 0, "Unsupported type for "); +} + +void DoRunMissingKeysTest(const std::shared_ptr& l_schema, + const std::shared_ptr& r_schema) { + DoRunInvalidPlanTest(l_schema, r_schema, 0, "Bad join key on table : No match"); +} + +void DoRunMissingOnKeyTest(const std::shared_ptr& l_schema, + const std::shared_ptr& r_schema) { + DoRunInvalidPlanTest(l_schema, r_schema, + GetRepeatedOptions(2, "invalid_time", {"key"}, 0), + "Bad join key on table : No match"); +} + +void DoRunMissingByKeyTest(const std::shared_ptr& l_schema, + const std::shared_ptr& r_schema) { + DoRunInvalidPlanTest(l_schema, r_schema, + GetRepeatedOptions(2, "time", {"invalid_key"}, 0), + "Bad join key on table : No match"); } +void DoRunNestedOnKeyTest(const std::shared_ptr& l_schema, + const std::shared_ptr& r_schema) { + DoRunInvalidPlanTest(l_schema, r_schema, GetRepeatedOptions(2, {0, "time"}, {"key"}, 0), + "Bad join key on table : No match"); +} + +void DoRunNestedByKeyTest(const std::shared_ptr& l_schema, + const std::shared_ptr& r_schema) { + DoRunInvalidPlanTest(l_schema, r_schema, + GetRepeatedOptions(2, "time", {FieldRef{0, 1}}, 0), + "Bad join key on table : No match"); +} + +void DoRunAmbiguousOnKeyTest(const std::shared_ptr& l_schema, + const std::shared_ptr& r_schema) { + DoRunInvalidPlanTest(l_schema, r_schema, 0, "Bad join key on table : Multiple matches"); +} + +void DoRunAmbiguousByKeyTest(const std::shared_ptr& l_schema, + const std::shared_ptr& r_schema) { + DoRunInvalidPlanTest(l_schema, r_schema, 0, "Bad join key on table : Multiple matches"); +} + +// Gets a batch for testing as a Json string +// The batch will have n_rows rows n_cols columns, the first column being the on-field +// If unordered is true then the first column will be out-of-order +std::string GetTestBatchAsJsonString(int n_rows, int n_cols, bool unordered = false) { + int order_mask = unordered ? 
1 : 0; + std::stringstream s; + s << '['; + for (int i = 0; i < n_rows; i++) { + if (i > 0) { + s << ", "; + } + s << '['; + for (int j = 0; j < n_cols; j++) { + if (j > 0) { + s << ", " << j; + } else if (j < 2) { + s << (i ^ order_mask); + } else { + s << i; + } + } + s << ']'; + } + s << ']'; + return s.str(); +} + +void DoRunUnorderedPlanTest(bool l_unordered, bool r_unordered, + const std::shared_ptr& l_schema, + const std::shared_ptr& r_schema, + const AsofJoinNodeOptions& join_options, + const std::string& expected_error_str) { + ASSERT_TRUE(l_unordered || r_unordered); + int n_rows = 5; + auto l_str = GetTestBatchAsJsonString(n_rows, l_schema->num_fields(), l_unordered); + auto r_str = GetTestBatchAsJsonString(n_rows, r_schema->num_fields(), r_unordered); + ASSERT_OK_AND_ASSIGN(auto l_batches, MakeBatchesFromNumString(l_schema, {l_str})); + ASSERT_OK_AND_ASSIGN(auto r_batches, MakeBatchesFromNumString(r_schema, {r_str})); + + return DoInvalidPlanTest(l_batches, r_batches, join_options, expected_error_str, + /*then_run_plan=*/true); +} + +void DoRunUnorderedPlanTest(bool l_unordered, bool r_unordered, + const std::shared_ptr& l_schema, + const std::shared_ptr& r_schema) { + DoRunUnorderedPlanTest(l_unordered, r_unordered, l_schema, r_schema, + GetRepeatedOptions(2, "time", {"key"}, 1000), + "out-of-order on-key values"); +} + +struct BasicTestTypes { + std::shared_ptr time, key, l_val, r0_val, r1_val; +}; + +struct BasicTest { + BasicTest(const std::vector& l_data, + const std::vector& r0_data, + const std::vector& r1_data, + const std::vector& exp_nokey_data, + const std::vector& exp_emptykey_data, + const std::vector& exp_data, int64_t tolerance) + : l_data(std::move(l_data)), + r0_data(std::move(r0_data)), + r1_data(std::move(r1_data)), + exp_nokey_data(std::move(exp_nokey_data)), + exp_emptykey_data(std::move(exp_emptykey_data)), + exp_data(std::move(exp_data)), + tolerance(tolerance) {} + + static inline void check_init(const std::vector>& types) { + ASSERT_NE(0, types.size()); + } + + template + static inline std::vector> init_types( + const std::vector>& all_types, TypeCond type_cond) { + std::vector> types; + for (auto type : all_types) { + if (type_cond(type)) { + types.push_back(type); + } + } + check_init(types); + return types; + } + + void RunSingleByKey() { + using B = BatchesWithSchema; + RunBatches([this](B l_batches, B r0_batches, B r1_batches, B exp_nokey_batches, + B exp_emptykey_batches, B exp_batches) { + CheckRunOutput(l_batches, r0_batches, r1_batches, exp_batches, "time", "key", + tolerance); + }); + } + static void DoSingleByKey(BasicTest& basic_tests) { basic_tests.RunSingleByKey(); } + void RunDoubleByKey() { + using B = BatchesWithSchema; + RunBatches([this](B l_batches, B r0_batches, B r1_batches, B exp_nokey_batches, + B exp_emptykey_batches, B exp_batches) { + CheckRunOutput(l_batches, r0_batches, r1_batches, exp_batches, "time", + {"key", "key"}, tolerance); + }); + } + static void DoDoubleByKey(BasicTest& basic_tests) { basic_tests.RunDoubleByKey(); } + void RunMutateByKey() { + using B = BatchesWithSchema; + RunBatches([this](B l_batches, B r0_batches, B r1_batches, B exp_nokey_batches, + B exp_emptykey_batches, B exp_batches) { + ASSERT_OK_AND_ASSIGN(l_batches, MutateByKey(l_batches, "key", "key2")); + ASSERT_OK_AND_ASSIGN(r0_batches, MutateByKey(r0_batches, "key", "key2")); + ASSERT_OK_AND_ASSIGN(r1_batches, MutateByKey(r1_batches, "key", "key2")); + ASSERT_OK_AND_ASSIGN(exp_batches, MutateByKey(exp_batches, "key", "key2")); + 
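+      // Joining on {"key", "key2"} below goes through GetRepeatedOptions; a rough
+      // sketch of the options it builds, assuming the {on_key, by_key} aggregate
+      // AsofJoinNodeOptions::Keys used above:
+      //
+      //   AsofJoinNodeOptions::Keys keys{/*on_key=*/"time", /*by_key=*/{"key", "key2"}};
+      //   AsofJoinNodeOptions options({keys, keys, keys}, /*tolerance=*/tolerance);
+      //
+      // The expected batches were given the same mutation above, so the comparison
+      // stays column-aligned.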
CheckRunOutput(l_batches, r0_batches, r1_batches, exp_batches, "time", + {"key", "key2"}, tolerance); + }); + } + static void DoMutateByKey(BasicTest& basic_tests) { basic_tests.RunMutateByKey(); } + void RunMutateNoKey() { + using B = BatchesWithSchema; + RunBatches([this](B l_batches, B r0_batches, B r1_batches, B exp_nokey_batches, + B exp_emptykey_batches, B exp_batches) { + ASSERT_OK_AND_ASSIGN(l_batches, MutateByKey(l_batches, "key", "key2", true)); + ASSERT_OK_AND_ASSIGN(r0_batches, MutateByKey(r0_batches, "key", "key2", true)); + ASSERT_OK_AND_ASSIGN(r1_batches, MutateByKey(r1_batches, "key", "key2", true)); + ASSERT_OK_AND_ASSIGN(exp_nokey_batches, + MutateByKey(exp_nokey_batches, "key", "key2", true)); + CheckRunOutput(l_batches, r0_batches, r1_batches, exp_nokey_batches, "time", "key2", + tolerance); + }); + } + static void DoMutateNoKey(BasicTest& basic_tests) { basic_tests.RunMutateNoKey(); } + void RunMutateNullKey() { + using B = BatchesWithSchema; + RunBatches([this](B l_batches, B r0_batches, B r1_batches, B exp_nokey_batches, + B exp_emptykey_batches, B exp_batches) { + ASSERT_OK_AND_ASSIGN(l_batches, MutateByKey(l_batches, "key", "key2", true, true)); + ASSERT_OK_AND_ASSIGN(r0_batches, + MutateByKey(r0_batches, "key", "key2", true, true)); + ASSERT_OK_AND_ASSIGN(r1_batches, + MutateByKey(r1_batches, "key", "key2", true, true)); + ASSERT_OK_AND_ASSIGN(exp_nokey_batches, + MutateByKey(exp_nokey_batches, "key", "key2", true, true)); + CheckRunOutput(l_batches, r0_batches, r1_batches, exp_nokey_batches, + GetRepeatedOptions(3, "time", {"key2"}, tolerance)); + }); + } + static void DoMutateNullKey(BasicTest& basic_tests) { basic_tests.RunMutateNullKey(); } + void RunMutateEmptyKey() { + using B = BatchesWithSchema; + RunBatches([this](B l_batches, B r0_batches, B r1_batches, B exp_nokey_batches, + B exp_emptykey_batches, B exp_batches) { + ASSERT_OK_AND_ASSIGN(r0_batches, + MutateByKey(r0_batches, "key", "key", false, false, true)); + ASSERT_OK_AND_ASSIGN(r1_batches, + MutateByKey(r1_batches, "key", "key", false, false, true)); + CheckRunOutput(l_batches, r0_batches, r1_batches, exp_emptykey_batches, + GetRepeatedOptions(3, "time", {}, tolerance)); + }); + } + static void DoMutateEmptyKey(BasicTest& basic_tests) { + basic_tests.RunMutateEmptyKey(); + } + template + void RunBatches(BatchesRunner batches_runner) { + std::vector> all_types = { + utf8(), + large_utf8(), + binary(), + large_binary(), + boolean(), + int8(), + int16(), + int32(), + int64(), + uint8(), + uint16(), + uint32(), + uint64(), + date32(), + date64(), + time32(TimeUnit::MILLI), + time32(TimeUnit::SECOND), + time64(TimeUnit::NANO), + time64(TimeUnit::MICRO), + timestamp(TimeUnit::NANO, "UTC"), + timestamp(TimeUnit::MICRO, "UTC"), + timestamp(TimeUnit::MILLI, "UTC"), + timestamp(TimeUnit::SECOND, "UTC"), + float32(), + float64()}; + using T = const std::shared_ptr; + // byte_width > 1 below allows fitting the tested data + auto time_types = init_types( + all_types, [](T& t) { return t->byte_width() > 1 && !is_floating(t->id()); }); + auto key_types = init_types( + all_types, [](T& t) { return !is_floating(t->id()) && t->id() != Type::BOOL; }); + auto l_types = init_types(all_types, [](T& t) { return true; }); + auto r0_types = init_types(all_types, [](T& t) { return t->byte_width() > 1; }); + auto r1_types = init_types(all_types, [](T& t) { return t->byte_width() > 1; }); + + // sample a limited number of type-combinations to keep the runnning time reasonable + // the scoped-traces below help reproduce a test 
failure, should it happen + auto start_time = std::chrono::system_clock::now(); + auto seed = start_time.time_since_epoch().count(); + ARROW_SCOPED_TRACE("Types seed: ", seed); + std::default_random_engine engine(static_cast(seed)); + std::uniform_int_distribution time_distribution(0, time_types.size() - 1); + std::uniform_int_distribution key_distribution(0, key_types.size() - 1); + std::uniform_int_distribution l_distribution(0, l_types.size() - 1); + std::uniform_int_distribution r0_distribution(0, r0_types.size() - 1); + std::uniform_int_distribution r1_distribution(0, r1_types.size() - 1); + + for (int i = 0; i < 100; i++) { + auto time_type = time_types[time_distribution(engine)]; + ARROW_SCOPED_TRACE("Time type: ", *time_type); + auto key_type = key_types[key_distribution(engine)]; + ARROW_SCOPED_TRACE("Key type: ", *key_type); + auto l_type = l_types[l_distribution(engine)]; + ARROW_SCOPED_TRACE("Left type: ", *l_type); + auto r0_type = r0_types[r0_distribution(engine)]; + ARROW_SCOPED_TRACE("Right-0 type: ", *r0_type); + auto r1_type = r1_types[r1_distribution(engine)]; + ARROW_SCOPED_TRACE("Right-1 type: ", *r1_type); + + RunTypes({time_type, key_type, l_type, r0_type, r1_type}, batches_runner); + + auto end_time = std::chrono::system_clock::now(); + std::chrono::duration diff = end_time - start_time; + if (diff.count() > 0.2) { + break; + } + } + } + template + void RunTypes(BasicTestTypes basic_test_types, BatchesRunner batches_runner) { + const BasicTestTypes& b = basic_test_types; + auto l_schema = + schema({field("time", b.time), field("key", b.key), field("l_v0", b.l_val)}); + auto r0_schema = + schema({field("time", b.time), field("key", b.key), field("r0_v0", b.r0_val)}); + auto r1_schema = + schema({field("time", b.time), field("key", b.key), field("r1_v0", b.r1_val)}); + + auto exp_schema = schema({ + field("time", b.time), + field("key", b.key), + field("l_v0", b.l_val), + field("r0_v0", b.r0_val), + field("r1_v0", b.r1_val), + }); + + // Test three table join + ASSERT_OK_AND_ASSIGN(auto l_batches, MakeBatchesFromNumString(l_schema, l_data)); + ASSERT_OK_AND_ASSIGN(auto r0_batches, MakeBatchesFromNumString(r0_schema, r0_data)); + ASSERT_OK_AND_ASSIGN(auto r1_batches, MakeBatchesFromNumString(r1_schema, r1_data)); + ASSERT_OK_AND_ASSIGN(auto exp_nokey_batches, + MakeBatchesFromNumString(exp_schema, exp_nokey_data)); + ASSERT_OK_AND_ASSIGN(auto exp_emptykey_batches, + MakeBatchesFromNumString(exp_schema, exp_emptykey_data)); + ASSERT_OK_AND_ASSIGN(auto exp_batches, + MakeBatchesFromNumString(exp_schema, exp_data)); + batches_runner(l_batches, r0_batches, r1_batches, exp_nokey_batches, + exp_emptykey_batches, exp_batches); + } + + std::vector l_data; + std::vector r0_data; + std::vector r1_data; + std::vector exp_nokey_data; + std::vector exp_emptykey_data; + std::vector exp_data; + int64_t tolerance; +}; + +using AsofJoinBasicParams = std::tuple, std::string>; + +void PrintTo(const AsofJoinBasicParams& x, ::std::ostream* os) { + *os << "AsofJoinBasicParams: " << std::get<1>(x); +} + +struct AsofJoinBasicTest : public testing::TestWithParam {}; + class AsofJoinTest : public testing::Test {}; -TEST(AsofJoinTest, TestBasic1) { +BasicTest GetBasicTest1() { + // Single key, single batch + return BasicTest( + /*l*/ {R"([[0, 1, 1], [1000, 1, 2]])"}, + /*r0*/ {R"([[0, 1, 11]])"}, + /*r1*/ {R"([[1000, 1, 101]])"}, + /*exp_nokey*/ {R"([[0, 0, 1, 11, null], [1000, 0, 2, 11, 101]])"}, + /*exp_emptykey*/ {R"([[0, 1, 1, 11, null], [1000, 1, 2, 11, 101]])"}, + /*exp*/ {R"([[0, 1, 1, 
11, null], [1000, 1, 2, 11, 101]])"}, 1000); +} + +TRACED_TEST_P(AsofJoinBasicTest, TestBasic1, { + BasicTest basic_test = GetBasicTest1(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetBasicTest1Negative() { // Single key, single batch - DoRunBasicTest( - /*l*/ {R"([[0, 1, 1.0], [1000, 1, 2.0]])"}, - /*r0*/ {R"([[0, 1, 11.0]])"}, - /*r1*/ {R"([[1000, 1, 101.0]])"}, - /*exp*/ {R"([[0, 1, 1.0, 11.0, null], [1000, 1, 2.0, 11.0, 101.0]])"}, 1000); + return BasicTest( + /*l*/ {R"([[0, 1, 1], [1000, 1, 2]])"}, + /*r0*/ {R"([[1000, 1, 11]])"}, + /*r1*/ {R"([[2000, 1, 101]])"}, + /*exp_nokey*/ {R"([[0, 0, 1, 11, null], [1000, 0, 2, 11, 101]])"}, + /*exp_emptykey*/ {R"([[0, 1, 1, 11, null], [1000, 1, 2, 11, 101]])"}, + /*exp*/ {R"([[0, 1, 1, 11, null], [1000, 1, 2, 11, 101]])"}, -1000); } -TEST(AsofJoinTest, TestBasic2) { +TRACED_TEST_P(AsofJoinBasicTest, TestBasic1Negative, { + BasicTest basic_test = GetBasicTest1Negative(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetBasicTest2() { // Single key, multiple batches - DoRunBasicTest( - /*l*/ {R"([[0, 1, 1.0]])", R"([[1000, 1, 2.0]])"}, - /*r0*/ {R"([[0, 1, 11.0]])", R"([[1000, 1, 12.0]])"}, - /*r1*/ {R"([[0, 1, 101.0]])", R"([[1000, 1, 102.0]])"}, - /*exp*/ {R"([[0, 1, 1.0, 11.0, 101.0], [1000, 1, 2.0, 12.0, 102.0]])"}, 1000); + return BasicTest( + /*l*/ {R"([[0, 1, 1]])", R"([[1000, 1, 2]])"}, + /*r0*/ {R"([[0, 1, 11]])", R"([[1000, 1, 12]])"}, + /*r1*/ {R"([[0, 1, 101]])", R"([[1000, 1, 102]])"}, + /*exp_nokey*/ {R"([[0, 0, 1, 11, 101], [1000, 0, 2, 12, 102]])"}, + /*exp_emptykey*/ {R"([[0, 1, 1, 11, 101], [1000, 1, 2, 12, 102]])"}, + /*exp*/ {R"([[0, 1, 1, 11, 101], [1000, 1, 2, 12, 102]])"}, 1000); } -TEST(AsofJoinTest, TestBasic3) { +TRACED_TEST_P(AsofJoinBasicTest, TestBasic2, { + BasicTest basic_test = GetBasicTest2(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetBasicTest2Negative() { + // Single key, multiple batches + return BasicTest( + /*l*/ {R"([[0, 1, 1]])", R"([[1000, 1, 2]])"}, + /*r0*/ {R"([[500, 1, 11]])", R"([[1000, 1, 12]])"}, + /*r1*/ {R"([[500, 1, 101]])", R"([[1000, 1, 102]])"}, + /*exp_nokey*/ {R"([[0, 0, 1, 11, 101], [1000, 0, 2, 12, 102]])"}, + /*exp_emptykey*/ {R"([[0, 1, 1, 11, 101], [1000, 1, 2, 12, 102]])"}, + /*exp*/ {R"([[0, 1, 1, 11, 101], [1000, 1, 2, 12, 102]])"}, -1000); +} + +TRACED_TEST_P(AsofJoinBasicTest, TestBasic2Negative, { + BasicTest basic_test = GetBasicTest2Negative(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetBasicTest3() { + // Single key, multiple left batches, single right batches + return BasicTest( + /*l*/ {R"([[0, 1, 1]])", R"([[1000, 1, 2]])"}, + /*r0*/ {R"([[0, 1, 11], [1000, 1, 12]])"}, + /*r1*/ {R"([[0, 1, 101], [1000, 1, 102]])"}, + /*exp_nokey*/ {R"([[0, 0, 1, 11, 101], [1000, 0, 2, 12, 102]])"}, + /*exp_emptykey*/ {R"([[0, 1, 1, 11, 101], [1000, 1, 2, 12, 102]])"}, + /*exp*/ {R"([[0, 1, 1, 11, 101], [1000, 1, 2, 12, 102]])"}, 1000); +} + +TRACED_TEST_P(AsofJoinBasicTest, TestBasic3, { + ARROW_SCOPED_TRACE("AsofJoinBasicTest_TestBasic3_" + std::get<1>(GetParam())); + BasicTest basic_test = GetBasicTest3(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetBasicTest3Negative() { // Single key, multiple left batches, single right batches - DoRunBasicTest( - /*l*/ {R"([[0, 1, 1.0]])", R"([[1000, 1, 2.0]])"}, - /*r0*/ {R"([[0, 1, 11.0], [1000, 1, 12.0]])"}, - /*r1*/ {R"([[0, 1, 101.0], [1000, 1, 102.0]])"}, - 
/*exp*/ {R"([[0, 1, 1.0, 11.0, 101.0], [1000, 1, 2.0, 12.0, 102.0]])"}, 1000); + return BasicTest( + /*l*/ {R"([[0, 1, 1]])", R"([[1000, 1, 2]])"}, + /*r0*/ {R"([[500, 1, 11], [1000, 1, 12]])"}, + /*r1*/ {R"([[500, 1, 101], [1000, 1, 102]])"}, + /*exp_nokey*/ {R"([[0, 0, 1, 11, 101], [1000, 0, 2, 12, 102]])"}, + /*exp_emptykey*/ {R"([[0, 1, 1, 11, 101], [1000, 1, 2, 12, 102]])"}, + /*exp*/ {R"([[0, 1, 1, 11, 101], [1000, 1, 2, 12, 102]])"}, -1000); } -TEST(AsofJoinTest, TestBasic4) { +TRACED_TEST_P(AsofJoinBasicTest, TestBasic3Negative, { + ARROW_SCOPED_TRACE("AsofJoinBasicTest_TestBasic3_" + std::get<1>(GetParam())); + BasicTest basic_test = GetBasicTest3Negative(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetBasicTest4() { // Multi key, multiple batches, misaligned batches - DoRunBasicTest( + return BasicTest( /*l*/ - {R"([[0, 1, 1.0], [0, 2, 21.0], [500, 1, 2.0], [1000, 2, 22.0], [1500, 1, 3.0], [1500, 2, 23.0]])", - R"([[2000, 1, 4.0], [2000, 2, 24.0]])"}, + {R"([[0, 1, 1], [0, 2, 21], [500, 1, 2], [1000, 2, 22], [1500, 1, 3], [1500, 2, 23]])", + R"([[2000, 1, 4], [2000, 2, 24]])"}, /*r0*/ - {R"([[0, 1, 11.0], [500, 2, 31.0], [1000, 1, 12.0]])", - R"([[1500, 2, 32.0], [2000, 1, 13.0], [2500, 2, 33.0]])"}, + {R"([[0, 1, 11], [500, 2, 31], [1000, 1, 12]])", + R"([[1500, 2, 32], [2000, 1, 13], [2500, 2, 33]])"}, /*r1*/ - {R"([[0, 2, 1001.0], [500, 1, 101.0]])", - R"([[1000, 1, 102.0], [1500, 2, 1002.0], [2000, 1, 103.0]])"}, + {R"([[0, 2, 1001], [500, 1, 101]])", + R"([[1000, 1, 102], [1500, 2, 1002], [2000, 1, 103]])"}, + /*exp_nokey*/ + {R"([[0, 0, 1, 11, 1001], [0, 0, 21, 11, 1001], [500, 0, 2, 31, 101], [1000, 0, 22, 12, 102], [1500, 0, 3, 32, 1002], [1500, 0, 23, 32, 1002]])", + R"([[2000, 0, 4, 13, 103], [2000, 0, 24, 13, 103]])"}, + /*exp_emptykey*/ + {R"([[0, 1, 1, 11, 1001], [0, 2, 21, 11, 1001], [500, 1, 2, 31, 101], [1000, 2, 22, 12, 102], [1500, 1, 3, 32, 1002], [1500, 2, 23, 32, 1002]])", + R"([[2000, 1, 4, 13, 103], [2000, 2, 24, 13, 103]])"}, /*exp*/ - {R"([[0, 1, 1.0, 11.0, null], [0, 2, 21.0, null, 1001.0], [500, 1, 2.0, 11.0, 101.0], [1000, 2, 22.0, 31.0, 1001.0], [1500, 1, 3.0, 12.0, 102.0], [1500, 2, 23.0, 32.0, 1002.0]])", - R"([[2000, 1, 4.0, 13.0, 103.0], [2000, 2, 24.0, 32.0, 1002.0]])"}, + {R"([[0, 1, 1, 11, null], [0, 2, 21, null, 1001], [500, 1, 2, 11, 101], [1000, 2, 22, 31, 1001], [1500, 1, 3, 12, 102], [1500, 2, 23, 32, 1002]])", + R"([[2000, 1, 4, 13, 103], [2000, 2, 24, 32, 1002]])"}, 1000); } -TEST(AsofJoinTest, TestBasic5) { +TRACED_TEST_P(AsofJoinBasicTest, TestBasic4, { + BasicTest basic_test = GetBasicTest4(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetBasicTest4Negative() { + // Multi key, multiple batches, misaligned batches + return BasicTest( + /*l*/ + {R"([[0, 1, 1], [0, 2, 21], [500, 1, 2], [1000, 2, 22], [1500, 1, 3], [1500, 2, 23]])", + R"([[2000, 1, 4], [2000, 2, 24]])"}, + /*r0*/ + {R"([[0, 1, 11], [500, 2, 31], [1000, 1, 12]])", + R"([[1600, 2, 32], [1900, 2, 33], [2100, 1, 13]])"}, + /*r1*/ + {R"([[0, 2, 1001], [500, 1, 101]])", + R"([[1100, 1, 102], [1600, 2, 1002], [2100, 1, 103]])"}, + /*exp_nokey*/ + {R"([[0, 0, 1, 11, 1001], [0, 0, 21, 11, 1001], [500, 0, 2, 31, 101], [1000, 0, 22, 12, 102], [1500, 0, 3, 32, 1002], [1500, 0, 23, 32, 1002]])", + R"([[2000, 0, 4, 13, 103], [2000, 0, 24, 13, 103]])"}, + /*exp_emptykey*/ + {R"([[0, 1, 1, 11, 1001], [0, 2, 21, 11, 1001], [500, 1, 2, 31, 101], [1000, 2, 22, 12, 102], [1500, 1, 3, 32, 1002], [1500, 2, 23, 32, 
1002]])", + R"([[2000, 1, 4, 13, 103], [2000, 2, 24, 13, 103]])"}, + /*exp*/ + {R"([[0, 1, 1, 11, 101], [0, 2, 21, 31, 1001], [500, 1, 2, 12, 101], [1000, 2, 22, 32, 1002], [1500, 1, 3, 13, 103], [1500, 2, 23, 32, 1002]])", + R"([[2000, 1, 4, 13, 103], [2000, 2, 24, null, null]])"}, + -1000); +} + +TRACED_TEST_P(AsofJoinBasicTest, TestBasic4Negative, { + BasicTest basic_test = GetBasicTest4Negative(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetBasicTest5() { // Multi key, multiple batches, misaligned batches, smaller tolerance - DoRunBasicTest(/*l*/ - {R"([[0, 1, 1.0], [0, 2, 21.0], [500, 1, 2.0], [1000, 2, 22.0], [1500, 1, 3.0], [1500, 2, 23.0]])", - R"([[2000, 1, 4.0], [2000, 2, 24.0]])"}, - /*r0*/ - {R"([[0, 1, 11.0], [500, 2, 31.0], [1000, 1, 12.0]])", - R"([[1500, 2, 32.0], [2000, 1, 13.0], [2500, 2, 33.0]])"}, - /*r1*/ - {R"([[0, 2, 1001.0], [500, 1, 101.0]])", - R"([[1000, 1, 102.0], [1500, 2, 1002.0], [2000, 1, 103.0]])"}, - /*exp*/ - {R"([[0, 1, 1.0, 11.0, null], [0, 2, 21.0, null, 1001.0], [500, 1, 2.0, 11.0, 101.0], [1000, 2, 22.0, 31.0, null], [1500, 1, 3.0, 12.0, 102.0], [1500, 2, 23.0, 32.0, 1002.0]])", - R"([[2000, 1, 4.0, 13.0, 103.0], [2000, 2, 24.0, 32.0, 1002.0]])"}, - 500); -} - -TEST(AsofJoinTest, TestBasic6) { + return BasicTest(/*l*/ + {R"([[0, 1, 1], [0, 2, 21], [500, 1, 2], [1000, 2, 22], [1500, 1, 3], [1500, 2, 23]])", + R"([[2000, 1, 4], [2000, 2, 24]])"}, + /*r0*/ + {R"([[0, 1, 11], [500, 2, 31], [1000, 1, 12]])", + R"([[1500, 2, 32], [2000, 1, 13], [2500, 2, 33]])"}, + /*r1*/ + {R"([[0, 2, 1001], [500, 1, 101]])", + R"([[1000, 1, 102], [1500, 2, 1002], [2000, 1, 103]])"}, + /*exp_nokey*/ + {R"([[0, 0, 1, 11, 1001], [0, 0, 21, 11, 1001], [500, 0, 2, 31, 101], [1000, 0, 22, 12, 102], [1500, 0, 3, 32, 1002], [1500, 0, 23, 32, 1002]])", + R"([[2000, 0, 4, 13, 103], [2000, 0, 24, 13, 103]])"}, + /*exp_emptykey*/ + {R"([[0, 1, 1, 11, 1001], [0, 2, 21, 11, 1001], [500, 1, 2, 31, 101], [1000, 2, 22, 12, 102], [1500, 1, 3, 32, 1002], [1500, 2, 23, 32, 1002]])", + R"([[2000, 1, 4, 13, 103], [2000, 2, 24, 13, 103]])"}, + /*exp*/ + {R"([[0, 1, 1, 11, null], [0, 2, 21, null, 1001], [500, 1, 2, 11, 101], [1000, 2, 22, 31, null], [1500, 1, 3, 12, 102], [1500, 2, 23, 32, 1002]])", + R"([[2000, 1, 4, 13, 103], [2000, 2, 24, 32, 1002]])"}, + 500); +} + +TRACED_TEST_P(AsofJoinBasicTest, TestBasic5, { + ARROW_SCOPED_TRACE("AsofJoinBasicTest_TestBasic5_" + std::get<1>(GetParam())); + BasicTest basic_test = GetBasicTest5(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetBasicTest5Negative() { + // Multi key, multiple batches, misaligned batches, smaller tolerance + return BasicTest(/*l*/ + {R"([[0, 1, 1], [0, 2, 21], [500, 1, 2], [1000, 2, 22], [1500, 1, 3], [1500, 2, 23]])", + R"([[2000, 1, 4], [2000, 2, 24]])"}, + /*r0*/ + {R"([[0, 1, 11], [500, 2, 31], [1000, 1, 12]])", + R"([[1500, 2, 32], [2000, 1, 13], [2500, 2, 33]])"}, + /*r1*/ + {R"([[0, 2, 1001], [500, 1, 101]])", + R"([[1000, 1, 102], [1500, 2, 1002], [2000, 1, 103]])"}, + /*exp_nokey*/ + {R"([[0, 0, 1, 11, 1001], [0, 0, 21, 11, 1001], [500, 0, 2, 31, 101], [1000, 0, 22, 12, 102], [1500, 0, 3, 32, 1002], [1500, 0, 23, 32, 1002]])", + R"([[2000, 0, 4, 13, 103], [2000, 0, 24, 13, 103]])"}, + /*exp_emptykey*/ + {R"([[0, 1, 1, 11, 1001], [0, 2, 21, 11, 1001], [500, 1, 2, 31, 101], [1000, 2, 22, 12, 102], [1500, 1, 3, 32, 1002], [1500, 2, 23, 32, 1002]])", + R"([[2000, 1, 4, 13, 103], [2000, 2, 24, 13, 103]])"}, + /*exp*/ + {R"([[0, 1, 1, 11, 101], [0, 
2, 21, 31, 1001], [500, 1, 2, 12, 101], [1000, 2, 22, 32, 1002], [1500, 1, 3, 13, 103], [1500, 2, 23, 32, 1002]])", + R"([[2000, 1, 4, 13, 103], [2000, 2, 24, 33, null]])"}, + -500); +} + +TRACED_TEST_P(AsofJoinBasicTest, TestBasic5Negative, { + ARROW_SCOPED_TRACE("AsofJoinBasicTest_TestBasic5_" + std::get<1>(GetParam())); + BasicTest basic_test = GetBasicTest5Negative(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetBasicTest6() { // Multi key, multiple batches, misaligned batches, zero tolerance - DoRunBasicTest(/*l*/ - {R"([[0, 1, 1.0], [0, 2, 21.0], [500, 1, 2.0], [1000, 2, 22.0], [1500, 1, 3.0], [1500, 2, 23.0]])", - R"([[2000, 1, 4.0], [2000, 2, 24.0]])"}, - /*r0*/ - {R"([[0, 1, 11.0], [500, 2, 31.0], [1000, 1, 12.0]])", - R"([[1500, 2, 32.0], [2000, 1, 13.0], [2500, 2, 33.0]])"}, - /*r1*/ - {R"([[0, 2, 1001.0], [500, 1, 101.0]])", - R"([[1000, 1, 102.0], [1500, 2, 1002.0], [2000, 1, 103.0]])"}, - /*exp*/ - {R"([[0, 1, 1.0, 11.0, null], [0, 2, 21.0, null, 1001.0], [500, 1, 2.0, null, 101.0], [1000, 2, 22.0, null, null], [1500, 1, 3.0, null, null], [1500, 2, 23.0, 32.0, 1002.0]])", - R"([[2000, 1, 4.0, 13.0, 103.0], [2000, 2, 24.0, null, null]])"}, - 0); -} - -TEST(AsofJoinTest, TestEmpty1) { + return BasicTest(/*l*/ + {R"([[0, 1, 1], [0, 2, 21], [500, 1, 2], [1000, 2, 22], [1500, 1, 3], [1500, 2, 23]])", + R"([[2000, 1, 4], [2000, 2, 24]])"}, + /*r0*/ + {R"([[0, 1, 11], [500, 2, 31], [1000, 1, 12]])", + R"([[1500, 2, 32], [2000, 1, 13], [2500, 2, 33]])"}, + /*r1*/ + {R"([[0, 2, 1001], [500, 1, 101]])", + R"([[1000, 1, 102], [1500, 2, 1002], [2000, 1, 103]])"}, + /*exp_nokey*/ + {R"([[0, 0, 1, 11, 1001], [0, 0, 21, 11, 1001], [500, 0, 2, 31, 101], [1000, 0, 22, 12, 102], [1500, 0, 3, 32, 1002], [1500, 0, 23, 32, 1002]])", + R"([[2000, 0, 4, 13, 103], [2000, 0, 24, 13, 103]])"}, + /*exp_emptykey*/ + {R"([[0, 1, 1, 11, 1001], [0, 2, 21, 11, 1001], [500, 1, 2, 31, 101], [1000, 2, 22, 12, 102], [1500, 1, 3, 32, 1002], [1500, 2, 23, 32, 1002]])", + R"([[2000, 1, 4, 13, 103], [2000, 2, 24, 13, 103]])"}, + /*exp*/ + {R"([[0, 1, 1, 11, null], [0, 2, 21, null, 1001], [500, 1, 2, null, 101], [1000, 2, 22, null, null], [1500, 1, 3, null, null], [1500, 2, 23, 32, 1002]])", + R"([[2000, 1, 4, 13, 103], [2000, 2, 24, null, null]])"}, + 0); +} + +TRACED_TEST_P(AsofJoinBasicTest, TestBasic6, { + ARROW_SCOPED_TRACE("AsofJoinBasicTest_TestBasic6_" + std::get<1>(GetParam())); + BasicTest basic_test = GetBasicTest6(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetEmptyTest1() { // Empty left batch - DoRunBasicTest(/*l*/ - {R"([])", R"([[2000, 1, 4.0], [2000, 2, 24.0]])"}, - /*r0*/ - {R"([[0, 1, 11.0], [500, 2, 31.0], [1000, 1, 12.0]])", - R"([[1500, 2, 32.0], [2000, 1, 13.0], [2500, 2, 33.0]])"}, - /*r1*/ - {R"([[0, 2, 1001.0], [500, 1, 101.0]])", - R"([[1000, 1, 102.0], [1500, 2, 1002.0], [2000, 1, 103.0]])"}, - /*exp*/ - {R"([[2000, 1, 4.0, 13.0, 103.0], [2000, 2, 24.0, 32.0, 1002.0]])"}, - 1000); -} - -TEST(AsofJoinTest, TestEmpty2) { + return BasicTest(/*l*/ + {R"([])", R"([[2000, 1, 4], [2000, 2, 24]])"}, + /*r0*/ + {R"([[0, 1, 11], [500, 2, 31], [1000, 1, 12]])", + R"([[1500, 2, 32], [2000, 1, 13], [2500, 2, 33]])"}, + /*r1*/ + {R"([[0, 2, 1001], [500, 1, 101]])", + R"([[1000, 1, 102], [1500, 2, 1002], [2000, 1, 103]])"}, + /*exp_nokey*/ + {R"([[2000, 0, 4, 13, 103], [2000, 0, 24, 13, 103]])"}, + /*exp_emptykey*/ + {R"([[2000, 1, 4, 13, 103], [2000, 2, 24, 13, 103]])"}, + /*exp*/ + {R"([[2000, 1, 4, 13, 103], [2000, 2, 24, 
32, 1002]])"}, 1000); +} + +TRACED_TEST_P(AsofJoinBasicTest, TestEmpty1, { + ARROW_SCOPED_TRACE("AsofJoinBasicTest_TestEmpty1_" + std::get<1>(GetParam())); + BasicTest basic_test = GetEmptyTest1(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetEmptyTest1Negative() { + // Empty left batch + return BasicTest(/*l*/ + {R"([])", R"([[2000, 1, 4], [2000, 2, 24]])"}, + /*r0*/ + {R"([[0, 1, 11], [500, 2, 31], [1000, 1, 12]])", + R"([[1500, 2, 32], [2000, 1, 13], [2500, 2, 33]])"}, + /*r1*/ + {R"([[0, 2, 1001], [500, 1, 101]])", + R"([[1000, 1, 102], [1500, 2, 1002], [2000, 1, 103]])"}, + /*exp_nokey*/ + {R"([[2000, 0, 4, 13, 103], [2000, 0, 24, 13, 103]])"}, + /*exp_emptykey*/ + {R"([[2000, 1, 4, 13, 103], [2000, 2, 24, 13, 103]])"}, + /*exp*/ + {R"([[2000, 1, 4, 13, 103], [2000, 2, 24, 33, null]])"}, -1000); +} + +TRACED_TEST_P(AsofJoinBasicTest, TestEmpty1Negative, { + ARROW_SCOPED_TRACE("AsofJoinBasicTest_TestEmpty1Negative_" + std::get<1>(GetParam())); + BasicTest basic_test = GetEmptyTest1Negative(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetEmptyTest2() { + // Empty left input + return BasicTest(/*l*/ + {R"([])"}, + /*r0*/ + {R"([[0, 1, 11], [500, 2, 31], [1000, 1, 12]])", + R"([[1500, 2, 32], [2000, 1, 13], [2500, 2, 33]])"}, + /*r1*/ + {R"([[0, 2, 1001], [500, 1, 101]])", + R"([[1000, 1, 102], [1500, 2, 1002], [2000, 1, 103]])"}, + /*exp_nokey*/ + {R"([])"}, + /*exp_emptykey*/ + {R"([])"}, + /*exp*/ + {R"([])"}, 1000); +} + +TRACED_TEST_P(AsofJoinBasicTest, TestEmpty2, { + ARROW_SCOPED_TRACE("AsofJoinBasicTest_TestEmpty2_" + std::get<1>(GetParam())); + BasicTest basic_test = GetEmptyTest2(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetEmptyTest2Negative() { // Empty left input - DoRunBasicTest(/*l*/ - {R"([])"}, - /*r0*/ - {R"([[0, 1, 11.0], [500, 2, 31.0], [1000, 1, 12.0]])", - R"([[1500, 2, 32.0], [2000, 1, 13.0], [2500, 2, 33.0]])"}, - /*r1*/ - {R"([[0, 2, 1001.0], [500, 1, 101.0]])", - R"([[1000, 1, 102.0], [1500, 2, 1002.0], [2000, 1, 103.0]])"}, - /*exp*/ - {R"([])"}, 1000); -} - -TEST(AsofJoinTest, TestEmpty3) { + return BasicTest(/*l*/ + {R"([])"}, + /*r0*/ + {R"([[0, 1, 11], [500, 2, 31], [1000, 1, 12]])", + R"([[1500, 2, 32], [2000, 1, 13], [2500, 2, 33]])"}, + /*r1*/ + {R"([[0, 2, 1001], [500, 1, 101]])", + R"([[1000, 1, 102], [1500, 2, 1002], [2000, 1, 103]])"}, + /*exp_nokey*/ + {R"([])"}, + /*exp_emptykey*/ + {R"([])"}, + /*exp*/ + {R"([])"}, -1000); +} + +TRACED_TEST_P(AsofJoinBasicTest, TestEmpty2Negative, { + ARROW_SCOPED_TRACE("AsofJoinBasicTest_TestEmpty2Negative_" + std::get<1>(GetParam())); + BasicTest basic_test = GetEmptyTest2Negative(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetEmptyTest3() { + // Empty right batch + return BasicTest(/*l*/ + {R"([[0, 1, 1], [0, 2, 21], [500, 1, 2], [1000, 2, 22], [1500, 1, 3], [1500, 2, 23]])", + R"([[2000, 1, 4], [2000, 2, 24]])"}, + /*r0*/ + {R"([])", R"([[1500, 2, 32], [2000, 1, 13], [2500, 2, 33]])"}, + /*r1*/ + {R"([[0, 2, 1001], [500, 1, 101]])", + R"([[1000, 1, 102], [1500, 2, 1002], [2000, 1, 103]])"}, + /*exp_nokey*/ + {R"([[0, 0, 1, null, 1001], [0, 0, 21, null, 1001], [500, 0, 2, null, 101], [1000, 0, 22, null, 102], [1500, 0, 3, 32, 1002], [1500, 0, 23, 32, 1002]])", + R"([[2000, 0, 4, 13, 103], [2000, 0, 24, 13, 103]])"}, + /*exp_emptykey*/ + {R"([[0, 1, 1, null, 1001], [0, 2, 21, null, 1001], [500, 1, 2, null, 101], [1000, 2, 22, null, 102], [1500, 1, 3, 32, 
1002], [1500, 2, 23, 32, 1002]])", + R"([[2000, 1, 4, 13, 103], [2000, 2, 24, 13, 103]])"}, + /*exp*/ + {R"([[0, 1, 1, null, null], [0, 2, 21, null, 1001], [500, 1, 2, null, 101], [1000, 2, 22, null, 1001], [1500, 1, 3, null, 102], [1500, 2, 23, 32, 1002]])", + R"([[2000, 1, 4, 13, 103], [2000, 2, 24, 32, 1002]])"}, + 1000); +} + +TRACED_TEST_P(AsofJoinBasicTest, TestEmpty3, { + ARROW_SCOPED_TRACE("AsofJoinBasicTest_TestEmpty3_" + std::get<1>(GetParam())); + BasicTest basic_test = GetEmptyTest3(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetEmptyTest3Negative() { // Empty right batch - DoRunBasicTest(/*l*/ - {R"([[0, 1, 1.0], [0, 2, 21.0], [500, 1, 2.0], [1000, 2, 22.0], [1500, 1, 3.0], [1500, 2, 23.0]])", - R"([[2000, 1, 4.0], [2000, 2, 24.0]])"}, - /*r0*/ - {R"([])", R"([[1500, 2, 32.0], [2000, 1, 13.0], [2500, 2, 33.0]])"}, - /*r1*/ - {R"([[0, 2, 1001.0], [500, 1, 101.0]])", - R"([[1000, 1, 102.0], [1500, 2, 1002.0], [2000, 1, 103.0]])"}, - /*exp*/ - {R"([[0, 1, 1.0, null, null], [0, 2, 21.0, null, 1001.0], [500, 1, 2.0, null, 101.0], [1000, 2, 22.0, null, 1001.0], [1500, 1, 3.0, null, 102.0], [1500, 2, 23.0, 32.0, 1002.0]])", - R"([[2000, 1, 4.0, 13.0, 103.0], [2000, 2, 24.0, 32.0, 1002.0]])"}, - 1000); -} - -TEST(AsofJoinTest, TestEmpty4) { + return BasicTest(/*l*/ + {R"([[0, 1, 1], [0, 2, 21], [500, 1, 2], [1000, 2, 22], [1500, 1, 3], [1500, 2, 23]])", + R"([[2000, 1, 4], [2000, 2, 24]])"}, + /*r0*/ + {R"([])", R"([[1500, 2, 32], [2000, 1, 13], [2500, 2, 33]])"}, + /*r1*/ + {R"([[0, 2, 1001], [500, 1, 101]])", + R"([[1000, 1, 102], [1500, 2, 1002], [2000, 1, 103]])"}, + /*exp_nokey*/ + {R"([[0, 0, 1, null, 1001], [0, 0, 21, null, 1001], [500, 0, 2, 32, 101], [1000, 0, 22, 32, 102], [1500, 0, 3, 32, 1002], [1500, 0, 23, 32, 1002]])", + R"([[2000, 0, 4, 13, 103], [2000, 0, 24, 13, 103]])"}, + /*exp_emptykey*/ + {R"([[0, 1, 1, null, 1001], [0, 2, 21, null, 1001], [500, 1, 2, 32, 101], [1000, 2, 22, 32, 102], [1500, 1, 3, 32, 1002], [1500, 2, 23, 32, 1002]])", + R"([[2000, 1, 4, 13, 103], [2000, 2, 24, 13, 103]])"}, + /*exp*/ + {R"([[0, 1, 1, null, 101], [0, 2, 21, null, 1001], [500, 1, 2, null, 101], [1000, 2, 22, 32, 1002], [1500, 1, 3, 13, 103], [1500, 2, 23, 32, 1002]])", + R"([[2000, 1, 4, 13, 103], [2000, 2, 24, 33, null]])"}, + -1000); +} + +TRACED_TEST_P(AsofJoinBasicTest, TestEmpty3Negative, { + ARROW_SCOPED_TRACE("AsofJoinBasicTest_TestEmpty3Negative_" + std::get<1>(GetParam())); + BasicTest basic_test = GetEmptyTest3Negative(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetEmptyTest4() { // Empty right input - DoRunBasicTest(/*l*/ - {R"([[0, 1, 1.0], [0, 2, 21.0], [500, 1, 2.0], [1000, 2, 22.0], [1500, 1, 3.0], [1500, 2, 23.0]])", - R"([[2000, 1, 4.0], [2000, 2, 24.0]])"}, - /*r0*/ - {R"([])"}, - /*r1*/ - {R"([[0, 2, 1001.0], [500, 1, 101.0]])", - R"([[1000, 1, 102.0], [1500, 2, 1002.0], [2000, 1, 103.0]])"}, - /*exp*/ - {R"([[0, 1, 1.0, null, null], [0, 2, 21.0, null, 1001.0], [500, 1, 2.0, null, 101.0], [1000, 2, 22.0, null, 1001.0], [1500, 1, 3.0, null, 102.0], [1500, 2, 23.0, null, 1002.0]])", - R"([[2000, 1, 4.0, null, 103.0], [2000, 2, 24.0, null, 1002.0]])"}, - 1000); -} - -TEST(AsofJoinTest, TestEmpty5) { - // All empty - DoRunBasicTest(/*l*/ - {R"([])"}, - /*r0*/ - {R"([])"}, - /*r1*/ - {R"([])"}, - /*exp*/ - {R"([])"}, 1000); + return BasicTest(/*l*/ + {R"([[0, 1, 1], [0, 2, 21], [500, 1, 2], [1000, 2, 22], [1500, 1, 3], [1500, 2, 23]])", + R"([[2000, 1, 4], [2000, 2, 24]])"}, + /*r0*/ + 
{R"([])"}, + /*r1*/ + {R"([[0, 2, 1001], [500, 1, 101]])", + R"([[1000, 1, 102], [1500, 2, 1002], [2000, 1, 103]])"}, + /*exp_nokey*/ + {R"([[0, 0, 1, null, 1001], [0, 0, 21, null, 1001], [500, 0, 2, null, 101], [1000, 0, 22, null, 102], [1500, 0, 3, null, 1002], [1500, 0, 23, null, 1002]])", + R"([[2000, 0, 4, null, 103], [2000, 0, 24, null, 103]])"}, + /*exp_emptykey*/ + {R"([[0, 1, 1, null, 1001], [0, 2, 21, null, 1001], [500, 1, 2, null, 101], [1000, 2, 22, null, 102], [1500, 1, 3, null, 1002], [1500, 2, 23, null, 1002]])", + R"([[2000, 1, 4, null, 103], [2000, 2, 24, null, 103]])"}, + /*exp*/ + {R"([[0, 1, 1, null, null], [0, 2, 21, null, 1001], [500, 1, 2, null, 101], [1000, 2, 22, null, 1001], [1500, 1, 3, null, 102], [1500, 2, 23, null, 1002]])", + R"([[2000, 1, 4, null, 103], [2000, 2, 24, null, 1002]])"}, + 1000); } -TEST(AsofJoinTest, TestUnsupportedOntype) { - DoRunInvalidTypeTest( - schema({field("time", utf8()), field("key", int32()), field("l_v0", float64())}), - schema({field("time", utf8()), field("key", int32()), field("r0_v0", float32())})); +TRACED_TEST_P(AsofJoinBasicTest, TestEmpty4, { + ARROW_SCOPED_TRACE("AsofJoinBasicTest_TestEmpty4_" + std::get<1>(GetParam())); + BasicTest basic_test = GetEmptyTest4(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetEmptyTest4Negative() { + // Empty right input + return BasicTest(/*l*/ + {R"([[0, 1, 1], [0, 2, 21], [500, 1, 2], [1000, 2, 22], [1500, 1, 3], [1500, 2, 23]])", + R"([[2000, 1, 4], [2000, 2, 24]])"}, + /*r0*/ + {R"([])"}, + /*r1*/ + {R"([[0, 2, 1001], [500, 1, 101]])", + R"([[1000, 1, 102], [1500, 2, 1002], [2000, 1, 103]])"}, + /*exp_nokey*/ + {R"([[0, 0, 1, null, 1001], [0, 0, 21, null, 1001], [500, 0, 2, null, 101], [1000, 0, 22, null, 102], [1500, 0, 3, null, 1002], [1500, 0, 23, null, 1002]])", + R"([[2000, 0, 4, null, 103], [2000, 0, 24, null, 103]])"}, + /*exp_emptykey*/ + {R"([[0, 1, 1, null, 1001], [0, 2, 21, null, 1001], [500, 1, 2, null, 101], [1000, 2, 22, null, 102], [1500, 1, 3, null, 1002], [1500, 2, 23, null, 1002]])", + R"([[2000, 1, 4, null, 103], [2000, 2, 24, null, 103]])"}, + /*exp*/ + {R"([[0, 1, 1, null, 101], [0, 2, 21, null, 1001], [500, 1, 2, null, 101], [1000, 2, 22, null, 1002], [1500, 1, 3, null, 103], [1500, 2, 23, null, 1002]])", + R"([[2000, 1, 4, null, 103], [2000, 2, 24, null, null]])"}, + -1000); } -TEST(AsofJoinTest, TestUnsupportedBytype) { - DoRunInvalidTypeTest( - schema({field("time", int64()), field("key", utf8()), field("l_v0", float64())}), - schema({field("time", int64()), field("key", utf8()), field("r0_v0", float32())})); +TRACED_TEST_P(AsofJoinBasicTest, TestEmpty4Negative, { + ARROW_SCOPED_TRACE("AsofJoinBasicTest_TestEmpty4Negative_" + std::get<1>(GetParam())); + BasicTest basic_test = GetEmptyTest4Negative(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetEmptyTest5() { + // All empty + return BasicTest(/*l*/ + {R"([])"}, + /*r0*/ + {R"([])"}, + /*r1*/ + {R"([])"}, + /*exp_nokey*/ + {R"([])"}, + /*exp_emptykey*/ + {R"([])"}, + /*exp*/ + {R"([])"}, 1000); } -TEST(AsofJoinTest, TestUnsupportedDatatype) { - // Utf8 is unsupported - DoRunInvalidTypeTest( - schema({field("time", int64()), field("key", int32()), field("l_v0", float64())}), - schema({field("time", int64()), field("key", int32()), field("r0_v0", utf8())})); +TRACED_TEST_P(AsofJoinBasicTest, TestEmpty5, { + ARROW_SCOPED_TRACE("AsofJoinBasicTest_TestEmpty5_" + std::get<1>(GetParam())); + BasicTest basic_test = GetEmptyTest5(); + auto runner = 
std::get<0>(GetParam()); + runner(basic_test); +}) + +BasicTest GetEmptyTest5Negative() { + // All empty + return BasicTest(/*l*/ + {R"([])"}, + /*r0*/ + {R"([])"}, + /*r1*/ + {R"([])"}, + /*exp_nokey*/ + {R"([])"}, + /*exp_emptykey*/ + {R"([])"}, + /*exp*/ + {R"([])"}, -1000); } -TEST(AsofJoinTest, TestMissingKeys) { +TRACED_TEST_P(AsofJoinBasicTest, TestEmpty5Negative, { + ARROW_SCOPED_TRACE("AsofJoinBasicTest_TestEmpty5Negative_" + std::get<1>(GetParam())); + BasicTest basic_test = GetEmptyTest5Negative(); + auto runner = std::get<0>(GetParam()); + runner(basic_test); +}) + +INSTANTIATE_TEST_SUITE_P( + AsofJoinNodeTest, AsofJoinBasicTest, + testing::Values(AsofJoinBasicParams(BasicTest::DoSingleByKey, "SingleByKey"), + AsofJoinBasicParams(BasicTest::DoDoubleByKey, "DoubleByKey"), + AsofJoinBasicParams(BasicTest::DoMutateByKey, "MutateByKey"), + AsofJoinBasicParams(BasicTest::DoMutateNoKey, "MutateNoKey"), + AsofJoinBasicParams(BasicTest::DoMutateNullKey, "MutateNullKey"), + AsofJoinBasicParams(BasicTest::DoMutateEmptyKey, "MutateEmptyKey"))); + +TRACED_TEST(AsofJoinTest, TestUnsupportedOntype, { + DoRunInvalidTypeTest(schema({field("time", list(int32())), field("key", int32()), + field("l_v0", float64())}), + schema({field("time", list(int32())), field("key", int32()), + field("r0_v0", float32())})); +}) + +TRACED_TEST(AsofJoinTest, TestUnsupportedBytype, { + DoRunInvalidTypeTest(schema({field("time", int64()), field("key", list(int32())), + field("l_v0", float64())}), + schema({field("time", int64()), field("key", list(int32())), + field("r0_v0", float32())})); +}) + +TRACED_TEST(AsofJoinTest, TestUnsupportedDatatype, { + // List is unsupported DoRunInvalidTypeTest( + schema({field("time", int64()), field("key", int32()), field("l_v0", float64())}), + schema({field("time", int64()), field("key", int32()), + field("r0_v0", list(int32()))})); +}) + +TRACED_TEST(AsofJoinTest, TestMissingKeys, { + DoRunMissingKeysTest( schema({field("time1", int64()), field("key", int32()), field("l_v0", float64())}), schema( {field("time1", int64()), field("key", int32()), field("r0_v0", float64())})); - DoRunInvalidTypeTest( + DoRunMissingKeysTest( schema({field("time", int64()), field("key1", int32()), field("l_v0", float64())}), schema( {field("time", int64()), field("key1", int32()), field("r0_v0", float64())})); -} +}) + +TRACED_TEST(AsofJoinTest, TestMissingOnKey, { + DoRunMissingOnKeyTest( + schema({field("time", int64()), field("key", int32()), field("l_v0", float64())}), + schema({field("time", int64()), field("key", int32()), field("r0_v0", float64())})); +}) + +TRACED_TEST(AsofJoinTest, TestMissingByKey, { + DoRunMissingByKeyTest( + schema({field("time", int64()), field("key", int32()), field("l_v0", float64())}), + schema({field("time", int64()), field("key", int32()), field("r0_v0", float64())})); +}) + +TRACED_TEST(AsofJoinTest, TestNestedOnKey, { + DoRunNestedOnKeyTest( + schema({field("time", int64()), field("key", int32()), field("l_v0", float64())}), + schema({field("time", int64()), field("key", int32()), field("r0_v0", float64())})); +}) + +TRACED_TEST(AsofJoinTest, TestNestedByKey, { + DoRunNestedByKeyTest( + schema({field("time", int64()), field("key", int32()), field("l_v0", float64())}), + schema({field("time", int64()), field("key", int32()), field("r0_v0", float64())})); +}) + +TRACED_TEST(AsofJoinTest, TestAmbiguousOnKey, { + DoRunAmbiguousOnKeyTest( + schema({field("time", int64()), field("time", int64()), field("key", int32()), + field("l_v0", float64())}), + 
schema({field("time", int64()), field("key", int32()), field("r0_v0", float64())})); +}) + +TRACED_TEST(AsofJoinTest, TestAmbiguousByKey, { + DoRunAmbiguousByKeyTest( + schema({field("time", int64()), field("key", int64()), field("key", int32()), + field("l_v0", float64())}), + schema({field("time", int64()), field("key", int32()), field("r0_v0", float64())})); +}) + +TRACED_TEST(AsofJoinTest, TestLeftUnorderedOnKey, { + DoRunUnorderedPlanTest( + /*l_unordered=*/true, /*r_unordered=*/false, + schema({field("time", int64()), field("key", int32()), field("l_v0", float64())}), + schema({field("time", int64()), field("key", int32()), field("r0_v0", float64())})); +}) + +TRACED_TEST(AsofJoinTest, TestRightUnorderedOnKey, { + DoRunUnorderedPlanTest( + /*l_unordered=*/false, /*r_unordered=*/true, + schema({field("time", int64()), field("key", int32()), field("l_v0", float64())}), + schema({field("time", int64()), field("key", int32()), field("r0_v0", float64())})); +}) + +TRACED_TEST(AsofJoinTest, TestUnorderedOnKey, { + DoRunUnorderedPlanTest( + /*l_unordered=*/true, /*r_unordered=*/true, + schema({field("time", int64()), field("key", int32()), field("l_v0", float64())}), + schema({field("time", int64()), field("key", int32()), field("r0_v0", float64())})); +}) } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/benchmark_util.cc b/cpp/src/arrow/compute/exec/benchmark_util.cc index 5bac508854f..3c4dda2992a 100644 --- a/cpp/src/arrow/compute/exec/benchmark_util.cc +++ b/cpp/src/arrow/compute/exec/benchmark_util.cc @@ -24,6 +24,7 @@ #include "arrow/compute/exec/exec_plan.h" #include "arrow/compute/exec/options.h" #include "arrow/compute/exec/task_util.h" +#include "arrow/compute/exec/util.h" #include "arrow/util/macros.h" namespace arrow { @@ -34,7 +35,6 @@ namespace compute { // calling InputFinished and InputReceived. Status BenchmarkIsolatedNodeOverhead(benchmark::State& state, - arrow::compute::ExecContext ctx, arrow::compute::Expression expr, int32_t num_batches, int32_t batch_size, arrow::compute::BatchesWithSchema data, @@ -42,10 +42,10 @@ Status BenchmarkIsolatedNodeOverhead(benchmark::State& state, arrow::compute::ExecNodeOptions& options) { for (auto _ : state) { state.PauseTiming(); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, - arrow::compute::ExecPlan::Make(&ctx)); + arrow::compute::ExecPlan::Make()); // Source and sink nodes have no effect on the benchmark. // Used for dummy purposes as they are referenced in InputReceived and InputFinished. ARROW_ASSIGN_OR_RAISE(arrow::compute::ExecNode * source_node, @@ -112,14 +112,14 @@ Status BenchmarkIsolatedNodeOverhead(benchmark::State& state, // a source -> node_declarations -> sink sequence. 
Status BenchmarkNodeOverhead( - benchmark::State& state, arrow::compute::ExecContext ctx, int32_t num_batches, - int32_t batch_size, arrow::compute::BatchesWithSchema data, + benchmark::State& state, int32_t num_batches, int32_t batch_size, + arrow::compute::BatchesWithSchema data, std::vector& node_declarations) { for (auto _ : state) { state.PauseTiming(); ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, - arrow::compute::ExecPlan::Make(&ctx)); - AsyncGenerator> sink_gen; + arrow::compute::ExecPlan::Make()); + AsyncGenerator> sink_gen; arrow::compute::Declaration source = arrow::compute::Declaration( {"source", arrow::compute::SourceNodeOptions{data.schema, diff --git a/cpp/src/arrow/compute/exec/benchmark_util.h b/cpp/src/arrow/compute/exec/benchmark_util.h index 7897288cb8f..c66c2e91dbf 100644 --- a/cpp/src/arrow/compute/exec/benchmark_util.h +++ b/cpp/src/arrow/compute/exec/benchmark_util.h @@ -29,13 +29,11 @@ namespace arrow { namespace compute { -Status BenchmarkNodeOverhead(benchmark::State& state, arrow::compute::ExecContext ctx, - int32_t num_batches, int32_t batch_size, - arrow::compute::BatchesWithSchema data, +Status BenchmarkNodeOverhead(benchmark::State& state, int32_t num_batches, + int32_t batch_size, arrow::compute::BatchesWithSchema data, std::vector& node_declarations); Status BenchmarkIsolatedNodeOverhead(benchmark::State& state, - arrow::compute::ExecContext ctx, arrow::compute::Expression expr, int32_t num_batches, int32_t batch_size, arrow::compute::BatchesWithSchema data, diff --git a/cpp/src/arrow/compute/exec/bloom_filter.h b/cpp/src/arrow/compute/exec/bloom_filter.h index 06920c6c14f..b0227e720d8 100644 --- a/cpp/src/arrow/compute/exec/bloom_filter.h +++ b/cpp/src/arrow/compute/exec/bloom_filter.h @@ -249,7 +249,7 @@ class ARROW_EXPORT BlockedBloomFilter { // b) It is preferred for small and medium size Bloom filters, because it skips extra // synchronization related steps from parallel variant (partitioning and taking locks). 
// -enum class ARROW_EXPORT BloomFilterBuildStrategy { +enum class BloomFilterBuildStrategy { SINGLE_THREADED = 0, PARALLEL = 1, }; diff --git a/cpp/src/arrow/compute/exec/exec_plan.cc b/cpp/src/arrow/compute/exec/exec_plan.cc index 15d95690076..88cd298d2cb 100644 --- a/cpp/src/arrow/compute/exec/exec_plan.cc +++ b/cpp/src/arrow/compute/exec/exec_plan.cc @@ -17,6 +17,8 @@ #include "arrow/compute/exec/exec_plan.h" +#include +#include #include #include #include @@ -24,30 +26,38 @@ #include "arrow/compute/exec.h" #include "arrow/compute/exec/expression.h" #include "arrow/compute/exec/options.h" +#include "arrow/compute/exec/query_context.h" #include "arrow/compute/exec/task_util.h" -#include "arrow/compute/exec_internal.h" #include "arrow/compute/registry.h" #include "arrow/datum.h" #include "arrow/record_batch.h" #include "arrow/result.h" +#include "arrow/table.h" #include "arrow/util/async_generator.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" -#include "arrow/util/optional.h" +#include "arrow/util/string.h" #include "arrow/util/tracing_internal.h" +#include "arrow/util/vector.h" namespace arrow { using internal::checked_cast; +using internal::ThreadPool; +using internal::ToChars; namespace compute { namespace { struct ExecPlanImpl : public ExecPlan { - explicit ExecPlanImpl(ExecContext* exec_context, - std::shared_ptr metadata = NULLPTR) - : ExecPlan(exec_context), metadata_(std::move(metadata)) {} + explicit ExecPlanImpl(QueryOptions options, ExecContext exec_context, + std::shared_ptr metadata = nullptr, + std::shared_ptr owned_thread_pool = nullptr) + : metadata_(std::move(metadata)), + query_context_(options, exec_context), + owned_thread_pool_(std::move(owned_thread_pool)) {} ~ExecPlanImpl() override { if (started_ && !finished_.is_finished()) { @@ -57,12 +67,9 @@ struct ExecPlanImpl : public ExecPlan { } } - size_t GetThreadIndex() { return thread_indexer_(); } - size_t max_concurrency() const { return thread_indexer_.Capacity(); } - ExecNode* AddNode(std::unique_ptr node) { if (node->label().empty()) { - node->SetLabel(std::to_string(auto_label_counter_++)); + node->SetLabel(ToChars(auto_label_counter_++)); } if (node->num_inputs() == 0) { sources_.push_back(node.get()); @@ -74,45 +81,6 @@ struct ExecPlanImpl : public ExecPlan { return nodes_.back().get(); } - Result> BeginExternalTask() { - Future<> completion_future = Future<>::Make(); - ARROW_ASSIGN_OR_RAISE(bool task_added, - task_group_.AddTaskIfNotEnded(completion_future)); - if (task_added) { - return std::move(completion_future); - } - // Return an invalid future if we were already finished to signal to the - // caller that they should not begin the task - return Future<>{}; - } - - Status ScheduleTask(std::function fn) { - auto executor = exec_context_->executor(); - if (!executor) return fn(); - // Adds a task which submits fn to the executor and tracks its progress. If we're - // already stopping then the task is ignored and fn is not executed. 
- return task_group_ - .AddTaskIfNotEnded([executor, fn]() { return executor->Submit(std::move(fn)); }) - .status(); - } - - Status ScheduleTask(std::function fn) { - std::function indexed_fn = [this, fn]() { - size_t thread_index = GetThreadIndex(); - return fn(thread_index); - }; - return ScheduleTask(std::move(indexed_fn)); - } - - int RegisterTaskGroup(std::function task, - std::function on_finished) { - return task_scheduler_->RegisterTaskGroup(std::move(task), std::move(on_finished)); - } - - Status StartTaskGroup(int task_group_id, int64_t num_tasks) { - return task_scheduler_->StartTaskGroup(GetThreadIndex(), task_group_id, num_tasks); - } - Status Validate() const { if (nodes_.empty()) { return Status::Invalid("ExecPlan has no node"); @@ -124,93 +92,117 @@ struct ExecPlanImpl : public ExecPlan { } Status StartProducing() { - START_COMPUTE_SPAN(span_, "ExecPlan", {{"plan", ToString()}}); -#ifdef ARROW_WITH_OPENTELEMETRY - if (HasMetadata()) { - auto pairs = metadata().get()->sorted_pairs(); - opentelemetry::nostd::shared_ptr span = - ::arrow::internal::tracing::UnwrapSpan(span_.details.get()); - std::for_each(std::begin(pairs), std::end(pairs), - [span](std::pair const& pair) { - span->SetAttribute(pair.first, pair.second); - }); - } -#endif if (started_) { return Status::Invalid("restarted ExecPlan"); } - - std::vector> futures; - for (auto& n : nodes_) { - RETURN_NOT_OK(n->Init()); - futures.push_back(n->finished()); + if (query_context_.exec_context()->executor() == nullptr) { + return Status::Invalid( + "An exec plan must have an executor for CPU tasks. To run without threads use " + "a SerialExeuctor (the arrow::compute::DeclarationTo... methods should take " + "care of this for you and are an easier way to execute an ExecPlan.)"); } - - AllFinished(futures).AddCallback([this](const Status& st) { - error_st_ = st; - EndTaskGroup(); - }); - - task_scheduler_->RegisterEnd(); - int num_threads = 1; - bool sync_execution = true; - if (auto executor = exec_context()->executor()) { - num_threads = executor->GetCapacity(); - sync_execution = false; + if (query_context_.io_context()->executor() == nullptr) { + return Status::Invalid("An exec plan must have an I/O executor for I/O tasks."); } - RETURN_NOT_OK(task_scheduler_->StartScheduling( - 0 /* thread_index */, - [this](std::function fn) -> Status { - return this->ScheduleTask(std::move(fn)); - }, - /*concurrent_tasks=*/2 * num_threads, sync_execution)); started_ = true; - // producers precede consumers - sorted_nodes_ = TopoSort(); - - Status st = Status::OK(); - using rev_it = std::reverse_iterator; - for (rev_it it(sorted_nodes_.end()), end(sorted_nodes_.begin()); it != end; ++it) { - auto node = *it; - - EVENT(span_, "StartProducing:" + node->label(), - {{"node.label", node->label()}, {"node.kind_name", node->kind_name()}}); - st = node->StartProducing(); - EVENT(span_, "StartProducing:" + node->label(), {{"status", st.ToString()}}); - if (!st.ok()) { - // Stop nodes that successfully started, in reverse order - stopped_ = true; - StopProducingImpl(it.base(), sorted_nodes_.end()); - for (NodeVector::iterator fw_it = sorted_nodes_.begin(); fw_it != it.base(); - ++fw_it) { - Future<> fut = (*fw_it)->finished(); - if (!fut.is_finished()) fut.MarkFinished(); - } - return st; - } - } - return st; - } - - void EndTaskGroup() { - bool expected = false; - if (group_ended_.compare_exchange_strong(expected, true)) { - task_group_.End().AddCallback([this](const Status& st) { - MARK_SPAN(span_, error_st_ & st); - END_SPAN(span_); - 
finished_.MarkFinished(error_st_ & st); - }); + // We call StartProducing on each of the nodes. The source nodes should generally + // start scheduling some tasks during this call. + // + // If no source node schedules any tasks (e.g. they do all their word synchronously as + // part of StartProducing) then the plan may be finished before we return from this + // call. + Future<> scheduler_finished = util::AsyncTaskScheduler::Make( + [this](util::AsyncTaskScheduler* async_scheduler) { + QueryContext* ctx = query_context(); + RETURN_NOT_OK(ctx->Init(ctx->max_concurrency(), async_scheduler)); + + START_COMPUTE_SPAN(span_, "ExecPlan", {{"plan", ToString()}}); +#ifdef ARROW_WITH_OPENTELEMETRY + if (HasMetadata()) { + auto pairs = metadata().get()->sorted_pairs(); + opentelemetry::nostd::shared_ptr span = + ::arrow::internal::tracing::UnwrapSpan(span_.details.get()); + std::for_each(std::begin(pairs), std::end(pairs), + [span](std::pair const& pair) { + span->SetAttribute(pair.first, pair.second); + }); + } +#endif + // TODO(weston) The entire concept of ExecNode::finished() will hopefully go + // away soon (or at least be replaced by a sub-scheduler to facilitate OT) + for (auto& n : nodes_) { + RETURN_NOT_OK(n->Init()); + } + for (auto& n : nodes_) { + async_scheduler->AddSimpleTask([&] { return n->finished(); }); + } + + ctx->scheduler()->RegisterEnd(); + int num_threads = 1; + bool sync_execution = true; + if (auto executor = query_context()->exec_context()->executor()) { + num_threads = executor->GetCapacity(); + sync_execution = false; + } + RETURN_NOT_OK(ctx->scheduler()->StartScheduling( + 0 /* thread_index */, + [ctx](std::function fn) -> Status { + return ctx->ScheduleTask(std::move(fn)); + }, + /*concurrent_tasks=*/2 * num_threads, sync_execution)); + + // producers precede consumers + sorted_nodes_ = TopoSort(); + + Status st = Status::OK(); + + using rev_it = std::reverse_iterator; + for (rev_it it(sorted_nodes_.end()), end(sorted_nodes_.begin()); it != end; + ++it) { + auto node = *it; + + EVENT(span_, "StartProducing:" + node->label(), + {{"node.label", node->label()}, {"node.kind_name", node->kind_name()}}); + st = node->StartProducing(); + EVENT(span_, "StartProducing:" + node->label(), {{"status", st.ToString()}}); + if (!st.ok()) { + // Stop nodes that successfully started, in reverse order + bool expected = false; + if (stopped_.compare_exchange_strong(expected, true)) { + StopProducingImpl(it.base(), sorted_nodes_.end()); + for (NodeVector::iterator fw_it = sorted_nodes_.begin(); + fw_it != it.base(); ++fw_it) { + Future<> fut = (*fw_it)->finished(); + if (!fut.is_finished()) fut.MarkFinished(); + } + } + return st; + } + } + return st; + }, + [this](const Status& st) { StopProducing(); }); + scheduler_finished.AddCallback( + [this](const Status& st) { finished_.MarkFinished(st); }); + // TODO(weston) Do we really need to return status here? Could we change this return + // to void? 
+ if (finished_.is_finished()) { + return finished_.status(); + } else { + return Status::OK(); } } void StopProducing() { DCHECK(started_) << "stopped an ExecPlan which never started"; EVENT(span_, "StopProducing"); - stopped_ = true; - task_scheduler_->Abort( - [this]() { StopProducingImpl(sorted_nodes_.begin(), sorted_nodes_.end()); }); + bool expected = false; + if (stopped_.compare_exchange_strong(expected, true)) { + query_context()->scheduler()->Abort( + [this]() { StopProducingImpl(sorted_nodes_.begin(), sorted_nodes_.end()); }); + } } template @@ -318,18 +310,18 @@ struct ExecPlanImpl : public ExecPlan { Status error_st_; Future<> finished_ = Future<>::Make(); - bool started_ = false, stopped_ = false; + bool started_ = false; + std::atomic stopped_{false}; std::vector> nodes_; NodeVector sources_, sinks_; NodeVector sorted_nodes_; uint32_t auto_label_counter_ = 0; util::tracing::Span span_; std::shared_ptr metadata_; - - ThreadIndexer thread_indexer_; - std::atomic group_ended_{false}; - util::AsyncTaskGroup task_group_; - std::unique_ptr task_scheduler_ = TaskScheduler::Make(); + QueryContext query_context_; + // This field only exists for backwards compatibility. Remove once the deprecated + // ExecPlan::Make overloads have been removed. + std::shared_ptr owned_thread_pool_; }; ExecPlanImpl* ToDerived(ExecPlan* ptr) { return checked_cast(ptr); } @@ -338,21 +330,49 @@ const ExecPlanImpl* ToDerived(const ExecPlan* ptr) { return checked_cast(ptr); } -util::optional GetNodeIndex(const std::vector& nodes, - const ExecNode* node) { +std::optional GetNodeIndex(const std::vector& nodes, + const ExecNode* node) { for (int i = 0; i < static_cast(nodes.size()); ++i) { if (nodes[i] == node) return i; } - return util::nullopt; + return std::nullopt; } } // namespace const uint32_t ExecPlan::kMaxBatchSize; +Result> ExecPlan::Make( + QueryOptions opts, ExecContext ctx, + std::shared_ptr metadata) { + return std::shared_ptr(new ExecPlanImpl{opts, ctx, std::move(metadata)}); +} + +Result> ExecPlan::Make( + ExecContext ctx, std::shared_ptr metadata) { + return Make(/*opts=*/{}, ctx, std::move(metadata)); +} + +// Deprecated and left for backwards compatibility. 
If the user does not supply a CPU +// executor then we will create a 1 thread pool and tie its lifetime to the plan +Result> ExecPlan::Make( + QueryOptions opts, ExecContext* ctx, + std::shared_ptr metadata) { + if (ctx->executor() == nullptr) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr tpool, ThreadPool::Make(1)); + ExecContext actual_ctx(ctx->memory_pool(), tpool.get(), ctx->func_registry()); + return std::shared_ptr( + new ExecPlanImpl{opts, actual_ctx, std::move(metadata), std::move(tpool)}); + } + return ExecPlan::Make(opts, *ctx, std::move(metadata)); +} + +// Deprecated Result> ExecPlan::Make( ExecContext* ctx, std::shared_ptr metadata) { - return std::shared_ptr(new ExecPlanImpl{ctx, metadata}); + ARROW_SUPPRESS_DEPRECATION_WARNING + return Make(/*opts=*/{}, ctx, std::move(metadata)); + ARROW_UNSUPPRESS_DEPRECATION_WARNING } ExecNode* ExecPlan::AddNode(std::unique_ptr node) { @@ -365,26 +385,7 @@ const ExecPlan::NodeVector& ExecPlan::sources() const { const ExecPlan::NodeVector& ExecPlan::sinks() const { return ToDerived(this)->sinks_; } -size_t ExecPlan::GetThreadIndex() { return ToDerived(this)->GetThreadIndex(); } -size_t ExecPlan::max_concurrency() const { return ToDerived(this)->max_concurrency(); } - -Result> ExecPlan::BeginExternalTask() { - return ToDerived(this)->BeginExternalTask(); -} - -Status ExecPlan::ScheduleTask(std::function fn) { - return ToDerived(this)->ScheduleTask(std::move(fn)); -} -Status ExecPlan::ScheduleTask(std::function fn) { - return ToDerived(this)->ScheduleTask(std::move(fn)); -} -int ExecPlan::RegisterTaskGroup(std::function task, - std::function on_finished) { - return ToDerived(this)->RegisterTaskGroup(std::move(task), std::move(on_finished)); -} -Status ExecPlan::StartTaskGroup(int task_group_id, int64_t num_tasks) { - return ToDerived(this)->StartTaskGroup(task_group_id, num_tasks); -} +QueryContext* ExecPlan::query_context() { return &ToDerived(this)->query_context_; } Status ExecPlan::Validate() { return ToDerived(this)->Validate(); } @@ -469,103 +470,9 @@ bool ExecNode::ErrorIfNotOk(Status status) { return true; } -MapNode::MapNode(ExecPlan* plan, std::vector inputs, - std::shared_ptr output_schema, bool async_mode) - : ExecNode(plan, std::move(inputs), /*input_labels=*/{"target"}, - std::move(output_schema), - /*num_outputs=*/1) { - if (async_mode) { - executor_ = plan_->exec_context()->executor(); - } else { - executor_ = nullptr; - } -} - -void MapNode::ErrorReceived(ExecNode* input, Status error) { - DCHECK_EQ(input, inputs_[0]); - EVENT(span_, "ErrorReceived", {{"error.message", error.message()}}); - outputs_[0]->ErrorReceived(this, std::move(error)); -} - -void MapNode::InputFinished(ExecNode* input, int total_batches) { - DCHECK_EQ(input, inputs_[0]); - EVENT(span_, "InputFinished", {{"batches.length", total_batches}}); - outputs_[0]->InputFinished(this, total_batches); - if (input_counter_.SetTotal(total_batches)) { - this->Finish(); - } -} - -Status MapNode::StartProducing() { - START_COMPUTE_SPAN( - span_, std::string(kind_name()) + ":" + label(), - {{"node.label", label()}, {"node.detail", ToString()}, {"node.kind", kind_name()}}); - return Status::OK(); -} - -void MapNode::PauseProducing(ExecNode* output, int32_t counter) { - inputs_[0]->PauseProducing(this, counter); -} - -void MapNode::ResumeProducing(ExecNode* output, int32_t counter) { - inputs_[0]->ResumeProducing(this, counter); -} - -void MapNode::StopProducing(ExecNode* output) { - DCHECK_EQ(output, outputs_[0]); - StopProducing(); -} - -void MapNode::StopProducing() { - 
EVENT(span_, "StopProducing"); - if (executor_) { - this->stop_source_.RequestStop(); - } - if (input_counter_.Cancel()) { - this->Finish(); - } - inputs_[0]->StopProducing(this); -} - -void MapNode::SubmitTask(std::function(ExecBatch)> map_fn, - ExecBatch batch) { - Status status; - // This will be true if the node is stopped early due to an error or manual - // cancellation - if (input_counter_.Completed()) { - return; - } - auto task = [this, map_fn, batch]() { - auto guarantee = batch.guarantee; - auto output_batch = map_fn(std::move(batch)); - if (ErrorIfNotOk(output_batch.status())) { - return output_batch.status(); - } - output_batch->guarantee = guarantee; - outputs_[0]->InputReceived(this, output_batch.MoveValueUnsafe()); - return Status::OK(); - }; - - status = task(); - if (!status.ok()) { - if (input_counter_.Cancel()) { - this->Finish(status); - } - inputs_[0]->StopProducing(this); - return; - } - if (input_counter_.Increment()) { - this->Finish(); - } -} - -void MapNode::Finish(Status finish_st /*= Status::OK()*/) { - this->finished_.MarkFinished(finish_st); -} - std::shared_ptr MakeGeneratorReader( - std::shared_ptr schema, - std::function>()> gen, MemoryPool* pool) { + std::shared_ptr schema, std::function>()> gen, + MemoryPool* pool) { struct Impl : RecordBatchReader { std::shared_ptr schema() const override { return schema_; } @@ -583,7 +490,7 @@ std::shared_ptr MakeGeneratorReader( // reading from generator until end is reached. std::shared_ptr batch; RETURN_NOT_OK(ReadNext(&batch)); - while (batch != NULLPTR) { + while (batch != nullptr) { RETURN_NOT_OK(ReadNext(&batch)); } return Status::OK(); @@ -591,7 +498,7 @@ std::shared_ptr MakeGeneratorReader( MemoryPool* pool_; std::shared_ptr schema_; - Iterator> iterator_; + Iterator> iterator_; }; auto out = std::make_shared(); @@ -607,12 +514,12 @@ Result Declaration::AddToPlan(ExecPlan* plan, size_t i = 0; for (const Input& input : this->inputs) { - if (auto node = util::get_if(&input)) { + if (auto node = std::get_if(&input)) { inputs[i++] = *node; continue; } ARROW_ASSIGN_OR_RAISE(inputs[i++], - util::get(input).AddToPlan(plan, registry)); + std::get(input).AddToPlan(plan, registry)); } ARROW_ASSIGN_OR_RAISE( @@ -633,11 +540,277 @@ Declaration Declaration::Sequence(std::vector decls) { decls.pop_back(); receiver->inputs.emplace_back(std::move(input)); - receiver = &util::get(receiver->inputs.front()); + receiver = &std::get(receiver->inputs.front()); } return out; } +bool Declaration::IsValid(ExecFactoryRegistry* registry) const { + return !this->factory_name.empty() && this->options != nullptr; +} + +Future> DeclarationToTableAsync(Declaration declaration, + ExecContext exec_context) { + std::shared_ptr> output_table = + std::make_shared>(); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr exec_plan, + ExecPlan::Make(exec_context)); + Declaration with_sink = Declaration::Sequence( + {declaration, {"table_sink", TableSinkNodeOptions(output_table.get())}}); + ARROW_RETURN_NOT_OK(with_sink.AddToPlan(exec_plan.get())); + ARROW_RETURN_NOT_OK(exec_plan->Validate()); + ARROW_RETURN_NOT_OK(exec_plan->StartProducing()); + return exec_plan->finished().Then([exec_plan, output_table] { return *output_table; }); +} + +Future> DeclarationToTableAsync( + Declaration declaration, bool use_threads, MemoryPool* memory_pool, + FunctionRegistry* function_registry) { + if (use_threads) { + ExecContext ctx(memory_pool, ::arrow::internal::GetCpuThreadPool(), + function_registry); + return DeclarationToTableAsync(std::move(declaration), ctx); + } else { 
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ThreadPool> tpool, ThreadPool::Make(1)); + ExecContext ctx(memory_pool, tpool.get(), function_registry); + return DeclarationToTableAsync(std::move(declaration), ctx) + .Then([tpool](const std::shared_ptr<Table>
& table) { return table; }); + } +} + +Result<std::shared_ptr<Table>> DeclarationToTable(Declaration declaration, + bool use_threads, + MemoryPool* memory_pool, + FunctionRegistry* function_registry) { + return ::arrow::internal::RunSynchronously<Future<std::shared_ptr<Table>>>( + [=, declaration = std::move(declaration)](::arrow::internal::Executor* executor) { + ExecContext ctx(memory_pool, executor, function_registry); + return DeclarationToTableAsync(std::move(declaration), ctx); + }, + use_threads); +} + +Future<std::vector<std::shared_ptr<RecordBatch>>> DeclarationToBatchesAsync( + Declaration declaration, ExecContext exec_context) { + return DeclarationToTableAsync(std::move(declaration), exec_context) + .Then([](const std::shared_ptr<Table>
& table) { + return TableBatchReader(table).ToRecordBatches(); + }); +} + +Future>> DeclarationToBatchesAsync( + Declaration declaration, bool use_threads, MemoryPool* memory_pool, + FunctionRegistry* function_registry) { + if (use_threads) { + ExecContext ctx(memory_pool, ::arrow::internal::GetCpuThreadPool(), + function_registry); + return DeclarationToBatchesAsync(std::move(declaration), ctx); + } else { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr tpool, ThreadPool::Make(1)); + ExecContext ctx(memory_pool, tpool.get(), function_registry); + return DeclarationToBatchesAsync(std::move(declaration), ctx) + .Then([tpool](const std::vector>& batches) { + return batches; + }); + } +} + +Result>> DeclarationToBatches( + Declaration declaration, bool use_threads, MemoryPool* memory_pool, + FunctionRegistry* function_registry) { + return ::arrow::internal::RunSynchronously< + Future>>>( + [=, declaration = std::move(declaration)](::arrow::internal::Executor* executor) { + ExecContext ctx(memory_pool, executor, function_registry); + return DeclarationToBatchesAsync(std::move(declaration), ctx); + }, + use_threads); +} + +Future DeclarationToExecBatchesAsync(Declaration declaration, + ExecContext exec_context) { + std::shared_ptr out_schema; + AsyncGenerator> sink_gen; + ARROW_ASSIGN_OR_RAISE(std::shared_ptr exec_plan, + ExecPlan::Make(exec_context)); + Declaration with_sink = Declaration::Sequence( + {declaration, {"sink", SinkNodeOptions(&sink_gen, &out_schema)}}); + ARROW_RETURN_NOT_OK(with_sink.AddToPlan(exec_plan.get())); + ARROW_RETURN_NOT_OK(exec_plan->Validate()); + ARROW_RETURN_NOT_OK(exec_plan->StartProducing()); + auto collected_fut = CollectAsyncGenerator(sink_gen); + return AllFinished({exec_plan->finished(), Future<>(collected_fut)}) + .Then([collected_fut, exec_plan, + schema = std::move(out_schema)]() -> Result { + ARROW_ASSIGN_OR_RAISE(auto collected, collected_fut.result()); + std::vector exec_batches = ::arrow::internal::MapVector( + [](std::optional batch) { return batch.value_or(ExecBatch()); }, + std::move(collected)); + return BatchesWithCommonSchema{std::move(exec_batches), schema}; + }); +} + +Future DeclarationToExecBatchesAsync( + Declaration declaration, bool use_threads, MemoryPool* memory_pool, + FunctionRegistry* function_registry) { + if (use_threads) { + ExecContext ctx(memory_pool, ::arrow::internal::GetCpuThreadPool(), + function_registry); + return DeclarationToExecBatchesAsync(std::move(declaration), ctx); + } else { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr tpool, ThreadPool::Make(1)); + ExecContext ctx(memory_pool, tpool.get(), function_registry); + return DeclarationToExecBatchesAsync(std::move(declaration), ctx) + .Then([tpool](const BatchesWithCommonSchema& batches) { return batches; }); + } +} + +Result DeclarationToExecBatches( + Declaration declaration, bool use_threads, MemoryPool* memory_pool, + FunctionRegistry* function_registry) { + return ::arrow::internal::RunSynchronously>( + [=, declaration = std::move(declaration)](::arrow::internal::Executor* executor) { + ExecContext ctx(memory_pool, executor, function_registry); + return DeclarationToExecBatchesAsync(std::move(declaration), ctx); + }, + use_threads); +} + +Future<> DeclarationToStatusAsync(Declaration declaration, ExecContext exec_context) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr exec_plan, + ExecPlan::Make(exec_context)); + ARROW_ASSIGN_OR_RAISE(ExecNode * last_node, declaration.AddToPlan(exec_plan.get())); + for (int i = 0; i < last_node->num_outputs(); i++) { + ARROW_RETURN_NOT_OK( + 
Declaration("consuming_sink", {last_node}, + ConsumingSinkNodeOptions(NullSinkNodeConsumer::Make())) + .AddToPlan(exec_plan.get())); + } + ARROW_RETURN_NOT_OK(exec_plan->Validate()); + ARROW_RETURN_NOT_OK(exec_plan->StartProducing()); + // Keep the exec_plan alive until it finishes + return exec_plan->finished().Then([exec_plan]() {}); +} + +Future<> DeclarationToStatusAsync(Declaration declaration, bool use_threads, + MemoryPool* memory_pool, + FunctionRegistry* function_registry) { + if (use_threads) { + ExecContext ctx(memory_pool, ::arrow::internal::GetCpuThreadPool(), + function_registry); + return DeclarationToStatusAsync(std::move(declaration), ctx); + } else { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr tpool, ThreadPool::Make(1)); + ExecContext ctx(memory_pool, tpool.get(), function_registry); + return DeclarationToStatusAsync(std::move(declaration), ctx).Then([tpool]() {}); + } +} + +Status DeclarationToStatus(Declaration declaration, bool use_threads, + MemoryPool* memory_pool, FunctionRegistry* function_registry) { + return ::arrow::internal::RunSynchronously>( + [=, declaration = std::move(declaration)](::arrow::internal::Executor* executor) { + ExecContext ctx(memory_pool, executor, function_registry); + return DeclarationToStatusAsync(std::move(declaration), ctx); + }, + use_threads); +} + +namespace { +struct BatchConverter { + ~BatchConverter() { + if (!exec_plan) { + return; + } + if (exec_plan->finished().is_finished()) { + return; + } + exec_plan->StopProducing(); + Status abandoned_status = exec_plan->finished().status(); + if (!abandoned_status.ok()) { + abandoned_status.Warn(); + } + } + + Future> operator()() { + return exec_batch_gen().Then( + [this](const std::optional& batch) + -> Future> { + if (batch) { + return batch->ToRecordBatch(schema); + } else { + return exec_plan->finished().Then( + []() -> std::shared_ptr { return nullptr; }); + } + }, + [this](const Status& err) { + return exec_plan->finished().Then( + [err]() -> Result> { return err; }); + }); + } + + AsyncGenerator> exec_batch_gen; + std::shared_ptr schema; + std::shared_ptr exec_plan; +}; + +Result>> DeclarationToRecordBatchGenerator( + Declaration declaration, ExecContext exec_ctx, std::shared_ptr* out_schema) { + auto converter = std::make_shared(); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, ExecPlan::Make(exec_ctx)); + Declaration with_sink = Declaration::Sequence( + {declaration, + {"sink", SinkNodeOptions(&converter->exec_batch_gen, &converter->schema)}}); + ARROW_RETURN_NOT_OK(with_sink.AddToPlan(plan.get())); + ARROW_RETURN_NOT_OK(plan->StartProducing()); + converter->exec_plan = std::move(plan); + *out_schema = converter->schema; + return [conv = std::move(converter)] { return (*conv)(); }; +} +} // namespace + +Result> DeclarationToReader( + Declaration declaration, bool use_threads, MemoryPool* memory_pool, + FunctionRegistry* function_registry) { + std::shared_ptr schema; + auto batch_iterator = std::make_unique>>( + ::arrow::internal::IterateSynchronously>( + [&](::arrow::internal::Executor* executor) + -> Result>> { + ExecContext exec_ctx(memory_pool, executor, function_registry); + return DeclarationToRecordBatchGenerator(declaration, exec_ctx, &schema); + }, + use_threads)); + + struct PlanReader : RecordBatchReader { + PlanReader(std::shared_ptr schema, + std::unique_ptr>> iterator) + : schema_(std::move(schema)), iterator_(std::move(iterator)) {} + + std::shared_ptr schema() const override { return schema_; } + + Status ReadNext(std::shared_ptr* record_batch) override { + 
DCHECK(!!iterator_) << "call to ReadNext on already closed reader"; + return iterator_->Next().Value(record_batch); + } + + Status Close() override { + // End plan and read from generator until finished + std::shared_ptr batch; + do { + ARROW_RETURN_NOT_OK(ReadNext(&batch)); + } while (batch != nullptr); + iterator_.reset(); + return Status::OK(); + } + + std::shared_ptr schema_; + std::unique_ptr>> iterator_; + }; + + return std::make_unique(std::move(schema), std::move(batch_iterator)); +} + namespace internal { void RegisterSourceNode(ExecFactoryRegistry*); @@ -694,12 +867,12 @@ ExecFactoryRegistry* default_exec_factory_registry() { return &instance; } -Result>()>> MakeReaderGenerator( +Result>()>> MakeReaderGenerator( std::shared_ptr reader, ::arrow::internal::Executor* io_executor, int max_q, int q_restart) { auto batch_it = MakeMapIterator( [](std::shared_ptr batch) { - return util::make_optional(ExecBatch(*batch)); + return std::make_optional(ExecBatch(*batch)); }, MakeIteratorFromReader(reader)); diff --git a/cpp/src/arrow/compute/exec/exec_plan.h b/cpp/src/arrow/compute/exec/exec_plan.h index 5e52f606a69..a1a89158c54 100644 --- a/cpp/src/arrow/compute/exec/exec_plan.h +++ b/cpp/src/arrow/compute/exec/exec_plan.h @@ -17,27 +17,31 @@ #pragma once +#include +#include #include #include +#include #include +#include #include #include "arrow/compute/exec.h" -#include "arrow/compute/exec/util.h" #include "arrow/compute/type_fwd.h" #include "arrow/type_fwd.h" -#include "arrow/util/async_util.h" -#include "arrow/util/cancel.h" -#include "arrow/util/key_value_metadata.h" +#include "arrow/util/future.h" #include "arrow/util/macros.h" -#include "arrow/util/optional.h" #include "arrow/util/tracing.h" +#include "arrow/util/type_fwd.h" #include "arrow/util/visibility.h" namespace arrow { namespace compute { +/// \addtogroup execnode-components +/// @{ + class ARROW_EXPORT ExecPlan : public std::enable_shared_from_this { public: // This allows operators to rely on signed 16-bit indices @@ -46,11 +50,23 @@ class ARROW_EXPORT ExecPlan : public std::enable_shared_from_this { virtual ~ExecPlan() = default; - ExecContext* exec_context() const { return exec_context_; } + QueryContext* query_context(); /// Make an empty exec plan static Result> Make( - ExecContext* = default_exec_context(), + QueryOptions options, ExecContext exec_context = *threaded_exec_context(), + std::shared_ptr metadata = NULLPTR); + + static Result> Make( + ExecContext exec_context = *threaded_exec_context(), + std::shared_ptr metadata = NULLPTR); + + static Result> Make( + QueryOptions options, ExecContext* exec_context, + std::shared_ptr metadata = NULLPTR); + + static Result> Make( + ExecContext* exec_context, std::shared_ptr metadata = NULLPTR); ExecNode* AddNode(std::unique_ptr node); @@ -63,60 +79,6 @@ class ARROW_EXPORT ExecPlan : public std::enable_shared_from_this { return out; } - /// \brief Returns the index of the current thread. - size_t GetThreadIndex(); - /// \brief Returns the maximum number of threads that the plan could use. - /// - /// GetThreadIndex will always return something less than this, so it is safe to - /// e.g. make an array of thread-locals off this. - size_t max_concurrency() const; - - /// \brief Start an external task - /// - /// This should be avoided if possible. It is kept in for now for legacy - /// purposes. This should be called before the external task is started. If - /// a valid future is returned then it should be marked complete when the - /// external task has finished. 
- /// - /// \return an invalid future if the plan has already ended, otherwise this - /// returns a future that must be completed when the external task - /// finishes. - Result> BeginExternalTask(); - - /// \brief Add a single function as a task to the plan's task group. - /// - /// \param fn The task to run. Takes no arguments and returns a Status. - Status ScheduleTask(std::function fn); - - /// \brief Add a single function as a task to the plan's task group. - /// - /// \param fn The task to run. Takes the thread index and returns a Status. - Status ScheduleTask(std::function fn); - // Register/Start TaskGroup is a way of performing a "Parallel For" pattern: - // - The task function takes the thread index and the index of the task - // - The on_finished function takes the thread index - // Returns an integer ID that will be used to reference the task group in - // StartTaskGroup. At runtime, call StartTaskGroup with the ID and the number of times - // you'd like the task to be executed. The need to register a task group before use will - // be removed after we rewrite the scheduler. - /// \brief Register a "parallel for" task group with the scheduler - /// - /// \param task The function implementing the task. Takes the thread_index and - /// the task index. - /// \param on_finished The function that gets run once all tasks have been completed. - /// Takes the thread_index. - /// - /// Must be called inside of ExecNode::Init. - int RegisterTaskGroup(std::function task, - std::function on_finished); - - /// \brief Start the task group with the specified ID. This can only - /// be called once per task_group_id. - /// - /// \param task_group_id The ID of the task group to run - /// \param num_tasks The number of times to run the task - Status StartTaskGroup(int task_group_id, int64_t num_tasks); - /// The initial inputs const NodeVector& sources() const; @@ -146,25 +108,7 @@ class ARROW_EXPORT ExecPlan : public std::enable_shared_from_this { /// \brief Return the plan's attached metadata std::shared_ptr metadata() const; - /// \brief Should the plan use a legacy batching strategy - /// - /// This is currently in place only to support the Scanner::ToTable - /// method. This method relies on batch indices from the scanner - /// remaining consistent. This is impractical in the ExecPlan which - /// might slice batches as needed (e.g. for a join) - /// - /// However, it still works for simple plans and this is the only way - /// we have at the moment for maintaining implicit order. - bool UseLegacyBatching() const { return use_legacy_batching_; } - // For internal use only, see above comment - void SetUseLegacyBatching(bool value) { use_legacy_batching_ = value; } - std::string ToString() const; - - protected: - ExecContext* exec_context_; - bool use_legacy_batching_ = false; - explicit ExecPlan(ExecContext* exec_context) : exec_context_(exec_context) {} }; class ARROW_EXPORT ExecNode { @@ -364,48 +308,6 @@ class ARROW_EXPORT ExecNode { util::tracing::Span span_; }; -/// \brief MapNode is an ExecNode type class which process a task like filter/project -/// (See SubmitTask method) to each given ExecBatch object, which have one input, one -/// output, and are pure functions on the input -/// -/// A simple parallel runner is created with a "map_fn" which is just a function that -/// takes a batch in and returns a batch. 
This simple parallel runner also needs an -/// executor (use simple synchronous runner if there is no executor) - -class ARROW_EXPORT MapNode : public ExecNode { - public: - MapNode(ExecPlan* plan, std::vector inputs, - std::shared_ptr output_schema, bool async_mode); - - void ErrorReceived(ExecNode* input, Status error) override; - - void InputFinished(ExecNode* input, int total_batches) override; - - Status StartProducing() override; - - void PauseProducing(ExecNode* output, int32_t counter) override; - - void ResumeProducing(ExecNode* output, int32_t counter) override; - - void StopProducing(ExecNode* output) override; - - void StopProducing() override; - - protected: - void SubmitTask(std::function(ExecBatch)> map_fn, ExecBatch batch); - - virtual void Finish(Status finish_st = Status::OK()); - - protected: - // Counter for the number of batches received - AtomicCounter input_counter_; - - ::arrow::internal::Executor* executor_; - - // Variable used to cancel remaining tasks in the executor - StopSource stop_source_; -}; - /// \brief An extensible registry for factories of ExecNodes class ARROW_EXPORT ExecFactoryRegistry { public: @@ -444,7 +346,9 @@ inline Result MakeExecNode( /// inputs may also be Declarations). The node can be constructed and added to a plan /// with Declaration::AddToPlan, which will recursively construct any inputs as necessary. struct ARROW_EXPORT Declaration { - using Input = util::Variant; + using Input = std::variant; + + Declaration() {} Declaration(std::string factory_name, std::vector inputs, std::shared_ptr options, std::string label) @@ -509,18 +413,165 @@ struct ARROW_EXPORT Declaration { Result AddToPlan(ExecPlan* plan, ExecFactoryRegistry* registry = default_exec_factory_registry()) const; + // Validate a declaration + bool IsValid(ExecFactoryRegistry* registry = default_exec_factory_registry()) const; + std::string factory_name; std::vector inputs; std::shared_ptr options; std::string label; }; +/// \brief Utility method to run a declaration and collect the results into a table +/// +/// \param declaration A declaration describing the plan to run +/// \param use_threads If `use_threads` is false then all CPU work will be done on the +/// calling thread. I/O tasks will still happen on the I/O executor +/// and may be multi-threaded (but should not use significant CPU +/// resources). +/// \param memory_pool The memory pool to use for allocations made while running the plan. +/// \param function_registry The function registry to use for function execution. If null +/// then the default function registry will be used. +/// +/// This method will add a sink node to the declaration to collect results into a +/// table. It will then create an ExecPlan from the declaration, start the exec plan, +/// block until the plan has finished, and return the created table. +ARROW_EXPORT Result> DeclarationToTable( + Declaration declaration, bool use_threads = true, + MemoryPool* memory_pool = default_memory_pool(), + FunctionRegistry* function_registry = NULLPTR); + +/// \brief Asynchronous version of \see DeclarationToTable +/// +/// \param declaration A declaration describing the plan to run +/// \param use_threads The behavior of use_threads is slightly different than the +/// synchronous version since we cannot run synchronously on the +/// calling thread. Instead, if use_threads=false then a new thread +/// pool will be created with a single thread and this will be used for +/// all compute work. 
+/// \param memory_pool The memory pool to use for allocations made while running the plan. +/// \param function_registry The function registry to use for function execution. If null +/// then the default function registry will be used. +ARROW_EXPORT Future> DeclarationToTableAsync( + Declaration declaration, bool use_threads = true, + MemoryPool* memory_pool = default_memory_pool(), + FunctionRegistry* function_registry = NULLPTR); + +/// \brief Overload of \see DeclarationToTableAsync accepting a custom exec context +/// +/// The executor must be specified (cannot be null) and must be kept alive until the +/// returned future finishes. +ARROW_EXPORT Future> DeclarationToTableAsync( + Declaration declaration, ExecContext custom_exec_context); + +/// \brief a collection of exec batches with a common schema +struct BatchesWithCommonSchema { + std::vector batches; + std::shared_ptr schema; +}; + +/// \brief Utility method to run a declaration and collect the results into ExecBatch +/// vector +/// +/// \see DeclarationToTable for details on threading & execution +ARROW_EXPORT Result DeclarationToExecBatches( + Declaration declaration, bool use_threads = true, + MemoryPool* memory_pool = default_memory_pool(), + FunctionRegistry* function_registry = NULLPTR); + +/// \brief Asynchronous version of \see DeclarationToExecBatches +/// +/// \see DeclarationToTableAsync for details on threading & execution +ARROW_EXPORT Future DeclarationToExecBatchesAsync( + Declaration declaration, bool use_threads = true, + MemoryPool* memory_pool = default_memory_pool(), + FunctionRegistry* function_registry = NULLPTR); + +/// \brief Overload of \see DeclarationToExecBatchesAsync accepting a custom exec context +/// +/// \see DeclarationToTableAsync for details on threading & execution +ARROW_EXPORT Future DeclarationToExecBatchesAsync( + Declaration declaration, ExecContext custom_exec_context); + +/// \brief Utility method to run a declaration and collect the results into a vector +/// +/// \see DeclarationToTable for details on threading & execution +ARROW_EXPORT Result>> DeclarationToBatches( + Declaration declaration, bool use_threads = true, + MemoryPool* memory_pool = default_memory_pool(), + FunctionRegistry* function_registry = NULLPTR); + +/// \brief Asynchronous version of \see DeclarationToBatches +/// +/// \see DeclarationToTableAsync for details on threading & execution +ARROW_EXPORT Future>> DeclarationToBatchesAsync( + Declaration declaration, bool use_threads = true, + MemoryPool* memory_pool = default_memory_pool(), + FunctionRegistry* function_registry = NULLPTR); + +/// \brief Overload of \see DeclarationToBatchesAsync accepting a custom exec context +/// +/// \see DeclarationToTableAsync for details on threading & execution +ARROW_EXPORT Future>> DeclarationToBatchesAsync( + Declaration declaration, ExecContext exec_context); + +/// \brief Utility method to run a declaration and return results as a RecordBatchReader +/// +/// If an exec context is not provided then a default exec context will be used based +/// on the value of `use_threads`. If `use_threads` is false then the CPU exeuctor will +/// be a serial executor and all CPU work will be done on the calling thread. I/O tasks +/// will still happen on the I/O executor and may be multi-threaded. +/// +/// If `use_threads` is false then all CPU work will happen during the calls to +/// RecordBatchReader::Next and no CPU work will happen in the background. 
If +/// `use_threads` is true then CPU work will happen on the CPU thread pool and tasks may +/// run in between calls to RecordBatchReader::Next. If the returned reader is not +/// consumed quickly enough then the plan will eventually pause as the backpressure queue +/// fills up. +/// +/// If a custom exec context is provided then the value of `use_threads` will be ignored. +ARROW_EXPORT Result> DeclarationToReader( + Declaration declaration, bool use_threads = true, + MemoryPool* memory_pool = default_memory_pool(), + FunctionRegistry* function_registry = NULLPTR); + +/// \brief Overload of \see DeclarationToReader accepting a custom exec context +ARROW_EXPORT Result> DeclarationToReader( + Declaration declaration, ExecContext exec_context); + +/// \brief Utility method to run a declaration and ignore results +/// +/// This can be useful when the data are consumed as part of the plan itself, for +/// example, when the plan ends with a write node. +/// +/// \see DeclarationToTable for details on threading & execution +ARROW_EXPORT Status DeclarationToStatus(Declaration declaration, bool use_threads = true, + MemoryPool* memory_pool = default_memory_pool(), + FunctionRegistry* function_registry = NULLPTR); + +/// \brief Asynchronous version of \see DeclarationToStatus +/// +/// This can be useful when the data are consumed as part of the plan itself, for +/// example, when the plan ends with a write node. +/// +/// \see DeclarationToTableAsync for details on threading & execution +ARROW_EXPORT Future<> DeclarationToStatusAsync( + Declaration declaration, bool use_threads = true, + MemoryPool* memory_pool = default_memory_pool(), + FunctionRegistry* function_registry = NULLPTR); + +/// \brief Overload of \see DeclarationToStatusAsync accepting a custom exec context +/// +/// \see DeclarationToTableAsync for details on threading & execution +ARROW_EXPORT Future<> DeclarationToStatusAsync(Declaration declaration, + ExecContext exec_context); + /// \brief Wrap an ExecBatch generator in a RecordBatchReader. /// /// The RecordBatchReader does not impose any ordering on emitted batches. 
ARROW_EXPORT std::shared_ptr MakeGeneratorReader( - std::shared_ptr, std::function>()>, + std::shared_ptr, std::function>()>, MemoryPool*); constexpr int kDefaultBackgroundMaxQ = 32; @@ -530,9 +581,11 @@ constexpr int kDefaultBackgroundQRestart = 16; /// /// Useful as a source node for an Exec plan ARROW_EXPORT -Result>()>> MakeReaderGenerator( +Result>()>> MakeReaderGenerator( std::shared_ptr reader, arrow::internal::Executor* io_executor, int max_q = kDefaultBackgroundMaxQ, int q_restart = kDefaultBackgroundQRestart); +/// @} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/expression.cc b/cpp/src/arrow/compute/exec/expression.cc index 06f36c7f5ad..0dd1a0b9a90 100644 --- a/cpp/src/arrow/compute/exec/expression.cc +++ b/cpp/src/arrow/compute/exec/expression.cc @@ -17,12 +17,15 @@ #include "arrow/compute/exec/expression.h" +#include +#include #include #include #include "arrow/chunked_array.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/exec/expression_internal.h" +#include "arrow/compute/exec/util.h" #include "arrow/compute/exec_internal.h" #include "arrow/compute/function_internal.h" #include "arrow/io/memory.h" @@ -31,7 +34,6 @@ #include "arrow/util/hash_util.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" -#include "arrow/util/optional.h" #include "arrow/util/string.h" #include "arrow/util/value_parsing.h" #include "arrow/util/vector.h" @@ -40,6 +42,8 @@ namespace arrow { using internal::checked_cast; using internal::checked_pointer_cast; +using internal::EndsWith; +using internal::ToChars; namespace compute { @@ -76,10 +80,16 @@ Expression call(std::string function, std::vector arguments, return Expression(std::move(call)); } -const Datum* Expression::literal() const { return util::get_if(impl_.get()); } +const Datum* Expression::literal() const { + if (impl_ == nullptr) return nullptr; + + return std::get_if(impl_.get()); +} const Expression::Parameter* Expression::parameter() const { - return util::get_if(impl_.get()); + if (impl_ == nullptr) return nullptr; + + return std::get_if(impl_.get()); } const FieldRef* Expression::field_ref() const { @@ -90,7 +100,9 @@ const FieldRef* Expression::field_ref() const { } const Expression::Call* Expression::call() const { - return util::get_if(impl_.get()); + if (impl_ == nullptr) return nullptr; + + return std::get_if(impl_.get()); } const DataType* Expression::type() const { @@ -117,8 +129,7 @@ std::string PrintDatum(const Datum& datum) { case Type::STRING: case Type::LARGE_STRING: return '"' + - Escape(util::string_view(*datum.scalar_as().value)) + - '"'; + Escape(std::string_view(*datum.scalar_as().value)) + '"'; case Type::BINARY: case Type::FIXED_SIZE_BINARY: @@ -163,8 +174,8 @@ std::string Expression::ToString() const { return binary(Comparison::GetOp(*cmp)); } - constexpr util::string_view kleene = "_kleene"; - if (util::string_view{call->function_name}.ends_with(kleene)) { + constexpr std::string_view kleene = "_kleene"; + if (EndsWith(call->function_name, kleene)) { auto op = call->function_name.substr(0, call->function_name.size() - kleene.size()); return binary(std::move(op)); } @@ -187,11 +198,11 @@ std::string Expression::ToString() const { if (call->options) { out += call->options->ToString(); - out.resize(out.size() + 1); - } else { - out.resize(out.size() - 1); + } else if (call->arguments.size()) { + out.resize(out.size() - 2); } - out.back() = ')'; + + out += ')'; return out; } @@ -309,13 +320,12 @@ bool Expression::IsNullLiteral() const { } 
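The DeclarationToTable / DeclarationToBatches / DeclarationToReader helpers declared in exec_plan.h above are the main new entry points in this change. A minimal usage sketch follows, assuming an in-memory source: the RunTinyPlan wrapper, schema, and batches are illustrative only, while MakeVectorGenerator and SourceNodeOptions are pre-existing Arrow utilities and only DeclarationToTable itself comes from this patch.

    // Hedged usage sketch of the new DeclarationToTable helper; the wrapper
    // function, schema, and batches are hypothetical.
    #include <memory>
    #include <optional>
    #include <utility>
    #include <vector>

    #include "arrow/compute/exec/exec_plan.h"
    #include "arrow/compute/exec/options.h"
    #include "arrow/table.h"
    #include "arrow/util/async_generator.h"

    namespace cp = arrow::compute;

    arrow::Result<std::shared_ptr<arrow::Table>> RunTinyPlan(
        std::shared_ptr<arrow::Schema> schema, std::vector<cp::ExecBatch> batches) {
      // A "source" node consumes an async generator of std::optional<ExecBatch>.
      std::vector<std::optional<cp::ExecBatch>> optional_batches;
      for (auto& batch : batches) optional_batches.emplace_back(std::move(batch));
      auto gen = arrow::MakeVectorGenerator(std::move(optional_batches));

      cp::Declaration source{
          "source", cp::SourceNodeOptions{std::move(schema), std::move(gen)}};
      // use_threads=false keeps all CPU work on the calling thread, per the
      // documentation added above; pass true to run on the CPU thread pool.
      return cp::DeclarationToTable(std::move(source), /*use_threads=*/false);
    }

The call blocks until the plan finishes and returns the collected table; DeclarationToTableAsync behaves the same way but returns a Future instead of blocking.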
namespace { -util::optional GetNullHandling( - const Expression::Call& call) { +std::optional GetNullHandling(const Expression::Call& call) { DCHECK_NE(call.function, nullptr); if (call.function->kind() == compute::Function::SCALAR) { return static_cast(call.kernel)->null_handling; } - return util::nullopt; + return std::nullopt; } } // namespace @@ -614,18 +624,6 @@ ArgumentsAndFlippedArguments(const Expression::Call& call) { call.arguments[0]}}; } -template ::value_type> -util::optional FoldLeft(It begin, It end, const BinOp& bin_op) { - if (begin == end) return util::nullopt; - - Out folded = std::move(*begin++); - while (begin != end) { - folded = bin_op(std::move(folded), std::move(*begin++)); - } - return folded; -} - } // namespace std::vector FieldsInExpression(const Expression& expr) { @@ -655,7 +653,11 @@ bool ExpressionHasFieldRefs(const Expression& expr) { } Result FoldConstants(Expression expr) { - return Modify( + if (!expr.IsBound()) { + return Status::Invalid("Cannot fold constants in unbound expression."); + } + + return ModifyExpression( std::move(expr), [](Expression expr) { return expr; }, [](Expression expr, ...) -> Result { auto call = CallNotNull(expr); @@ -738,18 +740,18 @@ std::vector GuaranteeConjunctionMembers( /// Recognizes expressions of the form: /// equal(a, 2) /// is_null(a) -util::optional> ExtractOneFieldValue( +std::optional> ExtractOneFieldValue( const Expression& guarantee) { auto call = guarantee.call(); - if (!call) return util::nullopt; + if (!call) return std::nullopt; // search for an equality conditions between a field and a literal if (call->function_name == "equal") { auto ref = call->arguments[0].field_ref(); - if (!ref) return util::nullopt; + if (!ref) return std::nullopt; auto lit = call->arguments[1].literal(); - if (!lit) return util::nullopt; + if (!lit) return std::nullopt; return std::make_pair(*ref, *lit); } @@ -757,12 +759,12 @@ util::optional> ExtractOneFieldValue( // ... or a known null field if (call->function_name == "is_null") { auto ref = call->arguments[0].field_ref(); - if (!ref) return util::nullopt; + if (!ref) return std::nullopt; return std::make_pair(*ref, Datum(std::make_shared())); } - return util::nullopt; + return std::nullopt; } // Conjunction members which are represented in known_values are erased from @@ -800,7 +802,7 @@ Result ReplaceFieldsWithKnownValues(const KnownFieldValues& known_va "ReplaceFieldsWithKnownValues called on an unbound Expression"); } - return Modify( + return ModifyExpression( std::move(expr), [&known_values](Expression expr) -> Result { if (auto ref = expr.field_ref()) { @@ -848,9 +850,34 @@ bool IsBinaryAssociativeCommutative(const Expression::Call& call) { return it != binary_associative_commutative.end(); } +Result HandleInconsistentTypes(Expression::Call call, + compute::ExecContext* exec_context) { + // ARROW-18334: due to reordering of arguments, the call may have + // inconsistent argument types. For example, the call's kernel may + // correspond to `timestamp + duration` but the arguments happen to + // be `duration, timestamp`. The addition itself is still commutative, + // but the mismatch in declared argument types is potentially problematic + // if we ever start using the Expression::Call::kernel field more than + // we do currently. Check and rebind if necessary. + // + // The more correct fix for this problem is to ensure that all kernels of + // functions which are commutative be commutative as well, which would + // obviate rebinding like this. 
In the context of ARROW-18334, this + // would require rewriting KernelSignature so that a single kernel can + // handle both `timestamp + duration` and `duration + timestamp`. + if (call.kernel->signature->MatchesInputs(GetTypes(call.arguments))) { + return Expression(std::move(call)); + } + return BindNonRecursive(std::move(call), /*insert_implicit_casts=*/false, exec_context); +} + } // namespace Result Canonicalize(Expression expr, compute::ExecContext* exec_context) { + if (!expr.IsBound()) { + return Status::Invalid("Cannot canonicalize an unbound expression."); + } + if (exec_context == nullptr) { compute::ExecContext exec_context; return Canonicalize(std::move(expr), &exec_context); @@ -871,7 +898,7 @@ Result Canonicalize(Expression expr, compute::ExecContext* exec_cont } } AlreadyCanonicalized; - return Modify( + return ModifyExpression( std::move(expr), [&AlreadyCanonicalized, exec_context](Expression expr) -> Result { auto call = expr.call(); @@ -893,9 +920,12 @@ Result Canonicalize(Expression expr, compute::ExecContext* exec_cont } CanonicalOrdering; FlattenedAssociativeChain chain(expr); + if (chain.was_left_folded && std::is_sorted(chain.fringe.begin(), chain.fringe.end(), CanonicalOrdering)) { + // fast path for expressions which happen to have arrived in an + // already-canonical form AlreadyCanonicalized.Add(std::move(chain.exprs)); return expr; } @@ -903,16 +933,17 @@ Result Canonicalize(Expression expr, compute::ExecContext* exec_cont std::stable_sort(chain.fringe.begin(), chain.fringe.end(), CanonicalOrdering); // fold the chain back up - auto folded = - FoldLeft(chain.fringe.begin(), chain.fringe.end(), - [call, &AlreadyCanonicalized](Expression l, Expression r) { - auto canonicalized_call = *call; - canonicalized_call.arguments = {std::move(l), std::move(r)}; - Expression expr(std::move(canonicalized_call)); - AlreadyCanonicalized.Add({expr}); - return expr; - }); - return std::move(*folded); + Expression folded = std::move(chain.fringe.front()); + + for (auto it = chain.fringe.begin() + 1; it != chain.fringe.end(); ++it) { + auto canonicalized_call = *call; + canonicalized_call.arguments = {std::move(folded), std::move(*it)}; + ARROW_ASSIGN_OR_RAISE( + folded, + HandleInconsistentTypes(std::move(canonicalized_call), exec_context)); + AlreadyCanonicalized.Add({expr}); + } + return folded; } if (auto cmp = Comparison::Get(call->function_name)) { @@ -953,24 +984,24 @@ struct Inequality { // possibly disjuncted with an "is_null" Expression. // cmp(a, 2) // cmp(a, 2) or is_null(a) - static util::optional ExtractOne(const Expression& guarantee) { + static std::optional ExtractOne(const Expression& guarantee) { auto call = guarantee.call(); - if (!call) return util::nullopt; + if (!call) return std::nullopt; if (call->function_name == "or_kleene") { // expect the LHS to be a usable field inequality auto out = ExtractOneFromComparison(call->arguments[0]); - if (!out) return util::nullopt; + if (!out) return std::nullopt; // expect the RHS to be an is_null expression auto call_rhs = call->arguments[1].call(); - if (!call_rhs) return util::nullopt; - if (call_rhs->function_name != "is_null") return util::nullopt; + if (!call_rhs) return std::nullopt; + if (call_rhs->function_name != "is_null") return std::nullopt; // ... 
and that it references the same target auto target = call_rhs->arguments[0].field_ref(); - if (!target) return util::nullopt; - if (*target != out->target) return util::nullopt; + if (!target) return std::nullopt; + if (*target != out->target) return std::nullopt; out->nullable = true; return out; @@ -980,26 +1011,25 @@ struct Inequality { return ExtractOneFromComparison(guarantee); } - static util::optional ExtractOneFromComparison( - const Expression& guarantee) { + static std::optional ExtractOneFromComparison(const Expression& guarantee) { auto call = guarantee.call(); - if (!call) return util::nullopt; + if (!call) return std::nullopt; if (auto cmp = Comparison::Get(call->function_name)) { // not_equal comparisons are not very usable as guarantees - if (*cmp == Comparison::NOT_EQUAL) return util::nullopt; + if (*cmp == Comparison::NOT_EQUAL) return std::nullopt; auto target = call->arguments[0].field_ref(); - if (!target) return util::nullopt; + if (!target) return std::nullopt; auto bound = call->arguments[1].literal(); - if (!bound) return util::nullopt; - if (!bound->is_scalar()) return util::nullopt; + if (!bound) return std::nullopt; + if (!bound->is_scalar()) return std::nullopt; return Inequality{*cmp, /*target=*/*target, *bound, /*nullable=*/false}; } - return util::nullopt; + return std::nullopt; } /// The given expression simplifies to `value` if the inequality @@ -1114,7 +1144,7 @@ Result SimplifyIsValidGuarantee(Expression expr, const Expression::Call& guarantee) { if (guarantee.function_name != "is_valid") return expr; - return Modify( + return ModifyExpression( std::move(expr), [](Expression expr) { return expr; }, [&](Expression expr, ...) -> Result { auto call = expr.call(); @@ -1156,7 +1186,7 @@ Result SimplifyWithGuarantee(Expression expr, if (auto inequality = Inequality::ExtractOne(guarantee)) { ARROW_ASSIGN_OR_RAISE(auto simplified, - Modify( + ModifyExpression( std::move(expr), [](Expression expr) { return expr; }, [&](Expression expr, ...) -> Result { return inequality->Simplify(std::move(expr)); @@ -1183,6 +1213,27 @@ Result SimplifyWithGuarantee(Expression expr, return expr; } +Result RemoveNamedRefs(Expression src) { + if (!src.IsBound()) { + return Status::Invalid("RemoveNamedRefs called on unbound expression"); + } + return ModifyExpression( + std::move(src), + /*pre=*/ + [](Expression expr) { + const Expression::Parameter* param = expr.parameter(); + if (param && !param->ref.IsFieldPath()) { + FieldPath ref_as_path( + std::vector(param->indices.begin(), param->indices.end())); + return Expression( + Expression::Parameter{std::move(ref_as_path), param->type, param->indices}); + } + + return expr; + }, + /*post_call=*/[](Expression expr, ...) { return expr; }); +} + // Serialization is accomplished by converting expressions to KeyValueMetadata and storing // this in the schema of a RecordBatch. Embedded arrays and scalars are stored in its // columns. Finally, the RecordBatch is written to an IPC file. 
@@ -1195,12 +1246,12 @@ Result> Serialize(const Expression& expr) { auto ret = columns_.size(); ARROW_ASSIGN_OR_RAISE(auto array, MakeArrayFromScalar(scalar, 1)); columns_.push_back(std::move(array)); - return std::to_string(ret); + return ToChars(ret); } Status VisitFieldRef(const FieldRef& ref) { if (ref.nested_refs()) { - metadata_->Append("nested_field_ref", std::to_string(ref.nested_refs()->size())); + metadata_->Append("nested_field_ref", ToChars(ref.nested_refs()->size())); for (const auto& child : *ref.nested_refs()) { RETURN_NOT_OK(VisitFieldRef(child)); } @@ -1407,12 +1458,13 @@ Expression and_(Expression lhs, Expression rhs) { } Expression and_(const std::vector& operands) { - auto folded = FoldLeft(operands.begin(), - operands.end(), and_); - if (folded) { - return std::move(*folded); + if (operands.empty()) return literal(true); + + Expression folded = operands.front(); + for (auto it = operands.begin() + 1; it != operands.end(); ++it) { + folded = and_(std::move(folded), std::move(*it)); } - return literal(true); + return folded; } Expression or_(Expression lhs, Expression rhs) { @@ -1420,12 +1472,13 @@ Expression or_(Expression lhs, Expression rhs) { } Expression or_(const std::vector& operands) { - auto folded = - FoldLeft(operands.begin(), operands.end(), or_); - if (folded) { - return std::move(*folded); + if (operands.empty()) return literal(false); + + Expression folded = operands.front(); + for (auto it = operands.begin() + 1; it != operands.end(); ++it) { + folded = or_(std::move(folded), std::move(*it)); } - return literal(false); + return folded; } Expression not_(Expression operand) { return call("invert", {std::move(operand)}); } diff --git a/cpp/src/arrow/compute/exec/expression.h b/cpp/src/arrow/compute/exec/expression.h index a872e799597..c9c7b0e605f 100644 --- a/cpp/src/arrow/compute/exec/expression.h +++ b/cpp/src/arrow/compute/exec/expression.h @@ -22,13 +22,13 @@ #include #include #include +#include #include #include "arrow/compute/type_fwd.h" #include "arrow/datum.h" #include "arrow/type_fwd.h" #include "arrow/util/small_vector.h" -#include "arrow/util/variant.h" namespace arrow { namespace compute { @@ -100,6 +100,8 @@ class ARROW_EXPORT Expression { // XXX someday // Result GetPipelines(); + bool is_valid() const { return impl_ != NULLPTR; } + /// Access a Call or return nullptr if this expression is not a call const Call* call() const; /// Access a Datum or return nullptr if this expression is not a literal @@ -127,17 +129,17 @@ class ARROW_EXPORT Expression { explicit Expression(Parameter parameter); private: - using Impl = util::Variant; + using Impl = std::variant; std::shared_ptr impl_; - ARROW_EXPORT friend bool Identical(const Expression& l, const Expression& r); - - ARROW_EXPORT friend void PrintTo(const Expression&, std::ostream*); + ARROW_FRIEND_EXPORT friend bool Identical(const Expression& l, const Expression& r); }; inline bool operator==(const Expression& l, const Expression& r) { return l.Equals(r); } inline bool operator!=(const Expression& l, const Expression& r) { return !l.Equals(r); } +ARROW_EXPORT void PrintTo(const Expression&, std::ostream*); + // Factories ARROW_EXPORT @@ -218,6 +220,12 @@ ARROW_EXPORT Result SimplifyWithGuarantee(Expression, const Expression& guaranteed_true_predicate); +/// Replace all named field refs (e.g. "x" or "x.y") with field paths (e.g. [0] or [1,3]) +/// +/// This isn't usually needed and does not offer any simplification by itself. 
However, +/// it can be useful to normalize an expression to paths to make it simpler to work with. +ARROW_EXPORT Result RemoveNamedRefs(Expression expression); + /// @} // Execution diff --git a/cpp/src/arrow/compute/exec/expression_benchmark.cc b/cpp/src/arrow/compute/exec/expression_benchmark.cc index debd2284980..e431497e45b 100644 --- a/cpp/src/arrow/compute/exec/expression_benchmark.cc +++ b/cpp/src/arrow/compute/exec/expression_benchmark.cc @@ -17,6 +17,8 @@ #include "benchmark/benchmark.h" +#include + #include "arrow/compute/cast.h" #include "arrow/compute/exec/expression.h" #include "arrow/compute/exec/test_util.h" diff --git a/cpp/src/arrow/compute/exec/expression_internal.h b/cpp/src/arrow/compute/exec/expression_internal.h index 027c954c6d0..9e29b8e27f9 100644 --- a/cpp/src/arrow/compute/exec/expression_internal.h +++ b/cpp/src/arrow/compute/exec/expression_internal.h @@ -287,52 +287,5 @@ inline Result> GetFunction( return GetCastFunction(*to_type); } -/// Modify an Expression with pre-order and post-order visitation. -/// `pre` will be invoked on each Expression. `pre` will visit Calls before their -/// arguments, `post_call` will visit Calls (and no other Expressions) after their -/// arguments. Visitors should return the Identical expression to indicate no change; this -/// will prevent unnecessary construction in the common case where a modification is not -/// possible/necessary/... -/// -/// If an argument was modified, `post_call` visits a reconstructed Call with the modified -/// arguments but also receives a pointer to the unmodified Expression as a second -/// argument. If no arguments were modified the unmodified Expression* will be nullptr. -template -Result Modify(Expression expr, const PreVisit& pre, - const PostVisitCall& post_call) { - ARROW_ASSIGN_OR_RAISE(expr, Result(pre(std::move(expr)))); - - auto call = expr.call(); - if (!call) return expr; - - bool at_least_one_modified = false; - std::vector modified_arguments; - - for (size_t i = 0; i < call->arguments.size(); ++i) { - ARROW_ASSIGN_OR_RAISE(auto modified_argument, - Modify(call->arguments[i], pre, post_call)); - - if (Identical(modified_argument, call->arguments[i])) { - continue; - } - - if (!at_least_one_modified) { - modified_arguments = call->arguments; - at_least_one_modified = true; - } - - modified_arguments[i] = std::move(modified_argument); - } - - if (at_least_one_modified) { - // reconstruct the call expression with the modified arguments - auto modified_call = *call; - modified_call.arguments = std::move(modified_arguments); - return post_call(Expression(std::move(modified_call)), &expr); - } - - return post_call(std::move(expr), nullptr); -} - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/expression_test.cc b/cpp/src/arrow/compute/exec/expression_test.cc index b4466d827eb..6dc48b3be4e 100644 --- a/cpp/src/arrow/compute/exec/expression_test.cc +++ b/cpp/src/arrow/compute/exec/expression_test.cc @@ -17,6 +17,7 @@ #include "arrow/compute/exec/expression.h" +#include #include #include #include @@ -30,11 +31,12 @@ #include "arrow/compute/function_internal.h" #include "arrow/compute/registry.h" #include "arrow/testing/gtest_util.h" -#include "arrow/util/make_unique.h" using testing::HasSubstr; using testing::UnorderedElementsAreArray; +using namespace std::chrono_literals; // NOLINT build/namespaces + namespace arrow { using internal::checked_cast; @@ -57,6 +59,7 @@ const std::shared_ptr kBoringSchema = schema({ field("dict_str", dictionary(int32(), 
utf8())), field("dict_i32", dictionary(int32(), int32())), field("ts_ns", timestamp(TimeUnit::NANO)), + field("ts_s", timestamp(TimeUnit::SECOND)), }); #define EXPECT_OK ARROW_EXPECT_OK @@ -70,6 +73,10 @@ Expression true_unless_null(Expression argument) { return call("true_unless_null", {std::move(argument)}); } +Expression add(Expression l, Expression r) { + return call("add", {std::move(l), std::move(r)}); +} + template void ExpectResultsEqual(Actual&& actual, Expected&& expected) { using MaybeActual = typename EnsureResult::type>::type; @@ -86,7 +93,7 @@ void ExpectResultsEqual(Actual&& actual, Expected&& expected) { } } -const auto no_change = util::nullopt; +const auto no_change = std::nullopt; TEST(ExpressionUtils, Comparison) { auto Expect = [](Result expected, Datum l, Datum r) { @@ -122,7 +129,7 @@ TEST(ExpressionUtils, Comparison) { } TEST(ExpressionUtils, StripOrderPreservingCasts) { - auto Expect = [](Expression expr, util::optional expected_stripped) { + auto Expect = [](Expression expr, std::optional expected_stripped) { ASSERT_OK_AND_ASSIGN(expr, expr.Bind(*kBoringSchema)); if (!expected_stripped) { expected_stripped = expr; @@ -242,7 +249,7 @@ class WidgetifyOptionsType : public FunctionOptionsType { } std::unique_ptr Copy(const FunctionOptions& options) const override { const auto& opts = static_cast(options); - return arrow::internal::make_unique(opts.really); + return std::make_unique(opts.really); } }; WidgetifyOptions::WidgetifyOptions(bool really) @@ -259,10 +266,10 @@ TEST(Expression, ToString) { EXPECT_EQ(literal(std::make_shared(Buffer::FromString("az"))).ToString(), "\"617A\""); - auto ts = *MakeScalar("1990-10-23 10:23:33")->CastTo(timestamp(TimeUnit::NANO)); + auto ts = *TimestampScalar::FromISO8601("1990-10-23 10:23:33", TimeUnit::NANO); EXPECT_EQ(literal(ts).ToString(), "1990-10-23 10:23:33.000000000"); - EXPECT_EQ(call("add", {literal(3), field_ref("beta")}).ToString(), "add(3, beta)"); + EXPECT_EQ(add(literal(3), field_ref("beta")).ToString(), "add(3, beta)"); auto in_12 = call("index_in", {field_ref("beta")}, compute::SetLookupOptions{ArrayFromJSON(int32(), "[1,2]")}); @@ -285,8 +292,7 @@ TEST(Expression, ToString) { "allow_time_overflow=false, allow_decimal_truncate=false, " "allow_float_truncate=false, allow_invalid_utf8=false})"); - // NB: corrupted for nullary functions but we don't have any of those - EXPECT_EQ(call("widgetify", {}).ToString(), "widgetif)"); + EXPECT_EQ(call("widgetify", {}).ToString(), "widgetify()"); EXPECT_EQ( call("widgetify", {literal(1)}, std::make_shared()).ToString(), "widgetify(1, widgetify)"); @@ -313,6 +319,11 @@ TEST(Expression, ToString) { }) .ToString(), "{a=a, renamed_a=a, three=3, b=" + in_12.ToString() + "}"); + + EXPECT_EQ(call("round", {literal(3.14)}, compute::RoundOptions()).ToString(), + "round(3.14, {ndigits=0, round_mode=HALF_TO_EVEN})"); + EXPECT_EQ(call("random", {}, compute::RandomOptions()).ToString(), + "random({initializer=SystemRandom, seed=0})"); } TEST(Expression, Equality) { @@ -323,20 +334,17 @@ TEST(Expression, Equality) { EXPECT_NE(field_ref("a"), field_ref("b")); EXPECT_NE(field_ref("a"), literal(2)); - EXPECT_EQ(call("add", {literal(3), field_ref("a")}), - call("add", {literal(3), field_ref("a")})); - EXPECT_NE(call("add", {literal(3), field_ref("a")}), - call("add", {literal(2), field_ref("a")})); - EXPECT_NE(call("add", {field_ref("a"), literal(3)}), - call("add", {literal(3), field_ref("a")})); + EXPECT_EQ(add(literal(3), field_ref("a")), add(literal(3), field_ref("a"))); + 
EXPECT_NE(add(literal(3), field_ref("a")), add(literal(2), field_ref("a"))); + EXPECT_NE(add(field_ref("a"), literal(3)), add(literal(3), field_ref("a"))); auto in_123 = compute::SetLookupOptions{ArrayFromJSON(int32(), "[1,2,3]")}; - EXPECT_EQ(call("add", {literal(3), call("index_in", {field_ref("beta")}, in_123)}), - call("add", {literal(3), call("index_in", {field_ref("beta")}, in_123)})); + EXPECT_EQ(add(literal(3), call("index_in", {field_ref("beta")}, in_123)), + add(literal(3), call("index_in", {field_ref("beta")}, in_123))); auto in_12 = compute::SetLookupOptions{ArrayFromJSON(int32(), "[1,2]")}; - EXPECT_NE(call("add", {literal(3), call("index_in", {field_ref("beta")}, in_12)}), - call("add", {literal(3), call("index_in", {field_ref("beta")}, in_123)})); + EXPECT_NE(add(literal(3), call("index_in", {field_ref("beta")}, in_12)), + add(literal(3), call("index_in", {field_ref("beta")}, in_123))); EXPECT_EQ(cast(field_ref("a"), int32()), cast(field_ref("a"), int32())); EXPECT_NE(cast(field_ref("a"), int32()), cast(field_ref("a"), int64())); @@ -467,7 +475,7 @@ TEST(Expression, FieldsInExpression) { TEST(Expression, ExpressionHasFieldRefs) { EXPECT_FALSE(ExpressionHasFieldRefs(literal(true))); - EXPECT_FALSE(ExpressionHasFieldRefs(call("add", {literal(1), literal(3)}))); + EXPECT_FALSE(ExpressionHasFieldRefs(add(literal(1), literal(3)))); EXPECT_TRUE(ExpressionHasFieldRefs(field_ref("a"))); @@ -499,7 +507,7 @@ TEST(Expression, BindLiteral) { } } -void ExpectBindsTo(Expression expr, util::optional expected, +void ExpectBindsTo(Expression expr, std::optional expected, Expression* bound_out = nullptr, const Schema& schema = *kBoringSchema) { if (!expected) { @@ -554,17 +562,16 @@ TEST(Expression, BindNestedFieldRef) { } TEST(Expression, BindCall) { - auto expr = call("add", {field_ref("i32"), field_ref("i32_req")}); + auto expr = add(field_ref("i32"), field_ref("i32_req")); EXPECT_FALSE(expr.IsBound()); ExpectBindsTo(expr, no_change, &expr); EXPECT_TRUE(expr.type()->Equals(*int32())); - ExpectBindsTo(call("add", {field_ref("f32"), literal(3)}), - call("add", {field_ref("f32"), literal(3.0F)})); + ExpectBindsTo(add(field_ref("f32"), literal(3)), add(field_ref("f32"), literal(3.0F))); - ExpectBindsTo(call("add", {field_ref("i32"), literal(3.5F)}), - call("add", {cast(field_ref("i32"), float32()), literal(3.5F)})); + ExpectBindsTo(add(field_ref("i32"), literal(3.5F)), + add(cast(field_ref("i32"), float32()), literal(3.5F))); } TEST(Expression, BindWithImplicitCasts) { @@ -598,10 +605,9 @@ TEST(Expression, BindWithImplicitCasts) { } TEST(Expression, BindNestedCall) { - auto expr = - call("add", {field_ref("a"), - call("subtract", {call("multiply", {field_ref("b"), field_ref("c")}), - field_ref("d")})}); + auto expr = add(field_ref("a"), + call("subtract", {call("multiply", {field_ref("b"), field_ref("c")}), + field_ref("d")})); EXPECT_FALSE(expr.IsBound()); ASSERT_OK_AND_ASSIGN(expr, @@ -745,7 +751,7 @@ void ExpectExecute(Expression expr, Datum in, Datum* actual_out = NULLPTR) { } TEST(Expression, ExecuteCall) { - ExpectExecute(call("add", {field_ref("a"), literal(3.5)}), + ExpectExecute(add(field_ref("a"), literal(3.5)), ArrayFromJSON(struct_({field("a", float64())}), R"([ {"a": 6.125}, {"a": 0.0}, @@ -753,7 +759,7 @@ TEST(Expression, ExecuteCall) { ])")); ExpectExecute( - call("add", {field_ref("a"), call("subtract", {literal(3.5), field_ref("b")})}), + add(field_ref("a"), call("subtract", {literal(3.5), field_ref("b")})), ArrayFromJSON(struct_({field("a", float64()), field("b", 
float64())}), R"([ {"a": 6.125, "b": 3.375}, {"a": 0.0, "b": 1}, @@ -768,20 +774,19 @@ TEST(Expression, ExecuteCall) { {"a": "12/11/1900"} ])")); - ExpectExecute(project({call("add", {field_ref("a"), literal(3.5)})}, {"a + 3.5"}), + ExpectExecute(project({add(field_ref("a"), literal(3.5))}, {"a + 3.5"}), ArrayFromJSON(struct_({field("a", float64())}), R"([ {"a": 6.125}, {"a": 0.0}, {"a": -1} ])")); - ExpectExecute( - call("add", {field_ref(FieldRef("a", "a")), field_ref(FieldRef("a", "b"))}), - ArrayFromJSON(struct_({field("a", struct_({ - field("a", float64()), - field("b", float64()), - }))}), - R"([ + ExpectExecute(add(field_ref(FieldRef("a", "a")), field_ref(FieldRef("a", "b"))), + ArrayFromJSON(struct_({field("a", struct_({ + field("a", float64()), + field("b", float64()), + }))}), + R"([ {"a": {"a": 6.125, "b": 3.375}}, {"a": {"a": 0.0, "b": 1}}, {"a": {"a": -1, "b": 4.75}} @@ -850,24 +855,30 @@ TEST(Expression, FoldConstants) { ExpectFoldsTo(field_ref("i32"), field_ref("i32")); // call against literals (3 + 2 == 5) - ExpectFoldsTo(call("add", {literal(3), literal(2)}), literal(5)); + ExpectFoldsTo(add(literal(3), literal(2)), literal(5)); + + ExpectFoldsTo(equal(literal(3), literal(3)), literal(true)); - ExpectFoldsTo(call("equal", {literal(3), literal(3)}), literal(true)); + // addition of durations folds as expected + ExpectFoldsTo(add(literal(5min), literal(5min)), literal(10min)); + + // addition of duration, timestamp folds as expected + auto ts = *TimestampScalar::FromISO8601("1990-10-23 10:23:33", TimeUnit::SECOND); + auto ts_two_hours_later = + *TimestampScalar::FromISO8601("1990-10-23 12:23:33", TimeUnit::SECOND); + ExpectFoldsTo(add(literal(2h), literal(ts)), literal(ts_two_hours_later)); + ExpectFoldsTo(add(literal(ts), literal(2h)), literal(ts_two_hours_later)); // call against literal and field_ref - ExpectFoldsTo(call("add", {literal(3), field_ref("i32")}), - call("add", {literal(3), field_ref("i32")})); + ExpectFoldsTo(add(literal(3), field_ref("i32")), add(literal(3), field_ref("i32"))); // nested call against literals ((8 - (2 * 3)) + 2 == 4) - ExpectFoldsTo(call("add", - { - call("subtract", - { - literal(8), - call("multiply", {literal(2), literal(3)}), - }), - literal(2), - }), + ExpectFoldsTo(add(call("subtract", + { + literal(8), + call("multiply", {literal(2), literal(3)}), + }), + literal(2)), literal(4)); // INTERSECTION null handling and null input -> null output @@ -877,40 +888,34 @@ TEST(Expression, FoldConstants) { // nested call against literals with one field_ref // (i32 - (2 * 3)) + 2 == (i32 - 6) + 2 // NB this could be improved further by using associativity of addition; another pass - ExpectFoldsTo(call("add", - { - call("subtract", - { - field_ref("i32"), - call("multiply", {literal(2), literal(3)}), - }), - literal(2), - }), - call("add", { - call("subtract", - { - field_ref("i32"), - literal(6), - }), - literal(2), - })); + ExpectFoldsTo(add(call("subtract", + { + field_ref("i32"), + call("multiply", {literal(2), literal(3)}), + }), + literal(2)), + add(call("subtract", + { + field_ref("i32"), + literal(6), + }), + literal(2))); compute::SetLookupOptions in_123(ArrayFromJSON(int32(), "[1,2,3]")); ExpectFoldsTo(call("is_in", {literal(2)}, in_123), literal(true)); ExpectFoldsTo( - call("is_in", - {call("add", {field_ref("i32"), call("multiply", {literal(2), literal(3)})})}, + call("is_in", {add(field_ref("i32"), call("multiply", {literal(2), literal(3)}))}, in_123), - call("is_in", {call("add", {field_ref("i32"), literal(6)})}, in_123)); + 
call("is_in", {add(field_ref("i32"), literal(6))}, in_123)); } TEST(Expression, FoldConstantsBoolean) { // test and_kleene/or_kleene-specific optimizations auto one = literal(1); auto two = literal(2); - auto whatever = equal(call("add", {one, field_ref("i32")}), two); + auto whatever = equal(add(one, field_ref("i32")), two); auto true_ = literal(true); auto false_ = literal(false); @@ -924,6 +929,24 @@ TEST(Expression, FoldConstantsBoolean) { ExpectFoldsTo(or_(whatever, whatever), whatever); } +void ExpectRemovesRefsTo(Expression expr, Expression expected, + const Schema& schema = *kBoringSchema) { + ASSERT_OK_AND_ASSIGN(expr, expr.Bind(schema)); + ASSERT_OK_AND_ASSIGN(expected, expected.Bind(schema)); + + ASSERT_OK_AND_ASSIGN(auto without_named_refs, RemoveNamedRefs(expr)); + + EXPECT_EQ(without_named_refs, expected); +} + +TEST(Expression, RemoveNamedRefs) { + ExpectRemovesRefsTo(field_ref("i32"), field_ref(2)); + ExpectRemovesRefsTo(call("add", {literal(4), field_ref("i32")}), + call("add", {literal(4), field_ref(2)})); + auto nested_schema = Schema({field("a", struct_({field("b", int32())}))}); + ExpectRemovesRefsTo(field_ref({"a", "b"}), field_ref({0, 0}), nested_schema); +} + TEST(Expression, ExtractKnownFieldValues) { struct { void operator()(Expression guarantee, @@ -1006,24 +1029,19 @@ TEST(Expression, ReplaceFieldsWithKnownValues) { DictionaryScalar::Make(MakeScalar(0), ArrayFromJSON(utf8(), R"(["3"])"))}; ExpectReplacesTo(field_ref("dict_str"), {{"dict_str", dict_str}}, literal(dict_str)); - ExpectReplacesTo(call("add", - { - call("subtract", - { - field_ref("i32"), - call("multiply", {literal(2), literal(3)}), - }), - literal(2), - }), + ExpectReplacesTo(add(call("subtract", + { + field_ref("i32"), + call("multiply", {literal(2), literal(3)}), + }), + literal(2)), i32_is_3, - call("add", { - call("subtract", - { - literal(3), - call("multiply", {literal(2), literal(3)}), - }), - literal(2), - })); + add(call("subtract", + { + literal(3), + call("multiply", {literal(2), literal(3)}), + }), + literal(2))); std::unordered_map i32_valid_str_null{ {"i32", Datum(3)}, {"str", MakeNullScalar(utf8())}}; @@ -1097,6 +1115,13 @@ TEST(Expression, CanonicalizeAnd) { ExpectCanonicalizesTo(is_valid(and_(b, true_)), is_valid(and_(true_, b))); } +TEST(Expression, CanonicalizeAdd) { + auto ts = field_ref("ts_s"); + ExpectCanonicalizesTo(add(ts, literal(5min)), add(literal(5min), ts)); + ExpectCanonicalizesTo(add(add(ts, literal(5min)), add(literal(5min), literal(5min))), + add(add(add(literal(5min), literal(5min)), literal(5min)), ts)); +} + TEST(Expression, CanonicalizeComparison) { ExpectCanonicalizesTo(equal(literal(1), field_ref("i32")), equal(field_ref("i32"), literal(1))); @@ -1141,7 +1166,7 @@ TEST(Expression, SingleComparisonGuarantees) { // i32 is guaranteed equal to 3, so the projection can just materialize that constant // and need not incur IO - Simplify{project({call("add", {i32, literal(1)})}, {"i32 + 1"})} + Simplify{project({add(i32, literal(1))}, {"i32 + 1"})} .WithGuarantee(equal(i32, literal(3))) .Expect(literal( std::make_shared(ScalarVector{std::make_shared(4)}, @@ -1357,6 +1382,10 @@ TEST(Expression, SimplifyWithValidityGuarantee) { .WithGuarantee(is_null(field_ref("i32"))) .Expect(literal(false)); + Simplify{{true_unless_null(field_ref("i32"))}} + .WithGuarantee(is_null(field_ref("i32"))) + .Expect(null_literal(boolean())); + Simplify{is_valid(field_ref("i32"))} .WithGuarantee(is_valid(field_ref("i32"))) .Expect(literal(true)); @@ -1372,6 +1401,21 @@ TEST(Expression, 
SimplifyWithValidityGuarantee) { Simplify{true_unless_null(field_ref("i32"))} .WithGuarantee(is_valid(field_ref("i32"))) .Expect(literal(true)); + + Simplify{{equal(field_ref("i32"), literal(7))}} + .WithGuarantee(is_null(field_ref("i32"))) + .Expect(null_literal(boolean())); + + auto i32_is_2_or_null = + or_(equal(field_ref("i32"), literal(2)), is_null(field_ref("i32"))); + + Simplify{i32_is_2_or_null} + .WithGuarantee(is_null(field_ref("i32"))) + .Expect(literal(true)); + + Simplify{{greater(field_ref("i32"), literal(7))}} + .WithGuarantee(is_null(field_ref("i32"))) + .Expect(null_literal(boolean())); } TEST(Expression, SimplifyWithComparisonAndNullableCaveat) { diff --git a/cpp/src/arrow/compute/exec/filter_benchmark.cc b/cpp/src/arrow/compute/exec/filter_benchmark.cc index 64cf307580b..aa8e3e8b77d 100644 --- a/cpp/src/arrow/compute/exec/filter_benchmark.cc +++ b/cpp/src/arrow/compute/exec/filter_benchmark.cc @@ -76,23 +76,20 @@ static void FilterOverhead(benchmark::State& state, std::vector expr arrow::compute::BatchesWithSchema data = MakeRandomBatchesWithNullProbability( schema({field("i64", int64()), field("bool", boolean())}), num_batches, batch_size, null_prob, bool_true_probability); - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); std::vector filter_node_dec; for (Expression expr : expr_vector) { filter_node_dec.push_back({"filter", FilterNodeOptions(expr)}); } - ASSERT_OK( - BenchmarkNodeOverhead(state, ctx, num_batches, batch_size, data, filter_node_dec)); + ASSERT_OK(BenchmarkNodeOverhead(state, num_batches, batch_size, data, filter_node_dec)); } static void FilterOverheadIsolated(benchmark::State& state, Expression expr) { - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); const int32_t batch_size = static_cast(state.range(0)); const int32_t num_batches = kTotalBatchSize / batch_size; arrow::compute::BatchesWithSchema data = MakeRandomBatches( schema({field("i64", int64()), field("bool", boolean())}), num_batches, batch_size); FilterNodeOptions options = FilterNodeOptions{expr}; - ASSERT_OK(BenchmarkIsolatedNodeOverhead(state, ctx, expr, num_batches, batch_size, data, + ASSERT_OK(BenchmarkIsolatedNodeOverhead(state, expr, num_batches, batch_size, data, "filter", options)); } diff --git a/cpp/src/arrow/compute/exec/filter_node.cc b/cpp/src/arrow/compute/exec/filter_node.cc index b424da35f85..8274453b6c7 100644 --- a/cpp/src/arrow/compute/exec/filter_node.cc +++ b/cpp/src/arrow/compute/exec/filter_node.cc @@ -19,7 +19,9 @@ #include "arrow/compute/exec.h" #include "arrow/compute/exec/exec_plan.h" #include "arrow/compute/exec/expression.h" +#include "arrow/compute/exec/map_node.h" #include "arrow/compute/exec/options.h" +#include "arrow/compute/exec/query_context.h" #include "arrow/datum.h" #include "arrow/result.h" #include "arrow/util/checked_cast.h" @@ -37,8 +39,8 @@ namespace { class FilterNode : public MapNode { public: FilterNode(ExecPlan* plan, std::vector inputs, - std::shared_ptr output_schema, Expression filter, bool async_mode) - : MapNode(plan, std::move(inputs), std::move(output_schema), async_mode), + std::shared_ptr output_schema, Expression filter) + : MapNode(plan, std::move(inputs), std::move(output_schema)), filter_(std::move(filter)) {} static Result Make(ExecPlan* plan, std::vector inputs, @@ -50,8 +52,9 @@ class FilterNode : public MapNode { auto filter_expression = filter_options.filter_expression; if (!filter_expression.IsBound()) { - ARROW_ASSIGN_OR_RAISE(filter_expression, - 
filter_expression.Bind(*schema, plan->exec_context())); + ARROW_ASSIGN_OR_RAISE( + filter_expression, + filter_expression.Bind(*schema, plan->query_context()->exec_context())); } if (filter_expression.type()->id() != Type::BOOL) { @@ -60,8 +63,7 @@ class FilterNode : public MapNode { filter_expression.type()->ToString()); } return plan->EmplaceNode(plan, std::move(inputs), std::move(schema), - std::move(filter_expression), - filter_options.async_mode); + std::move(filter_expression)); } const char* kind_name() const override { return "FilterNode"; } @@ -76,8 +78,9 @@ class FilterNode : public MapNode { {"filter.expression.simplified", simplified_filter.ToString()}, {"filter.length", target.length}}); - ARROW_ASSIGN_OR_RAISE(Datum mask, ExecuteScalarExpression(simplified_filter, target, - plan()->exec_context())); + ARROW_ASSIGN_OR_RAISE( + Datum mask, ExecuteScalarExpression(simplified_filter, target, + plan()->query_context()->exec_context())); if (mask.is_scalar()) { const auto& mask_scalar = mask.scalar_as(); diff --git a/cpp/src/arrow/compute/exec/hash_join.cc b/cpp/src/arrow/compute/exec/hash_join.cc index 5cf66b3d09e..ffd93591e65 100644 --- a/cpp/src/arrow/compute/exec/hash_join.cc +++ b/cpp/src/arrow/compute/exec/hash_join.cc @@ -26,7 +26,6 @@ #include #include "arrow/compute/exec/hash_join_dict.h" -#include "arrow/compute/exec/key_hash.h" #include "arrow/compute/exec/task_util.h" #include "arrow/compute/kernels/row_encoder.h" #include "arrow/compute/row/encode_internal.h" @@ -40,7 +39,7 @@ class HashJoinBasicImpl : public HashJoinImpl { struct ThreadLocalState; public: - Status Init(ExecContext* ctx, JoinType join_type, size_t num_threads, + Status Init(QueryContext* ctx, JoinType join_type, size_t num_threads, const HashJoinProjectionMaps* proj_map_left, const HashJoinProjectionMaps* proj_map_right, std::vector key_cmp, Expression filter, @@ -99,7 +98,7 @@ class HashJoinBasicImpl : public HashJoinImpl { for (int icol = 0; icol < num_cols; ++icol) { data_types[icol] = schema_[side]->data_type(projection_handle, icol); } - encoder->Init(data_types, ctx_); + encoder->Init(data_types, ctx_->exec_context()); encoder->Clear(); } @@ -297,8 +296,8 @@ class HashJoinBasicImpl : public HashJoinImpl { AppendFields(left_to_key, left_to_pay, left_key, left_payload); AppendFields(right_to_key, right_to_pay, right_key, right_payload); - ARROW_ASSIGN_OR_RAISE(Datum mask, - ExecuteScalarExpression(filter_, concatenated, ctx_)); + ARROW_ASSIGN_OR_RAISE( + Datum mask, ExecuteScalarExpression(filter_, concatenated, ctx_->exec_context())); size_t num_probed_rows = match.size() + no_match.size(); if (mask.is_scalar()) { @@ -398,7 +397,8 @@ class HashJoinBasicImpl : public HashJoinImpl { ARROW_ASSIGN_OR_RAISE(right_key, hash_table_keys_.Decode(batch_size_next, opt_right_ids)); // Post process build side keys that use dictionary - RETURN_NOT_OK(dict_build_.PostDecode(*schema_[1], &right_key, ctx_)); + RETURN_NOT_OK( + dict_build_.PostDecode(*schema_[1], &right_key, ctx_->exec_context())); } if (has_right_payload) { ARROW_ASSIGN_OR_RAISE(right_payload, @@ -510,13 +510,13 @@ class HashJoinBasicImpl : public HashJoinImpl { local_state.match_left.clear(); local_state.match_right.clear(); - bool use_key_batch_for_dicts = - dict_probe_.BatchRemapNeeded(thread_index, *schema_[0], *schema_[1], ctx_); + bool use_key_batch_for_dicts = dict_probe_.BatchRemapNeeded( + thread_index, *schema_[0], *schema_[1], ctx_->exec_context()); RowEncoder* row_encoder_for_lookups = &local_state.exec_batch_keys; if 
(use_key_batch_for_dicts) { - RETURN_NOT_OK(dict_probe_.EncodeBatch(thread_index, *schema_[0], *schema_[1], - dict_build_, batch, &row_encoder_for_lookups, - &batch_key_for_lookups, ctx_)); + RETURN_NOT_OK(dict_probe_.EncodeBatch( + thread_index, *schema_[0], *schema_[1], dict_build_, batch, + &row_encoder_for_lookups, &batch_key_for_lookups, ctx_->exec_context())); } // Collect information about all nulls in key columns. @@ -561,7 +561,7 @@ class HashJoinBasicImpl : public HashJoinImpl { Status BuildHashTable_exec_task(size_t thread_index, int64_t /*task_id*/) { AccumulationQueue batches = std::move(build_batches_); - dict_build_.InitEncoder(*schema_[1], &hash_table_keys_, ctx_); + dict_build_.InitEncoder(*schema_[1], &hash_table_keys_, ctx_->exec_context()); bool has_payload = (schema_[1]->num_cols(HashJoinProjection::PAYLOAD) > 0); if (has_payload) { InitEncoder(1, HashJoinProjection::PAYLOAD, &hash_table_payloads_); @@ -578,11 +578,11 @@ class HashJoinBasicImpl : public HashJoinImpl { } else if (hash_table_empty_) { hash_table_empty_ = false; - RETURN_NOT_OK(dict_build_.Init(*schema_[1], &batch, ctx_)); + RETURN_NOT_OK(dict_build_.Init(*schema_[1], &batch, ctx_->exec_context())); } int32_t num_rows_before = hash_table_keys_.num_rows(); RETURN_NOT_OK(dict_build_.EncodeBatch(thread_index, *schema_[1], batch, - &hash_table_keys_, ctx_)); + &hash_table_keys_, ctx_->exec_context())); if (has_payload) { RETURN_NOT_OK( EncodeBatch(1, HashJoinProjection::PAYLOAD, &hash_table_payloads_, batch)); @@ -594,7 +594,7 @@ class HashJoinBasicImpl : public HashJoinImpl { } if (hash_table_empty_) { - RETURN_NOT_OK(dict_build_.Init(*schema_[1], nullptr, ctx_)); + RETURN_NOT_OK(dict_build_.Init(*schema_[1], nullptr, ctx_->exec_context())); } return Status::OK(); @@ -735,7 +735,7 @@ class HashJoinBasicImpl : public HashJoinImpl { // Metadata // - ExecContext* ctx_; + QueryContext* ctx_; JoinType join_type_; size_t num_threads_; const HashJoinProjectionMaps* schema_[2]; diff --git a/cpp/src/arrow/compute/exec/hash_join.h b/cpp/src/arrow/compute/exec/hash_join.h index 0c5e43467e9..bc053b2f1b6 100644 --- a/cpp/src/arrow/compute/exec/hash_join.h +++ b/cpp/src/arrow/compute/exec/hash_join.h @@ -24,6 +24,7 @@ #include "arrow/compute/exec/accumulation_queue.h" #include "arrow/compute/exec/bloom_filter.h" #include "arrow/compute/exec/options.h" +#include "arrow/compute/exec/query_context.h" #include "arrow/compute/exec/schema_util.h" #include "arrow/compute/exec/task_util.h" #include "arrow/result.h" @@ -47,7 +48,7 @@ class HashJoinImpl { using AbortContinuationImpl = std::function; virtual ~HashJoinImpl() = default; - virtual Status Init(ExecContext* ctx, JoinType join_type, size_t num_threads, + virtual Status Init(QueryContext* ctx, JoinType join_type, size_t num_threads, const HashJoinProjectionMaps* proj_map_left, const HashJoinProjectionMaps* proj_map_right, std::vector key_cmp, Expression filter, diff --git a/cpp/src/arrow/compute/exec/hash_join_benchmark.cc b/cpp/src/arrow/compute/exec/hash_join_benchmark.cc index 94201a849fa..cc85251f8c1 100644 --- a/cpp/src/arrow/compute/exec/hash_join_benchmark.cc +++ b/cpp/src/arrow/compute/exec/hash_join_benchmark.cc @@ -25,7 +25,6 @@ #include "arrow/compute/exec/util.h" #include "arrow/compute/kernels/row_encoder.h" #include "arrow/testing/random.h" -#include "arrow/util/make_unique.h" #include "arrow/util/thread_pool.h" #include @@ -126,10 +125,7 @@ class JoinBenchmark { stats_.num_probe_rows = settings.num_probe_batches * settings.batch_size; - ctx_ = 
arrow::internal::make_unique(default_memory_pool(), - arrow::internal::GetCpuThreadPool()); - - schema_mgr_ = arrow::internal::make_unique(); + schema_mgr_ = std::make_unique(); Expression filter = literal(true); DCHECK_OK(schema_mgr_->Init(settings.join_type, *l_batches_with_schema.schema, left_keys, *r_batches_with_schema.schema, right_keys, @@ -149,6 +145,7 @@ class JoinBenchmark { }; scheduler_ = TaskScheduler::Make(); + DCHECK_OK(ctx_.Init(settings.num_threads, nullptr)); auto register_task_group_callback = [&](std::function task, std::function cont) { @@ -160,11 +157,10 @@ class JoinBenchmark { }; DCHECK_OK(join_->Init( - ctx_.get(), settings.join_type, settings.num_threads, - &(schema_mgr_->proj_maps[0]), &(schema_mgr_->proj_maps[1]), std::move(key_cmp), - std::move(filter), std::move(register_task_group_callback), - std::move(start_task_group_callback), [](int64_t, ExecBatch) {}, - [](int64_t x) {})); + &ctx_, settings.join_type, settings.num_threads, &(schema_mgr_->proj_maps[0]), + &(schema_mgr_->proj_maps[1]), std::move(key_cmp), std::move(filter), + std::move(register_task_group_callback), std::move(start_task_group_callback), + [](int64_t, ExecBatch) {}, [](int64_t x) {})); task_group_probe_ = scheduler_->RegisterTaskGroup( [this](size_t thread_index, int64_t task_id) -> Status { @@ -200,7 +196,7 @@ class JoinBenchmark { AccumulationQueue r_batches_; std::unique_ptr schema_mgr_; std::unique_ptr join_; - std::unique_ptr ctx_; + QueryContext ctx_; int task_group_probe_; struct { diff --git a/cpp/src/arrow/compute/exec/hash_join_dict.cc b/cpp/src/arrow/compute/exec/hash_join_dict.cc index 560b0ea8d4d..4ce89446d3c 100644 --- a/cpp/src/arrow/compute/exec/hash_join_dict.cc +++ b/cpp/src/arrow/compute/exec/hash_join_dict.cc @@ -127,7 +127,7 @@ static Result> ConvertImp( } else { const auto& scalar = input.scalar_as(); if (scalar.is_valid) { - const util::string_view data = scalar.view(); + const std::string_view data = scalar.view(); DCHECK_EQ(data.size(), sizeof(FROM)); const FROM from = *reinterpret_cast(data.data()); const TO to_value = static_cast(from); diff --git a/cpp/src/arrow/compute/exec/hash_join_node.cc b/cpp/src/arrow/compute/exec/hash_join_node.cc index 44667b9f283..37bdb82517a 100644 --- a/cpp/src/arrow/compute/exec/hash_join_node.cc +++ b/cpp/src/arrow/compute/exec/hash_join_node.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+#include #include #include #include @@ -29,7 +30,6 @@ #include "arrow/compute/exec/util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/future.h" -#include "arrow/util/make_unique.h" #include "arrow/util/thread_pool.h" #include "arrow/util/tracing_internal.h" @@ -45,6 +45,9 @@ bool HashJoinSchema::IsTypeSupported(const DataType& type) { if (id == Type::DICTIONARY) { return IsTypeSupported(*checked_cast(type).value_type()); } + if (id == Type::EXTENSION) { + return IsTypeSupported(*checked_cast(type).storage_type()); + } return is_fixed_width(id) || is_binary_like(id) || is_large_binary_like(id); } @@ -487,7 +490,7 @@ struct BloomFilterPushdownContext { std::function, std::function)>; using StartTaskGroupCallback = std::function; using BuildFinishedCallback = std::function; - using FiltersReceivedCallback = std::function; + using FiltersReceivedCallback = std::function; using FilterFinishedCallback = std::function; void Init(HashJoinNode* owner, size_t num_threads, RegisterTaskGroupCallback register_task_group_callback, @@ -495,7 +498,7 @@ struct BloomFilterPushdownContext { FiltersReceivedCallback on_bloom_filters_received, bool disable_bloom_filter, bool use_sync_execution); - Status StartProducing(); + Status StartProducing(size_t thread_index); void ExpectBloomFilter() { eval_.num_expected_bloom_filters_ += 1; } @@ -505,10 +508,11 @@ struct BloomFilterPushdownContext { BuildFinishedCallback on_finished); // Sends the Bloom filter to the pushdown target. - Status PushBloomFilter(); + Status PushBloomFilter(size_t thread_index); // Receives a Bloom filter and its associated column map. - Status ReceiveBloomFilter(std::unique_ptr filter, + Status ReceiveBloomFilter(size_t thread_index, + std::unique_ptr filter, std::vector column_map) { bool proceed; { @@ -521,7 +525,7 @@ struct BloomFilterPushdownContext { ARROW_DCHECK_LE(eval_.received_filters_.size(), eval_.num_expected_bloom_filters_); } if (proceed) { - return eval_.all_received_callback_(); + return eval_.all_received_callback_(thread_index); } return Status::OK(); } @@ -550,7 +554,8 @@ struct BloomFilterPushdownContext { std::vector hashes(batch.length); std::vector bv(bit_vector_bytes); - ARROW_ASSIGN_OR_RAISE(util::TempVectorStack * stack, GetStack(thread_index)); + ARROW_ASSIGN_OR_RAISE(util::TempVectorStack * stack, + ctx_->GetTempStack(thread_index)); // Start with full selection for the current batch memset(selected.data(), 0xff, bit_vector_bytes); @@ -576,8 +581,7 @@ struct BloomFilterPushdownContext { arrow::internal::BitmapAnd(bv.data(), 0, selected.data(), 0, key_batch.length, 0, selected.data()); } - auto selected_buffer = - arrow::internal::make_unique(selected.data(), bit_vector_bytes); + auto selected_buffer = std::make_unique(selected.data(), bit_vector_bytes); ArrayData selected_arraydata(boolean(), batch.length, {nullptr, std::move(selected_buffer)}); Datum selected_datum(selected_arraydata); @@ -585,8 +589,8 @@ struct BloomFilterPushdownContext { size_t first_nonscalar = batch.values.size(); for (size_t i = 0; i < batch.values.size(); i++) { if (!batch.values[i].is_scalar()) { - ARROW_ASSIGN_OR_RAISE(batch.values[i], - Filter(batch.values[i], selected_datum, options, ctx_)); + ARROW_ASSIGN_OR_RAISE(batch.values[i], Filter(batch.values[i], selected_datum, + options, ctx_->exec_context())); first_nonscalar = std::min(first_nonscalar, i); ARROW_DCHECK_EQ(batch.values[i].length(), batch.values[first_nonscalar].length()); } @@ -617,25 +621,10 @@ struct BloomFilterPushdownContext { // the 
disable_bloom_filter_ flag. std::pair> GetPushdownTarget(HashJoinNode* start); - Result GetStack(size_t thread_index) { - if (!tld_[thread_index].is_init) { - RETURN_NOT_OK(tld_[thread_index].stack.Init( - ctx_->memory_pool(), 4 * util::MiniBatch::kMiniBatchLength * sizeof(uint32_t))); - tld_[thread_index].is_init = true; - } - return &tld_[thread_index].stack; - } - StartTaskGroupCallback start_task_group_callback_; bool disable_bloom_filter_; HashJoinSchema* schema_mgr_; - ExecContext* ctx_; - - struct ThreadLocalData { - bool is_init = false; - util::TempVectorStack stack; - }; - std::vector tld_; + QueryContext* ctx_; struct { int task_id_; @@ -712,8 +701,7 @@ class HashJoinNode : public ExecNode { // Number of input exec nodes must be 2 RETURN_NOT_OK(ValidateExecNodeInputs(plan, inputs, 2, "HashJoinNode")); - std::unique_ptr schema_mgr = - ::arrow::internal::make_unique(); + std::unique_ptr schema_mgr = std::make_unique(); const auto& join_options = checked_cast(options); RETURN_NOT_OK(ValidateHashJoinNodeOptions(join_options)); @@ -735,9 +723,10 @@ class HashJoinNode : public ExecNode { join_options.output_suffix_for_left, join_options.output_suffix_for_right)); } - ARROW_ASSIGN_OR_RAISE(Expression filter, - schema_mgr->BindFilter(join_options.filter, left_schema, - right_schema, plan->exec_context())); + ARROW_ASSIGN_OR_RAISE( + Expression filter, + schema_mgr->BindFilter(join_options.filter, left_schema, right_schema, + plan->query_context()->exec_context())); // Generate output schema std::shared_ptr output_schema = schema_mgr->MakeOutputSchema( @@ -785,7 +774,7 @@ class HashJoinNode : public ExecNode { } Status OnBloomFilterFinished(size_t thread_index, AccumulationQueue batches) { - RETURN_NOT_OK(pushdown_context_.PushBloomFilter()); + RETURN_NOT_OK(pushdown_context_.PushBloomFilter(thread_index)); return impl_->BuildHashTable( thread_index, std::move(batches), [this](size_t thread_index) { return OnHashTableFinished(thread_index); }); @@ -836,10 +825,9 @@ class HashJoinNode : public ExecNode { return Status::OK(); } - Status OnFiltersReceived() { + Status OnFiltersReceived(size_t thread_index) { std::unique_lock guard(probe_side_mutex_); bloom_filters_ready_ = true; - size_t thread_index = plan_->GetThreadIndex(); AccumulationQueue batches = std::move(probe_accumulator_); guard.unlock(); return pushdown_context_.FilterBatches( @@ -868,8 +856,8 @@ class HashJoinNode : public ExecNode { std::lock_guard guard(probe_side_mutex_); queued_batches_to_probe_ = std::move(probe_accumulator_); } - return plan_->StartTaskGroup(task_group_probe_, - queued_batches_to_probe_.batch_count()); + return plan_->query_context()->StartTaskGroup(task_group_probe_, + queued_batches_to_probe_.batch_count()); } Status OnQueuedBatchesProbed(size_t thread_index) { @@ -890,7 +878,7 @@ class HashJoinNode : public ExecNode { return; } - size_t thread_index = plan_->GetThreadIndex(); + size_t thread_index = plan_->query_context()->GetThreadIndex(); int side = (input == inputs_[0]) ? 0 : 1; EVENT(span_, "InputReceived", {{"batch.length", batch.length}, {"side", side}}); @@ -928,7 +916,7 @@ class HashJoinNode : public ExecNode { void InputFinished(ExecNode* input, int total_batches) override { ARROW_DCHECK(std::find(inputs_.begin(), inputs_.end(), input) != inputs_.end()); - size_t thread_index = plan_->GetThreadIndex(); + size_t thread_index = plan_->query_context()->GetThreadIndex(); int side = (input == inputs_[0]) ? 
0 : 1; EVENT(span_, "InputFinished", {{"side", side}, {"batches.length", total_batches}}); @@ -946,13 +934,14 @@ class HashJoinNode : public ExecNode { } Status Init() override { - RETURN_NOT_OK(ExecNode::Init()); - if (plan_->UseLegacyBatching()) { + QueryContext* ctx = plan_->query_context(); + if (ctx->options().use_legacy_batching) { return Status::Invalid( "The plan was configured to use legacy batching but contained a join node " "which is incompatible with legacy batching"); } - bool use_sync_execution = !(plan_->exec_context()->executor()); + + bool use_sync_execution = ctx->executor()->GetCapacity() == 1; // TODO(ARROW-15732) // Each side of join might have an IO thread being called from. Once this is fixed // we will change it back to just the CPU's thread pool capacity. @@ -960,32 +949,32 @@ class HashJoinNode : public ExecNode { pushdown_context_.Init( this, num_threads, - [this](std::function fn, - std::function on_finished) { - return plan_->RegisterTaskGroup(std::move(fn), std::move(on_finished)); + [ctx](std::function fn, + std::function on_finished) { + return ctx->RegisterTaskGroup(std::move(fn), std::move(on_finished)); }, - [this](int task_group_id, int64_t num_tasks) { - return plan_->StartTaskGroup(task_group_id, num_tasks); + [ctx](int task_group_id, int64_t num_tasks) { + return ctx->StartTaskGroup(task_group_id, num_tasks); }, - [this]() { return OnFiltersReceived(); }, disable_bloom_filter_, - use_sync_execution); + [this](size_t thread_index) { return OnFiltersReceived(thread_index); }, + disable_bloom_filter_, use_sync_execution); RETURN_NOT_OK(impl_->Init( - plan_->exec_context(), join_type_, num_threads, &(schema_mgr_->proj_maps[0]), + ctx, join_type_, num_threads, &(schema_mgr_->proj_maps[0]), &(schema_mgr_->proj_maps[1]), key_cmp_, filter_, - [this](std::function fn, - std::function on_finished) { - return plan_->RegisterTaskGroup(std::move(fn), std::move(on_finished)); + [ctx](std::function fn, + std::function on_finished) { + return ctx->RegisterTaskGroup(std::move(fn), std::move(on_finished)); }, - [this](int task_group_id, int64_t num_tasks) { - return plan_->StartTaskGroup(task_group_id, num_tasks); + [ctx](int task_group_id, int64_t num_tasks) { + return ctx->StartTaskGroup(task_group_id, num_tasks); }, [this](int64_t, ExecBatch batch) { this->OutputBatchCallback(batch); }, [this](int64_t total_num_batches) { this->FinishedCallback(total_num_batches); })); - task_group_probe_ = plan_->RegisterTaskGroup( + task_group_probe_ = ctx->RegisterTaskGroup( [this](size_t thread_index, int64_t task_id) -> Status { return impl_->ProbeSingleBatch(thread_index, std::move(queued_batches_to_probe_[task_id])); @@ -1003,7 +992,8 @@ class HashJoinNode : public ExecNode { {"node.detail", ToString()}, {"node.kind", kind_name()}}); END_SPAN_ON_FUTURE_COMPLETION(span_, finished_); - RETURN_NOT_OK(pushdown_context_.StartProducing()); + RETURN_NOT_OK( + pushdown_context_.StartProducing(plan_->query_context()->GetThreadIndex())); return Status::OK(); } @@ -1017,16 +1007,15 @@ class HashJoinNode : public ExecNode { void StopProducing(ExecNode* output) override { DCHECK_EQ(output, outputs_[0]); - StopProducing(); + for (auto&& input : inputs_) { + input->StopProducing(this); + } } void StopProducing() override { EVENT(span_, "StopProducing"); bool expected = false; if (complete_.compare_exchange_strong(expected, true)) { - for (auto&& input : inputs_) { - input->StopProducing(this); - } impl_->Abort([this]() { finished_.MarkFinished(); }); } } @@ -1083,14 +1072,13 @@ void 
BloomFilterPushdownContext::Init( FiltersReceivedCallback on_bloom_filters_received, bool disable_bloom_filter, bool use_sync_execution) { schema_mgr_ = owner->schema_mgr_.get(); - ctx_ = owner->plan_->exec_context(); - tld_.resize(num_threads); + ctx_ = owner->plan_->query_context(); disable_bloom_filter_ = disable_bloom_filter; std::tie(push_.pushdown_target_, push_.column_map_) = GetPushdownTarget(owner); eval_.all_received_callback_ = std::move(on_bloom_filters_received); if (!disable_bloom_filter_) { ARROW_CHECK(push_.pushdown_target_); - push_.bloom_filter_ = arrow::internal::make_unique(); + push_.bloom_filter_ = std::make_unique(); push_.pushdown_target_->pushdown_context_.ExpectBloomFilter(); build_.builder_ = BloomFilterBuilder::Make( @@ -1116,8 +1104,9 @@ void BloomFilterPushdownContext::Init( start_task_group_callback_ = std::move(start_task_group_callback); } -Status BloomFilterPushdownContext::StartProducing() { - if (eval_.num_expected_bloom_filters_ == 0) return eval_.all_received_callback_(); +Status BloomFilterPushdownContext::StartProducing(size_t thread_index) { + if (eval_.num_expected_bloom_filters_ == 0) + return eval_.all_received_callback_(thread_index); return Status::OK(); } @@ -1131,7 +1120,7 @@ Status BloomFilterPushdownContext::BuildBloomFilter(size_t thread_index, return build_.on_finished_(thread_index, std::move(build_.batches_)); RETURN_NOT_OK(build_.builder_->Begin( - /*num_threads=*/tld_.size(), ctx_->cpu_info()->hardware_flags(), + /*num_threads=*/ctx_->max_concurrency(), ctx_->cpu_info()->hardware_flags(), ctx_->memory_pool(), build_.batches_.row_count(), build_.batches_.batch_count(), push_.bloom_filter_.get())); @@ -1139,10 +1128,10 @@ Status BloomFilterPushdownContext::BuildBloomFilter(size_t thread_index, /*num_tasks=*/build_.batches_.batch_count()); } -Status BloomFilterPushdownContext::PushBloomFilter() { +Status BloomFilterPushdownContext::PushBloomFilter(size_t thread_index) { if (!disable_bloom_filter_) return push_.pushdown_target_->pushdown_context_.ReceiveBloomFilter( - std::move(push_.bloom_filter_), std::move(push_.column_map_)); + thread_index, std::move(push_.bloom_filter_), std::move(push_.column_map_)); return Status::OK(); } @@ -1163,7 +1152,7 @@ Status BloomFilterPushdownContext::BuildBloomFilter_exec_task(size_t thread_inde } ARROW_ASSIGN_OR_RAISE(ExecBatch key_batch, ExecBatch::Make(std::move(key_columns))); - ARROW_ASSIGN_OR_RAISE(util::TempVectorStack * stack, GetStack(thread_index)); + ARROW_ASSIGN_OR_RAISE(util::TempVectorStack * stack, ctx_->GetTempStack(thread_index)); util::TempVectorHolder hash_holder(stack, util::MiniBatch::kMiniBatchLength); uint32_t* hashes = hash_holder.mutable_data(); for (int64_t i = 0; i < key_batch.length; i += util::MiniBatch::kMiniBatchLength) { diff --git a/cpp/src/arrow/compute/exec/hash_join_node_test.cc b/cpp/src/arrow/compute/exec/hash_join_node_test.cc index 8cb1f8b92c0..cd8f392ad70 100644 --- a/cpp/src/arrow/compute/exec/hash_join_node_test.cc +++ b/cpp/src/arrow/compute/exec/hash_join_node_test.cc @@ -17,6 +17,7 @@ #include +#include #include #include #include @@ -27,11 +28,11 @@ #include "arrow/compute/exec/util.h" #include "arrow/compute/kernels/row_encoder.h" #include "arrow/compute/kernels/test_util.h" +#include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" #include "arrow/testing/random.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/make_unique.h" #include "arrow/util/thread_pool.h" using 
testing::UnorderedElementsAreArray; @@ -41,7 +42,7 @@ namespace compute { BatchesWithSchema GenerateBatchesFromString( const std::shared_ptr& schema, - const std::vector& json_strings, int multiplicity = 1) { + const std::vector& json_strings, int multiplicity = 1) { BatchesWithSchema out_batches{{}, schema}; std::vector types; @@ -68,34 +69,20 @@ void CheckRunOutput(JoinType type, const BatchesWithSchema& l_batches, const std::vector& left_keys, const std::vector& right_keys, const BatchesWithSchema& exp_batches, bool parallel = false) { - auto exec_ctx = arrow::internal::make_unique( - default_memory_pool(), parallel ? arrow::internal::GetCpuThreadPool() : nullptr); - - ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(exec_ctx.get())); - + Declaration left{"source", + SourceNodeOptions{l_batches.schema, l_batches.gen(parallel, + /*slow=*/false)}}; + Declaration right{"source", + SourceNodeOptions{r_batches.schema, r_batches.gen(parallel, + /*slow=*/false)}}; HashJoinNodeOptions join_options{type, left_keys, right_keys}; - Declaration join{"hashjoin", join_options}; - - // add left source - join.inputs.emplace_back(Declaration{ - "source", SourceNodeOptions{l_batches.schema, l_batches.gen(parallel, - /*slow=*/false)}}); - // add right source - join.inputs.emplace_back(Declaration{ - "source", SourceNodeOptions{r_batches.schema, r_batches.gen(parallel, - /*slow=*/false)}}); - AsyncGenerator> sink_gen; - - ASSERT_OK(Declaration::Sequence({join, {"sink", SinkNodeOptions{&sink_gen}}}) - .AddToPlan(plan.get())); + Declaration join{"hashjoin", {std::move(left), std::move(right)}, join_options}; - ASSERT_FINISHES_OK_AND_ASSIGN(auto res, StartAndCollect(plan.get(), sink_gen)); + ASSERT_OK_AND_ASSIGN(auto out_table, DeclarationToTable(std::move(join), parallel)); ASSERT_OK_AND_ASSIGN(auto exp_table, TableFromExecBatches(exp_batches.schema, exp_batches.batches)); - ASSERT_OK_AND_ASSIGN(auto out_table, TableFromExecBatches(exp_batches.schema, res)); - if (exp_table->num_rows() == 0) { ASSERT_EQ(exp_table->num_rows(), out_table->num_rows()); } else { @@ -889,44 +876,21 @@ Result> HashJoinWithExecPlan( const std::shared_ptr& output_schema, const std::vector>& l, const std::vector>& r, int num_batches_l, int num_batches_r) { - auto exec_ctx = arrow::internal::make_unique( - default_memory_pool(), parallel ? arrow::internal::GetCpuThreadPool() : nullptr); - - ARROW_ASSIGN_OR_RAISE(auto plan, ExecPlan::Make(exec_ctx.get())); - // add left source BatchesWithSchema l_batches = TableToBatches(rng, num_batches_l, l, "l_"); - ARROW_ASSIGN_OR_RAISE( - ExecNode * l_source, - MakeExecNode("source", plan.get(), {}, + Declaration left{"source", SourceNodeOptions{l_batches.schema, l_batches.gen(parallel, - /*slow=*/false)})); - + /*slow=*/false)}}; // add right source BatchesWithSchema r_batches = TableToBatches(rng, num_batches_r, r, "r_"); - ARROW_ASSIGN_OR_RAISE( - ExecNode * r_source, - MakeExecNode("source", plan.get(), {}, - SourceNodeOptions{r_batches.schema, r_batches.gen(parallel, - /*slow=*/false)})); - - ARROW_ASSIGN_OR_RAISE( - ExecNode * join, - MakeExecNode("hashjoin", plan.get(), {l_source, r_source}, join_options)); - - AsyncGenerator> sink_gen; - ARROW_ASSIGN_OR_RAISE( - std::ignore, MakeExecNode("sink", plan.get(), {join}, SinkNodeOptions{&sink_gen})); - - auto batches_fut = StartAndCollect(plan.get(), sink_gen); - if (!batches_fut.Wait(::arrow::kDefaultAssertFinishesWaitSeconds)) { - plan->StopProducing(); - // If this second wait fails then there isn't much we can do. 
We will abort - // and probably get a segmentation fault. - plan->finished().Wait(::arrow::kDefaultAssertFinishesWaitSeconds); - return Status::Invalid("Plan did not finish in a reasonable amount of time"); - } - return batches_fut.result(); + Declaration right{"source", + SourceNodeOptions{r_batches.schema, r_batches.gen(parallel, + /*slow=*/false)}}; + Declaration join{"hashjoin", {std::move(left), std::move(right)}, join_options}; + + ARROW_ASSIGN_OR_RAISE(BatchesWithCommonSchema batches_and_schema, + DeclarationToExecBatches(std::move(join), parallel)); + return batches_and_schema.batches; } TEST(HashJoin, Suffix) { @@ -960,40 +924,24 @@ TEST(HashJoin, Suffix) { field("ldistinct", int32()), field("rkey", int32()), field("shared_r", int32()), field("rdistinct", int32())}); - ExecContext exec_ctx; - - ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(&exec_ctx)); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; - ExecNode* left_source; - ExecNode* right_source; - ASSERT_OK_AND_ASSIGN( - left_source, - MakeExecNode("source", plan.get(), {}, + Declaration left{"source", SourceNodeOptions{input_left.schema, input_left.gen(/*parallel=*/false, - /*slow=*/false)})); - - ASSERT_OK_AND_ASSIGN(right_source, - MakeExecNode("source", plan.get(), {}, - SourceNodeOptions{input_right.schema, - input_right.gen(/*parallel=*/false, - /*slow=*/false)})); - + /*slow=*/false)}}; + Declaration right{ + "source", SourceNodeOptions{input_right.schema, input_right.gen(/*parallel=*/false, + /*slow=*/false)}}; HashJoinNodeOptions join_opts{JoinType::INNER, /*left_keys=*/{"lkey"}, /*right_keys=*/{"rkey"}, literal(true), "_l", "_r"}; - ASSERT_OK_AND_ASSIGN( - auto hashjoin, - MakeExecNode("hashjoin", plan.get(), {left_source, right_source}, join_opts)); - - ASSERT_OK_AND_ASSIGN(std::ignore, MakeExecNode("sink", plan.get(), {hashjoin}, - SinkNodeOptions{&sink_gen})); + Declaration join{"hashjoin", {std::move(left), std::move(right)}, join_opts}; - ASSERT_FINISHES_OK_AND_ASSIGN(auto result, StartAndCollect(plan.get(), sink_gen)); + ASSERT_OK_AND_ASSIGN(auto actual, DeclarationToExecBatches(std::move(join))); - AssertExecBatchesEqual(expected.schema, expected.batches, result); - AssertSchemaEqual(expected.schema, hashjoin->output_schema()); + AssertExecBatchesEqualIgnoringOrder(expected.schema, expected.batches, actual.batches); + AssertSchemaEqual(expected.schema, actual.schema); } TEST(HashJoin, Random) { @@ -1008,7 +956,7 @@ TEST(HashJoin, Random) { for (int test_id = 0; test_id < num_tests; ++test_id) { bool parallel = (rng.from_range(0, 1) == 1); bool disable_bloom_filter = (rng.from_range(0, 1) == 1); - auto exec_ctx = arrow::internal::make_unique( + auto exec_ctx = std::make_unique( default_memory_pool(), parallel ? arrow::internal::GetCpuThreadPool() : nullptr); // Constraints @@ -1185,7 +1133,7 @@ TEST(HashJoin, Random) { TableFromExecBatches(output_schema, batches)); // Compare results - AssertTablesEqual(output_rows_ref, output_rows_test); + AssertTablesEqualIgnoringOrder(output_rows_ref, output_rows_test); } } @@ -1309,19 +1257,13 @@ void TestHashJoinDictionaryHelper( r_batches.batches.resize(0); } - auto exec_ctx = arrow::internal::make_unique( - default_memory_pool(), parallel ? 
arrow::internal::GetCpuThreadPool() : nullptr); - ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(exec_ctx.get())); - ASSERT_OK_AND_ASSIGN( - ExecNode * l_source, - MakeExecNode("source", plan.get(), {}, + Declaration left{"source", SourceNodeOptions{l_batches.schema, l_batches.gen(parallel, - /*slow=*/false)})); - ASSERT_OK_AND_ASSIGN( - ExecNode * r_source, - MakeExecNode("source", plan.get(), {}, - SourceNodeOptions{r_batches.schema, r_batches.gen(parallel, - /*slow=*/false)})); + /*slow=*/false)}}; + + Declaration right{"source", + SourceNodeOptions{r_batches.schema, r_batches.gen(parallel, + /*slow=*/false)}}; HashJoinNodeOptions join_options{join_type, {FieldRef(swap_sides ? "r_key" : "l_key")}, {FieldRef(swap_sides ? "l_key" : "r_key")}, @@ -1330,23 +1272,18 @@ void TestHashJoinDictionaryHelper( {FieldRef(swap_sides ? "l_key" : "r_key"), FieldRef(swap_sides ? "l_payload" : "r_payload")}, {cmp}}; - ASSERT_OK_AND_ASSIGN(ExecNode * join, MakeExecNode("hashjoin", plan.get(), - {(swap_sides ? r_source : l_source), - (swap_sides ? l_source : r_source)}, - join_options)); - AsyncGenerator> sink_gen; - ASSERT_OK_AND_ASSIGN( - std::ignore, MakeExecNode("sink", plan.get(), {join}, SinkNodeOptions{&sink_gen})); - ASSERT_FINISHES_OK_AND_ASSIGN(auto res, StartAndCollect(plan.get(), sink_gen)); + Declaration join{ + "hashjoin", {swap_sides ? right : left, swap_sides ? left : right}, join_options}; + ASSERT_OK_AND_ASSIGN(auto res, DeclarationToExecBatches(std::move(join), parallel)); - for (auto& batch : res) { - DecodeScalarsAndDictionariesInBatch(&batch, exec_ctx->memory_pool()); + for (auto& batch : res.batches) { + DecodeScalarsAndDictionariesInBatch(&batch, default_memory_pool()); } std::shared_ptr output_schema = - UpdateSchemaAfterDecodingDictionaries(join->output_schema()); + UpdateSchemaAfterDecodingDictionaries(res.schema); ASSERT_OK_AND_ASSIGN(std::shared_ptr
output, - TableFromExecBatches(output_schema, res)); + TableFromExecBatches(output_schema, res.batches)); ExecBatch expected_batch; if (swap_sides) { @@ -1357,7 +1294,7 @@ void TestHashJoinDictionaryHelper( r_out_key, r_out_payload})); } - DecodeScalarsAndDictionariesInBatch(&expected_batch, exec_ctx->memory_pool()); + DecodeScalarsAndDictionariesInBatch(&expected_batch, default_memory_pool()); // Slice expected batch into two to separate rows on right side with no matches from // everything else. @@ -1398,7 +1335,7 @@ void TestHashJoinDictionaryHelper( TableFromExecBatches(output_schema, expected_batches)); // Compare results - AssertTablesEqual(expected, output); + AssertTablesEqualIgnoringOrder(expected, output); } TEST(HashJoin, Dictionary) { @@ -1733,38 +1670,21 @@ TEST(HashJoin, DictNegative) { ExecBatch::Make({i == 2 ? datumSecondB : datumSecondA, i == 3 ? datumSecondB : datumSecondA})); - auto exec_ctx = - arrow::internal::make_unique(default_memory_pool(), nullptr); - ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(exec_ctx.get())); - ASSERT_OK_AND_ASSIGN( - ExecNode * l_source, - MakeExecNode("source", plan.get(), {}, - SourceNodeOptions{l.schema, l.gen(/*parallel=*/false, - /*slow=*/false)})); - ASSERT_OK_AND_ASSIGN( - ExecNode * r_source, - MakeExecNode("source", plan.get(), {}, - SourceNodeOptions{r.schema, r.gen(/*parallel=*/false, - /*slow=*/false)})); + Declaration left{"source", SourceNodeOptions{l.schema, l.gen(/*parallel=*/false, + /*slow=*/false)}}; + Declaration right{"source", SourceNodeOptions{r.schema, r.gen(/*parallel=*/false, + /*slow=*/false)}}; HashJoinNodeOptions join_options{JoinType::INNER, {FieldRef("l_key")}, {FieldRef("r_key")}, {FieldRef("l_key"), FieldRef("l_payload")}, {FieldRef("r_key"), FieldRef("r_payload")}, {JoinKeyCmp::EQ}}; - ASSERT_OK_AND_ASSIGN( - ExecNode * join, - MakeExecNode("hashjoin", plan.get(), {l_source, r_source}, join_options)); - AsyncGenerator> sink_gen; - ASSERT_OK_AND_ASSIGN(std::ignore, MakeExecNode("sink", plan.get(), {join}, - SinkNodeOptions{&sink_gen})); + Declaration join{"hashjoin", {std::move(left), std::move(right)}, join_options}; - EXPECT_FINISHES_AND_RAISES_WITH_MESSAGE_THAT( + EXPECT_RAISES_WITH_MESSAGE_THAT( NotImplemented, ::testing::HasSubstr("Unifying differing dictionaries"), - StartAndCollect(plan.get(), sink_gen)); - // Since we returned an error, the StartAndCollect future may return before - // the plan is done finishing. 
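For reference, the Declaration-based pattern that these updated tests use throughout looks roughly like the sketch below; l_batches, r_batches, expected, parallel, and the key name "key" stand in for data the individual test prepares, while DeclarationToTable / DeclarationToExecBatches are the helpers exercised elsewhere in this file:

  Declaration left{"source", SourceNodeOptions{l_batches.schema,
                                               l_batches.gen(parallel, /*slow=*/false)}};
  Declaration right{"source", SourceNodeOptions{r_batches.schema,
                                                r_batches.gen(parallel, /*slow=*/false)}};
  HashJoinNodeOptions join_opts{JoinType::INNER,
                                /*left_keys=*/{"key"}, /*right_keys=*/{"key"}};
  Declaration join{"hashjoin", {std::move(left), std::move(right)}, join_opts};
  // Run the whole plan and collect the output batches together with their schema.
  ASSERT_OK_AND_ASSIGN(auto result, DeclarationToExecBatches(std::move(join), parallel));
  AssertExecBatchesEqualIgnoringOrder(result.schema, expected.batches, result.batches);

Compared with wiring an ExecPlan, sink generator, and StartAndCollect by hand, this keeps each test focused on its inputs and expected output.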
- plan->finished().Wait(); + DeclarationToTable(std::move(join), /*use_threads=*/false)); } } @@ -1787,25 +1707,110 @@ TEST(HashJoin, UnsupportedTypes) { BatchesWithSchema l_batches = GenerateBatchesFromString(schemas.first, {R"([])"}); BatchesWithSchema r_batches = GenerateBatchesFromString(schemas.second, {R"([])"}); - ExecContext exec_ctx; - ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(&exec_ctx)); - HashJoinNodeOptions join_options{JoinType::LEFT_SEMI, l_keys, r_keys}; - Declaration join{"hashjoin", join_options}; - join.inputs.emplace_back(Declaration{ - "source", SourceNodeOptions{l_batches.schema, l_batches.gen(parallel, slow)}}); - join.inputs.emplace_back(Declaration{ - "source", SourceNodeOptions{r_batches.schema, r_batches.gen(parallel, slow)}}); + Declaration left{"source", + SourceNodeOptions{l_batches.schema, l_batches.gen(parallel, slow)}}; + Declaration right{"source", + SourceNodeOptions{r_batches.schema, r_batches.gen(parallel, slow)}}; + Declaration join{"hashjoin", {std::move(left), std::move(right)}, join_options}; - ASSERT_RAISES(Invalid, join.AddToPlan(plan.get())); + ASSERT_RAISES(Invalid, DeclarationToStatus(std::move(join))); } } -TEST(HashJoin, CheckHashJoinNodeOptionsValidation) { - auto exec_ctx = - arrow::internal::make_unique(default_memory_pool(), nullptr); - ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(exec_ctx.get())); +void TestSimpleJoinHelper(BatchesWithSchema input_left, BatchesWithSchema input_right, + BatchesWithSchema expected) { + AsyncGenerator> sink_gen; + + Declaration left{"source", + SourceNodeOptions{input_left.schema, input_left.gen(/*parallel=*/false, + /*slow=*/false)}}; + Declaration right{ + "source", SourceNodeOptions{input_right.schema, input_right.gen(/*parallel=*/false, + /*slow=*/false)}}; + + HashJoinNodeOptions join_opts{JoinType::INNER, + /*left_keys=*/{"lkey"}, + /*right_keys=*/{"rkey"}, literal(true), "_l", "_r"}; + + Declaration join{"hashjoin", {std::move(left), std::move(right)}, join_opts}; + + ASSERT_OK_AND_ASSIGN(auto result, DeclarationToExecBatches(std::move(join))); + + ASSERT_OK_AND_ASSIGN(auto output_rows_test, + TableFromExecBatches(result.schema, result.batches)); + ASSERT_OK_AND_ASSIGN(auto expected_rows_test, + TableFromExecBatches(expected.schema, expected.batches)); + + AssertTablesEqual(*output_rows_test, *expected_rows_test, /*same_chunk_layout=*/false, + /*flatten=*/true); + AssertSchemaEqual(expected.schema, result.schema); +} + +TEST(HashJoin, ExtensionTypesSwissJoin) { + // For simpler types swiss join will be used. 
+ auto ext_arr = ExampleUuid(); + auto l_int_arr = ArrayFromJSON(int32(), "[1, 2, 3, 4]"); + auto l_int_arr2 = ArrayFromJSON(int32(), "[4, 5, 6, 7]"); + auto r_int_arr = ArrayFromJSON(int32(), "[4, 3, 2, null, 1]"); + BatchesWithSchema input_left; + ASSERT_OK_AND_ASSIGN(ExecBatch left_batches, + ExecBatch::Make({l_int_arr, l_int_arr2, ext_arr})); + input_left.batches = {left_batches}; + input_left.schema = schema( + {field("lkey", int32()), field("shared", int32()), field("ldistinct", uuid())}); + + BatchesWithSchema input_right; + ASSERT_OK_AND_ASSIGN(ExecBatch right_batches, ExecBatch::Make({r_int_arr})); + input_right.batches = {right_batches}; + input_right.schema = schema({field("rkey", int32())}); + + BatchesWithSchema expected; + ASSERT_OK_AND_ASSIGN(ExecBatch expected_batches, + ExecBatch::Make({l_int_arr, l_int_arr2, ext_arr, l_int_arr})); + expected.batches = {expected_batches}; + expected.schema = schema({field("lkey", int32()), field("shared", int32()), + field("ldistinct", uuid()), field("rkey", int32())}); + + TestSimpleJoinHelper(input_left, input_right, expected); +} + +TEST(HashJoin, ExtensionTypesHashJoin) { + // Swiss join doesn't support dictionaries so HashJoin will be used. + auto dict_type = dictionary(int64(), int8()); + auto ext_arr = ExampleUuid(); + auto l_int_arr = ArrayFromJSON(int32(), "[1, 2, 3, 4]"); + auto l_int_arr2 = ArrayFromJSON(int32(), "[4, 5, 6, 7]"); + auto r_int_arr = ArrayFromJSON(int32(), "[4, 3, 2, null, 1]"); + auto l_dict_array = + DictArrayFromJSON(dict_type, R"([2, 0, 1, null])", R"([null, 0, 1])"); + + BatchesWithSchema input_left; + ASSERT_OK_AND_ASSIGN(ExecBatch left_batches, + ExecBatch::Make({l_int_arr, l_int_arr2, ext_arr, l_dict_array})); + input_left.batches = {left_batches}; + input_left.schema = schema({field("lkey", int32()), field("shared", int32()), + field("ldistinct", uuid()), field("dict_type", dict_type)}); + + BatchesWithSchema input_right; + ASSERT_OK_AND_ASSIGN(ExecBatch right_batches, ExecBatch::Make({r_int_arr})); + input_right.batches = {right_batches}; + input_right.schema = schema({field("rkey", int32())}); + + BatchesWithSchema expected; + ASSERT_OK_AND_ASSIGN( + ExecBatch expected_batches, + ExecBatch::Make({l_int_arr, l_int_arr2, ext_arr, l_dict_array, l_int_arr})); + expected.batches = {expected_batches}; + expected.schema = schema({field("lkey", int32()), field("shared", int32()), + field("ldistinct", uuid()), field("dict_type", dict_type), + field("rkey", int32())}); + + TestSimpleJoinHelper(input_left, input_right, expected); +} + +TEST(HashJoin, CheckHashJoinNodeOptionsValidation) { BatchesWithSchema input_left; input_left.batches = {ExecBatchFromJSON({int32(), int32(), int32()}, R"([ [1, 4, 7], @@ -1824,19 +1829,12 @@ TEST(HashJoin, CheckHashJoinNodeOptionsValidation) { input_right.schema = schema( {field("rkey", int32()), field("shared", int32()), field("rdistinct", int32())}); - ExecNode* l_source; - ExecNode* r_source; - ASSERT_OK_AND_ASSIGN( - l_source, - MakeExecNode("source", plan.get(), {}, + Declaration left{"source", SourceNodeOptions{input_left.schema, input_left.gen(/*parallel=*/false, - /*slow=*/false)})); - - ASSERT_OK_AND_ASSIGN(r_source, - MakeExecNode("source", plan.get(), {}, - SourceNodeOptions{input_right.schema, - input_right.gen(/*parallel=*/false, - /*slow=*/false)})) + /*slow=*/false)}}; + Declaration right{ + "source", SourceNodeOptions{input_right.schema, input_right.gen(/*parallel=*/false, + /*slow=*/false)}}; std::vector> l_keys = { {}, @@ -1858,9 +1856,9 @@ TEST(HashJoin, 
CheckHashJoinNodeOptionsValidation) { HashJoinNodeOptions options{JoinType::INNER, l_keys[j], r_keys[k], {}, {}, key_cmps[i]}; - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, ::testing::HasSubstr("key_cmp and keys"), - MakeExecNode("hashjoin", plan.get(), {l_source, r_source}, options)); + Declaration join{"hashjoin", {left, right}, options}; + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("key_cmp and keys"), + DeclarationToStatus(std::move(join))); } } } @@ -1888,25 +1886,12 @@ TEST(HashJoin, ResidualFilter) { input_right.schema = schema({field("r1", int32()), field("r2", int32()), field("r_str", utf8())}); - auto exec_ctx = arrow::internal::make_unique( - default_memory_pool(), parallel ? arrow::internal::GetCpuThreadPool() : nullptr); - - ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(exec_ctx.get())); - AsyncGenerator> sink_gen; - - ExecNode* left_source; - ExecNode* right_source; - ASSERT_OK_AND_ASSIGN( - left_source, - MakeExecNode("source", plan.get(), {}, - SourceNodeOptions{input_left.schema, - input_left.gen(parallel, /*slow=*/false)})); - - ASSERT_OK_AND_ASSIGN( - right_source, - MakeExecNode("source", plan.get(), {}, - SourceNodeOptions{input_right.schema, - input_right.gen(parallel, /*slow=*/false)})) + Declaration left{ + "source", + SourceNodeOptions{input_left.schema, input_left.gen(parallel, /*slow=*/false)}}; + Declaration right{ + "source", + SourceNodeOptions{input_right.schema, input_right.gen(parallel, /*slow=*/false)}}; Expression mul = call("multiply", {field_ref("l1"), field_ref("l2")}); Expression combination = call("add", {mul, field_ref("r1")}); @@ -1917,14 +1902,10 @@ TEST(HashJoin, ResidualFilter) { /*left_keys=*/{"l_str"}, /*right_keys=*/{"r_str"}, std::move(residual_filter), "l_", "r_"}; - ASSERT_OK_AND_ASSIGN( - auto hashjoin, - MakeExecNode("hashjoin", plan.get(), {left_source, right_source}, join_opts)); + Declaration join{"hashjoin", {std::move(left), std::move(right)}, join_opts}; - ASSERT_OK_AND_ASSIGN(std::ignore, MakeExecNode("sink", plan.get(), {hashjoin}, - SinkNodeOptions{&sink_gen})); - - ASSERT_FINISHES_OK_AND_ASSIGN(auto result, StartAndCollect(plan.get(), sink_gen)); + ASSERT_OK_AND_ASSIGN(auto result, + DeclarationToExecBatches(std::move(join), parallel)); std::vector expected = { ExecBatchFromJSON({int32(), int32(), utf8(), int32(), int32(), utf8()}, R"([ @@ -1933,7 +1914,7 @@ TEST(HashJoin, ResidualFilter) { [2, 5, "beta", 2, 12, "beta"], [3, 4, "alpha", 4, 16, "alpha"]])")}; - AssertExecBatchesEqual(hashjoin->output_schema(), result, expected); + AssertExecBatchesEqualIgnoringOrder(result.schema, result.batches, expected); } } @@ -1965,45 +1946,31 @@ TEST(HashJoin, TrivialResidualFilter) { ])")}; input_right.schema = schema({field("r1", int32()), field("r_str", utf8())}); - auto exec_ctx = arrow::internal::make_unique( + auto exec_ctx = std::make_unique( default_memory_pool(), parallel ? 
arrow::internal::GetCpuThreadPool() : nullptr); - ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(exec_ctx.get())); - AsyncGenerator> sink_gen; - - ExecNode* left_source; - ExecNode* right_source; - ASSERT_OK_AND_ASSIGN( - left_source, - MakeExecNode("source", plan.get(), {}, - SourceNodeOptions{input_left.schema, - input_left.gen(parallel, /*slow=*/false)})); - - ASSERT_OK_AND_ASSIGN( - right_source, - MakeExecNode("source", plan.get(), {}, - SourceNodeOptions{input_right.schema, - input_right.gen(parallel, /*slow=*/false)})) + Declaration left{ + "source", + SourceNodeOptions{input_left.schema, input_left.gen(parallel, /*slow=*/false)}}; + Declaration right{"source", + SourceNodeOptions{input_right.schema, + input_right.gen(parallel, /*slow=*/false)}}; HashJoinNodeOptions join_opts{ JoinType::INNER, /*left_keys=*/{"l_str"}, /*right_keys=*/{"r_str"}, filters[test_id], "l_", "r_"}; - ASSERT_OK_AND_ASSIGN( - auto hashjoin, - MakeExecNode("hashjoin", plan.get(), {left_source, right_source}, join_opts)); - - ASSERT_OK_AND_ASSIGN(std::ignore, MakeExecNode("sink", plan.get(), {hashjoin}, - SinkNodeOptions{&sink_gen})); + Declaration join{"hashjoin", {std::move(left), std::move(right)}, join_opts}; - ASSERT_FINISHES_OK_AND_ASSIGN(auto result, StartAndCollect(plan.get(), sink_gen)); + ASSERT_OK_AND_ASSIGN(auto result, + DeclarationToExecBatches(std::move(join), parallel)); std::vector expected = {ExecBatchFromJSON( {int32(), utf8(), int32(), utf8()}, expected_strings[test_id])}; - AssertExecBatchesEqual(hashjoin->output_schema(), result, expected); + AssertExecBatchesEqualIgnoringOrder(result.schema, result.batches, expected); } } } @@ -2105,42 +2072,32 @@ void TestSingleChainOfHashJoins(Random64Bit& rng) { for (bool bloom_filters : {false, true}) { bool kParallel = true; ARROW_SCOPED_TRACE(bloom_filters ? "bloom filtered" : "unfiltered"); - auto exec_ctx = arrow::internal::make_unique( - default_memory_pool(), kParallel ? 
arrow::internal::GetCpuThreadPool() : nullptr); - ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(exec_ctx.get())); - ExecNode* left_source; - ASSERT_OK_AND_ASSIGN( - left_source, - MakeExecNode("source", plan.get(), {}, - SourceNodeOptions{input_left.schema, - input_left.gen(kParallel, /*slow=*/false)})); - std::vector joins(num_joins); + Declaration left{ + "source", + SourceNodeOptions{input_left.schema, input_left.gen(kParallel, /*slow=*/false)}}; + + Declaration last_join; for (int i = 0; i < num_joins; i++) { opts[i].disable_bloom_filter = !bloom_filters; - ExecNode* right_source; - ASSERT_OK_AND_ASSIGN( - right_source, - MakeExecNode("source", plan.get(), {}, - SourceNodeOptions{input_right[i].schema, - input_right[i].gen(kParallel, /*slow=*/false)})); + Declaration right{"source", + SourceNodeOptions{input_right[i].schema, + input_right[i].gen(kParallel, /*slow=*/false)}}; - std::vector inputs; + std::vector inputs; if (i == 0) - inputs = {left_source, right_source}; + inputs = {std::move(left), std::move(right)}; else - inputs = {joins[i - 1], right_source}; - ASSERT_OK_AND_ASSIGN(joins[i], - MakeExecNode("hashjoin", plan.get(), inputs, opts[i])); + inputs = {std::move(last_join), std::move(right)}; + last_join = Declaration{"hashjoin", std::move(inputs), opts[i]}; } - AsyncGenerator> sink_gen; - ASSERT_OK( - MakeExecNode("sink", plan.get(), {joins.back()}, SinkNodeOptions{&sink_gen})); - ASSERT_FINISHES_OK_AND_ASSIGN(auto result, StartAndCollect(plan.get(), sink_gen)); + + ASSERT_OK_AND_ASSIGN(auto result, + DeclarationToExecBatches(std::move(last_join), kParallel)); if (!bloom_filters) - reference = std::move(result); + reference = std::move(result.batches); else - AssertExecBatchesEqual(joins.back()->output_schema(), reference, result); + AssertExecBatchesEqualIgnoringOrder(result.schema, reference, result.batches); } } diff --git a/cpp/src/arrow/compute/exec/key_hash.cc b/cpp/src/arrow/compute/exec/key_hash.cc index 3f495bc9e60..5ff0d4cf1e5 100644 --- a/cpp/src/arrow/compute/exec/key_hash.cc +++ b/cpp/src/arrow/compute/exec/key_hash.cc @@ -432,11 +432,14 @@ void Hashing32::HashMultiColumn(const std::vector& cols, cols[icol].data(1) + first_row * col_width, hashes + first_row, hash_temp); } - } else { - // TODO: add support for 64-bit offsets + } else if (cols[icol].metadata().fixed_length == sizeof(uint32_t)) { HashVarLen(ctx->hardware_flags, icol > 0, batch_size_next, cols[icol].offsets() + first_row, cols[icol].data(2), hashes + first_row, hash_temp); + } else { + HashVarLen(ctx->hardware_flags, icol > 0, batch_size_next, + cols[icol].large_offsets() + first_row, cols[icol].data(2), + hashes + first_row, hash_temp); } // Zero hash for nulls @@ -865,10 +868,12 @@ void Hashing64::HashMultiColumn(const std::vector& cols, HashFixed(icol > 0, batch_size_next, col_width, cols[icol].data(1) + first_row * col_width, hashes + first_row); } - } else { - // TODO: add support for 64-bit offsets + } else if (cols[icol].metadata().fixed_length == sizeof(uint32_t)) { HashVarLen(icol > 0, batch_size_next, cols[icol].offsets() + first_row, cols[icol].data(2), hashes + first_row); + } else { + HashVarLen(icol > 0, batch_size_next, cols[icol].large_offsets() + first_row, + cols[icol].data(2), hashes + first_row); } // Zero hash for nulls diff --git a/cpp/src/arrow/compute/exec/key_hash_test.cc b/cpp/src/arrow/compute/exec/key_hash_test.cc index b9404ed9ae3..a4900b39a2d 100644 --- a/cpp/src/arrow/compute/exec/key_hash_test.cc +++ b/cpp/src/arrow/compute/exec/key_hash_test.cc @@ -35,9 +35,12 @@ 
namespace compute { class TestVectorHash { private: - static Result> GenerateUniqueRandomBinary( - random::pcg32_fast* random, int num, int min_length, int max_length) { - BinaryBuilder builder; + template ::ArrayType> + static enable_if_base_binary>> + GenerateUniqueRandomBinary(random::pcg32_fast* random, int num, int min_length, + int max_length) { + using BuilderType = typename TypeTraits::BuilderType; + BuilderType builder; std::unordered_set unique_key_strings; std::vector temp_buffer; temp_buffer.resize(max_length); @@ -58,12 +61,14 @@ class TestVectorHash { } } ARROW_ASSIGN_OR_RAISE(auto uniques, builder.Finish()); - return checked_pointer_cast(uniques); + return checked_pointer_cast(uniques); } - static Result, std::shared_ptr>> - SampleUniqueBinary(random::pcg32_fast* random, int num, const BinaryArray& uniques) { - BinaryBuilder builder; + template ::ArrayType> + static Result, std::shared_ptr>> + SampleUniqueBinary(random::pcg32_fast* random, int num, const ArrayType& uniques) { + using BuilderType = typename TypeTraits::BuilderType; + BuilderType builder; std::vector row_ids; row_ids.resize(num); @@ -75,13 +80,18 @@ class TestVectorHash { ARROW_RETURN_NOT_OK(builder.Append(uniques.GetView(row_id))); } ARROW_ASSIGN_OR_RAISE(std::shared_ptr sampled, builder.Finish()); - return std::pair, std::shared_ptr>{ - std::move(row_ids), checked_pointer_cast(sampled)}; + return std::pair, std::shared_ptr>{ + std::move(row_ids), checked_pointer_cast(sampled)}; } public: + template static void RunSingle(random::pcg32_fast* random, bool use_32bit_hash, bool use_varlen_input, int min_length, int max_length) { + using ArrayType = typename TypeTraits::ArrayType; + using OffsetType = typename TypeTraits::OffsetType; + using offset_t = typename std::make_unsigned::type; + constexpr int min_num_unique = 100; constexpr int max_num_unique = 1000; constexpr int min_num_rows = 4000; @@ -111,14 +121,15 @@ class TestVectorHash { } ASSERT_OK_AND_ASSIGN( - std::shared_ptr uniques, - GenerateUniqueRandomBinary(random, num_unique, min_length, max_length)); - ASSERT_OK_AND_ASSIGN(auto sampled, SampleUniqueBinary(random, num_rows, *uniques)); + std::shared_ptr uniques, + GenerateUniqueRandomBinary(random, num_unique, min_length, max_length)); + ASSERT_OK_AND_ASSIGN(auto sampled, + SampleUniqueBinary(random, num_rows, *uniques)); const std::vector& row_ids = sampled.first; - const std::shared_ptr& keys_array = sampled.second; + const std::shared_ptr& keys_array = sampled.second; const uint8_t* keys = keys_array->raw_data(); - const uint32_t* key_offsets = - reinterpret_cast(keys_array->raw_value_offsets()); + const offset_t* key_offsets = + reinterpret_cast(keys_array->raw_value_offsets()); std::vector hashes_scalar32; std::vector hashes_scalar64; @@ -208,7 +219,8 @@ class TestVectorHash { } }; -TEST(VectorHash, Basic) { +template +void RunTestVectorHash() { random::pcg32_fast gen(/*seed=*/0); int numtest = 40; @@ -219,12 +231,20 @@ TEST(VectorHash, Basic) { for (bool use_32bit_hash : {true, false}) { for (bool use_varlen_input : {false, true}) { for (int itest = 0; itest < numtest; ++itest) { - TestVectorHash::RunSingle(&gen, use_32bit_hash, use_varlen_input, min_length, - max_length); + TestVectorHash::RunSingle(&gen, use_32bit_hash, use_varlen_input, + min_length, max_length); } } } } +TEST(VectorHash, BasicBinary) { RunTestVectorHash(); } + +TEST(VectorHash, BasicLargeBinary) { RunTestVectorHash(); } + +TEST(VectorHash, BasicString) { RunTestVectorHash(); } + +TEST(VectorHash, BasicLargeString) { 
RunTestVectorHash(); } + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/map_node.cc b/cpp/src/arrow/compute/exec/map_node.cc new file mode 100644 index 00000000000..16201ea1290 --- /dev/null +++ b/cpp/src/arrow/compute/exec/map_node.cc @@ -0,0 +1,122 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/exec/map_node.h" + +#include +#include +#include +#include +#include + +#include "arrow/compute/exec.h" +#include "arrow/compute/exec/expression.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/logging.h" +#include "arrow/util/tracing_internal.h" + +namespace arrow { +namespace compute { + +MapNode::MapNode(ExecPlan* plan, std::vector inputs, + std::shared_ptr output_schema) + : ExecNode(plan, std::move(inputs), /*input_labels=*/{"target"}, + std::move(output_schema), + /*num_outputs=*/1) {} + +void MapNode::ErrorReceived(ExecNode* input, Status error) { + DCHECK_EQ(input, inputs_[0]); + EVENT(span_, "ErrorReceived", {{"error.message", error.message()}}); + outputs_[0]->ErrorReceived(this, std::move(error)); +} + +void MapNode::InputFinished(ExecNode* input, int total_batches) { + DCHECK_EQ(input, inputs_[0]); + EVENT(span_, "InputFinished", {{"batches.length", total_batches}}); + outputs_[0]->InputFinished(this, total_batches); + if (input_counter_.SetTotal(total_batches)) { + this->Finish(); + } +} + +Status MapNode::StartProducing() { + START_COMPUTE_SPAN( + span_, std::string(kind_name()) + ":" + label(), + {{"node.label", label()}, {"node.detail", ToString()}, {"node.kind", kind_name()}}); + return Status::OK(); +} + +void MapNode::PauseProducing(ExecNode* output, int32_t counter) { + inputs_[0]->PauseProducing(this, counter); +} + +void MapNode::ResumeProducing(ExecNode* output, int32_t counter) { + inputs_[0]->ResumeProducing(this, counter); +} + +void MapNode::StopProducing(ExecNode* output) { + DCHECK_EQ(output, outputs_[0]); + StopProducing(); +} + +void MapNode::StopProducing() { + EVENT(span_, "StopProducing"); + if (input_counter_.Cancel()) { + this->Finish(); + } + inputs_[0]->StopProducing(this); +} + +void MapNode::SubmitTask(std::function(ExecBatch)> map_fn, + ExecBatch batch) { + Status status; + // This will be true if the node is stopped early due to an error or manual + // cancellation + if (input_counter_.Completed()) { + return; + } + auto task = [this, map_fn, batch]() { + auto guarantee = batch.guarantee; + auto output_batch = map_fn(std::move(batch)); + if (ErrorIfNotOk(output_batch.status())) { + return output_batch.status(); + } + output_batch->guarantee = guarantee; + outputs_[0]->InputReceived(this, output_batch.MoveValueUnsafe()); + return Status::OK(); + }; + + status = task(); + if (!status.ok()) { + if 
(input_counter_.Cancel()) { + this->Finish(status); + } + inputs_[0]->StopProducing(this); + return; + } + if (input_counter_.Increment()) { + this->Finish(); + } +} + +void MapNode::Finish(Status finish_st /*= Status::OK()*/) { + this->finished_.MarkFinished(finish_st); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/map_node.h b/cpp/src/arrow/compute/exec/map_node.h new file mode 100644 index 00000000000..88241ece592 --- /dev/null +++ b/cpp/src/arrow/compute/exec/map_node.h @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// \brief MapNode is an ExecNode type class which process a task like filter/project +/// (See SubmitTask method) to each given ExecBatch object, which have one input, one +/// output, and are pure functions on the input +/// +/// A simple parallel runner is created with a "map_fn" which is just a function that +/// takes a batch in and returns a batch. This simple parallel runner also needs an +/// executor (use simple synchronous runner if there is no executor) + +#pragma once + +#include +#include +#include +#include + +#include "arrow/compute/exec/exec_plan.h" +#include "arrow/compute/exec/util.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/cancel.h" +#include "arrow/util/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { + +class ARROW_EXPORT MapNode : public ExecNode { + public: + MapNode(ExecPlan* plan, std::vector inputs, + std::shared_ptr output_schema); + + void ErrorReceived(ExecNode* input, Status error) override; + + void InputFinished(ExecNode* input, int total_batches) override; + + Status StartProducing() override; + + void PauseProducing(ExecNode* output, int32_t counter) override; + + void ResumeProducing(ExecNode* output, int32_t counter) override; + + void StopProducing(ExecNode* output) override; + + void StopProducing() override; + + protected: + void SubmitTask(std::function(ExecBatch)> map_fn, ExecBatch batch); + + virtual void Finish(Status finish_st = Status::OK()); + + protected: + // Counter for the number of batches received + AtomicCounter input_counter_; +}; + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/options.cc b/cpp/src/arrow/compute/exec/options.cc index c09ab1c1b68..9e9da7ad831 100644 --- a/cpp/src/arrow/compute/exec/options.cc +++ b/cpp/src/arrow/compute/exec/options.cc @@ -25,6 +25,8 @@ namespace arrow { namespace compute { +constexpr int64_t TableSourceNodeOptions::kDefaultMaxBatchSize; + std::string ToString(JoinType t) { switch (t) { case JoinType::LEFT_SEMI: @@ -49,16 +51,26 @@ std::string ToString(JoinType t) { } Result> SourceNodeOptions::FromTable( - const Table& 
table, arrow::internal::Executor* exc) { + const Table& table, arrow::internal::Executor* executor) { std::shared_ptr reader = std::make_shared(table); - if (exc == nullptr) return Status::TypeError("No executor provided."); + if (executor == nullptr) return Status::TypeError("No executor provided."); + + // Map the RecordBatchReader to a SourceNode + ARROW_ASSIGN_OR_RAISE(auto batch_gen, MakeReaderGenerator(std::move(reader), executor)); + + return std::make_shared(table.schema(), batch_gen); +} + +Result> SourceNodeOptions::FromRecordBatchReader( + std::shared_ptr reader, std::shared_ptr schema, + arrow::internal::Executor* executor) { + if (executor == nullptr) return Status::TypeError("No executor provided."); // Map the RecordBatchReader to a SourceNode - ARROW_ASSIGN_OR_RAISE(auto batch_gen, MakeReaderGenerator(std::move(reader), exc)); + ARROW_ASSIGN_OR_RAISE(auto batch_gen, MakeReaderGenerator(std::move(reader), executor)); - return std::shared_ptr( - new SourceNodeOptions(table.schema(), batch_gen)); + return std::make_shared(std::move(schema), std::move(batch_gen)); } } // namespace compute diff --git a/cpp/src/arrow/compute/exec/options.h b/cpp/src/arrow/compute/exec/options.h index 4a0cd602efb..0ef75cbedcf 100644 --- a/cpp/src/arrow/compute/exec/options.h +++ b/cpp/src/arrow/compute/exec/options.h @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -26,16 +27,23 @@ #include "arrow/compute/api_vector.h" #include "arrow/compute/exec.h" #include "arrow/compute/exec/expression.h" +#include "arrow/record_batch.h" #include "arrow/result.h" #include "arrow/util/async_generator.h" #include "arrow/util/async_util.h" -#include "arrow/util/optional.h" #include "arrow/util/visibility.h" namespace arrow { + +namespace internal { + +class Executor; + +} // namespace internal + namespace compute { -using AsyncExecBatchGenerator = AsyncGenerator>; +using AsyncExecBatchGenerator = AsyncGenerator>; /// \addtogroup execnode-options /// @{ @@ -51,20 +59,26 @@ class ARROW_EXPORT ExecNodeOptions { class ARROW_EXPORT SourceNodeOptions : public ExecNodeOptions { public: SourceNodeOptions(std::shared_ptr output_schema, - std::function>()> generator) + std::function>()> generator) : output_schema(std::move(output_schema)), generator(std::move(generator)) {} static Result> FromTable(const Table& table, arrow::internal::Executor*); + static Result> FromRecordBatchReader( + std::shared_ptr reader, std::shared_ptr schema, + arrow::internal::Executor*); + std::shared_ptr output_schema; - std::function>()> generator; + std::function>()> generator; }; /// \brief An extended Source node which accepts a table class ARROW_EXPORT TableSourceNodeOptions : public ExecNodeOptions { public: - TableSourceNodeOptions(std::shared_ptr
table, int64_t max_batch_size) + static constexpr int64_t kDefaultMaxBatchSize = 1 << 20; + TableSourceNodeOptions(std::shared_ptr
table, + int64_t max_batch_size = kDefaultMaxBatchSize) : table(table), max_batch_size(max_batch_size) {} // arrow table which acts as the data source @@ -75,6 +89,79 @@ class ARROW_EXPORT TableSourceNodeOptions : public ExecNodeOptions { int64_t max_batch_size; }; +/// \brief Define a lazy resolved Arrow table. +/// +/// The table uniquely identified by the names can typically be resolved at the time when +/// the plan is to be consumed. +/// +/// This node is for serialization purposes only and can never be executed. +class ARROW_EXPORT NamedTableNodeOptions : public ExecNodeOptions { + public: + NamedTableNodeOptions(std::vector names, std::shared_ptr schema) + : names(std::move(names)), schema(schema) {} + + std::vector names; + std::shared_ptr schema; +}; + +/// \brief An extended Source node which accepts a schema +/// +/// ItMaker is a maker of an iterator of tabular data. +template +class ARROW_EXPORT SchemaSourceNodeOptions : public ExecNodeOptions { + public: + SchemaSourceNodeOptions(std::shared_ptr schema, ItMaker it_maker, + arrow::internal::Executor* io_executor = NULLPTR) + : schema(schema), it_maker(std::move(it_maker)), io_executor(io_executor) {} + + /// \brief The schema of the record batches from the iterator + std::shared_ptr schema; + + /// \brief A maker of an iterator which acts as the data source + ItMaker it_maker; + + /// \brief The executor to use for scanning the iterator + /// + /// Defaults to the default I/O executor. + arrow::internal::Executor* io_executor; +}; + +class ARROW_EXPORT RecordBatchReaderSourceNodeOptions : public ExecNodeOptions { + public: + RecordBatchReaderSourceNodeOptions(std::shared_ptr reader, + arrow::internal::Executor* io_executor = NULLPTR) + : reader(std::move(reader)), io_executor(io_executor) {} + + /// \brief The RecordBatchReader which acts as the data source + std::shared_ptr reader; + + /// \brief The executor to use for the reader + /// + /// Defaults to the default I/O executor. + arrow::internal::Executor* io_executor; +}; + +using ArrayVectorIteratorMaker = std::function>()>; +/// \brief An extended Source node which accepts a schema and array-vectors +class ARROW_EXPORT ArrayVectorSourceNodeOptions + : public SchemaSourceNodeOptions { + using SchemaSourceNodeOptions::SchemaSourceNodeOptions; +}; + +using ExecBatchIteratorMaker = std::function>()>; +/// \brief An extended Source node which accepts a schema and exec-batches +class ARROW_EXPORT ExecBatchSourceNodeOptions + : public SchemaSourceNodeOptions { + using SchemaSourceNodeOptions::SchemaSourceNodeOptions; +}; + +using RecordBatchIteratorMaker = std::function>()>; +/// \brief An extended Source node which accepts a schema and record-batches +class ARROW_EXPORT RecordBatchSourceNodeOptions + : public SchemaSourceNodeOptions { + using SchemaSourceNodeOptions::SchemaSourceNodeOptions; +}; + /// \brief Make a node which excludes some rows from batches passed through it /// /// filter_expression will be evaluated against each batch which is pushed to @@ -82,11 +169,10 @@ class ARROW_EXPORT TableSourceNodeOptions : public ExecNodeOptions { /// excluded in the batch emitted by this node. 
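A sketch of how a filter node is typically declared (the column name "x" and the upstream source declaration are placeholders; call, field_ref, and literal are the expression helpers used elsewhere in this patch):

  Declaration filtered{"filter",
                       {std::move(source)},
                       FilterNodeOptions{call("greater", {field_ref("x"), literal(3)})}};

Rows for which the expression does not evaluate to true are dropped from the batches this node emits.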
class ARROW_EXPORT FilterNodeOptions : public ExecNodeOptions { public: - explicit FilterNodeOptions(Expression filter_expression, bool async_mode = true) - : filter_expression(std::move(filter_expression)), async_mode(async_mode) {} + explicit FilterNodeOptions(Expression filter_expression) + : filter_expression(std::move(filter_expression)) {} Expression filter_expression; - bool async_mode; }; /// \brief Make a node which executes expressions on input batches, producing new batches. @@ -98,14 +184,11 @@ class ARROW_EXPORT FilterNodeOptions : public ExecNodeOptions { class ARROW_EXPORT ProjectNodeOptions : public ExecNodeOptions { public: explicit ProjectNodeOptions(std::vector expressions, - std::vector names = {}, bool async_mode = true) - : expressions(std::move(expressions)), - names(std::move(names)), - async_mode(async_mode) {} + std::vector names = {}) + : expressions(std::move(expressions)), names(std::move(names)) {} std::vector expressions; std::vector names; - bool async_mode; }; /// \brief Make a node which aggregates input batches, optionally grouped by keys. @@ -131,8 +214,8 @@ constexpr int32_t kDefaultBackpressureLowBytes = 1 << 28; // 256MiB class ARROW_EXPORT BackpressureMonitor { public: virtual ~BackpressureMonitor() = default; - virtual uint64_t bytes_in_use() const = 0; - virtual bool is_paused() const = 0; + virtual uint64_t bytes_in_use() = 0; + virtual bool is_paused() = 0; }; /// \brief Options to control backpressure behavior @@ -145,7 +228,7 @@ struct ARROW_EXPORT BackpressureOptions { /// queue has fewer than resume_if_below items. /// \param pause_if_above The producer should pause producing if the backpressure /// queue has more than pause_if_above items - BackpressureOptions(uint32_t resume_if_below, uint32_t pause_if_above) + BackpressureOptions(uint64_t resume_if_below, uint64_t pause_if_above) : resume_if_below(resume_if_below), pause_if_above(pause_if_above) {} static BackpressureOptions DefaultBackpressure() { @@ -164,10 +247,20 @@ struct ARROW_EXPORT BackpressureOptions { /// Emitted batches will not be ordered. class ARROW_EXPORT SinkNodeOptions : public ExecNodeOptions { public: - explicit SinkNodeOptions(std::function>()>* generator, + explicit SinkNodeOptions(std::function>()>* generator, + std::shared_ptr* schema, + BackpressureOptions backpressure = {}, + BackpressureMonitor** backpressure_monitor = NULLPTR) + : generator(generator), + schema(schema), + backpressure(backpressure), + backpressure_monitor(backpressure_monitor) {} + + explicit SinkNodeOptions(std::function>()>* generator, BackpressureOptions backpressure = {}, BackpressureMonitor** backpressure_monitor = NULLPTR) : generator(generator), + schema(NULLPTR), backpressure(std::move(backpressure)), backpressure_monitor(backpressure_monitor) {} @@ -176,7 +269,12 @@ class ARROW_EXPORT SinkNodeOptions : public ExecNodeOptions { /// This will be set when the node is added to the plan and should be used to consume /// data from the plan. If this function is not called frequently enough then the sink /// node will start to accumulate data and may apply backpressure. - std::function>()>* generator; + std::function>()>* generator; + /// \brief A pointer which will be set to the schema of the generated batches + /// + /// This is optional, if nullptr is passed in then it will be ignored. 
+ /// This will be set when the node is added to the plan, before StartProducing is called + std::shared_ptr* schema; /// \brief Options to control when to apply backpressure /// /// This is optional, the default is to never apply backpressure. If the plan is not @@ -215,8 +313,9 @@ class ARROW_EXPORT SinkNodeConsumer { /// This will be run once the schema is finalized as the plan is starting and /// before any calls to Consume. A common use is to save off the schema so that /// batches can be interpreted. + /// TODO(ARROW-17837) Move ExecPlan* plan to query context virtual Status Init(const std::shared_ptr& schema, - BackpressureControl* backpressure_control) = 0; + BackpressureControl* backpressure_control, ExecPlan* plan) = 0; /// \brief Consume a batch of data virtual Status Consume(ExecBatch batch) = 0; /// \brief Signal to the consumer that the last batch has been delivered @@ -248,7 +347,7 @@ class ARROW_EXPORT OrderBySinkNodeOptions : public SinkNodeOptions { public: explicit OrderBySinkNodeOptions( SortOptions sort_options, - std::function>()>* generator) + std::function>()>* generator) : SinkNodeOptions(generator), sort_options(std::move(sort_options)) {} SortOptions sort_options; @@ -395,23 +494,38 @@ class ARROW_EXPORT HashJoinNodeOptions : public ExecNodeOptions { /// This node will output one row for each row in the left table. class ARROW_EXPORT AsofJoinNodeOptions : public ExecNodeOptions { public: - AsofJoinNodeOptions(FieldRef on_key, FieldRef by_key, int64_t tolerance) - : on_key(std::move(on_key)), by_key(std::move(by_key)), tolerance(tolerance) {} - - /// \brief "on" key for the join. Each + /// \brief Keys for one input table of the AsofJoin operation + /// + /// The keys must be consistent across the input tables: + /// Each "on" key must refer to a field of the same type and units across the tables. + /// Each "by" key must refer to a list of fields of the same types across the tables. + struct Keys { + /// \brief "on" key for the join. + /// + /// The input table must be sorted by the "on" key. Must be a single field of a common + /// type. Inexact match is used on the "on" key. i.e., a row is considered a match iff + /// left_on - tolerance <= right_on <= left_on. + /// Currently, the "on" key must be of an integer, date, or timestamp type. + FieldRef on_key; + /// \brief "by" key for the join. + /// + /// Each input table must have each field of the "by" key. Exact equality is used for + /// each field of the "by" key. + /// Currently, each field of the "by" key must be of an integer, date, timestamp, or + /// base-binary type. + std::vector by_key; + }; + + AsofJoinNodeOptions(std::vector input_keys, int64_t tolerance) + : input_keys(std::move(input_keys)), tolerance(tolerance) {} + + /// \brief AsofJoin keys per input table. /// - /// All inputs tables must be sorted by the "on" key. Inexact - /// match is used on the "on" key. i.e., a row is considiered match iff - /// left_on - tolerance <= right_on <= left_on. - /// Currently, "on" key must be an int64 field - FieldRef on_key; - /// \brief "by" key for the join. + /// \see `Keys` for details. + std::vector input_keys; + /// \brief Tolerance for inexact "on" key matching. Must be non-negative. /// - /// All input tables must have the "by" key. Exact equality - /// is used for the "by" key. - /// Currently, the "by" key must be an int32 field - FieldRef by_key; - /// Tolerance for inexact "on" key matching + /// The tolerance is interpreted in the same units as the "on" key. 
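A sketch of configuring these options (the field names "ts" and "id" are illustrative, not part of this header): supply one Keys entry per input table plus a tolerance in the units of the "on" key, so that a right row matches when left_ts - tolerance <= right_ts <= left_ts:

  AsofJoinNodeOptions::Keys left_keys{/*on_key=*/FieldRef("ts"),
                                      /*by_key=*/{FieldRef("id")}};
  AsofJoinNodeOptions::Keys right_keys{FieldRef("ts"), {FieldRef("id")}};
  AsofJoinNodeOptions asof_options({left_keys, right_keys}, /*tolerance=*/1000);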
int64_t tolerance; }; @@ -423,7 +537,7 @@ class ARROW_EXPORT SelectKSinkNodeOptions : public SinkNodeOptions { public: explicit SelectKSinkNodeOptions( SelectKOptions select_k_options, - std::function>()>* generator) + std::function>()>* generator) : SinkNodeOptions(generator), select_k_options(std::move(select_k_options)) {} /// SelectK options diff --git a/cpp/src/arrow/compute/exec/plan_test.cc b/cpp/src/arrow/compute/exec/plan_test.cc index e06c41c7489..eb560da99cf 100644 --- a/cpp/src/arrow/compute/exec/plan_test.cc +++ b/cpp/src/arrow/compute/exec/plan_test.cc @@ -35,7 +35,6 @@ #include "arrow/testing/random.h" #include "arrow/util/async_generator.h" #include "arrow/util/logging.h" -#include "arrow/util/make_unique.h" #include "arrow/util/thread_pool.h" #include "arrow/util/vector.h" @@ -220,7 +219,7 @@ TEST(ExecPlanExecution, SourceSink) { SCOPED_TRACE(parallel ? "parallel" : "single threaded"); ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; auto basic_data = MakeBasicBatches(); @@ -239,7 +238,7 @@ TEST(ExecPlanExecution, SourceSink) { } TEST(ExecPlanExecution, UseSinkAfterExecution) { - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; { ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); auto basic_data = MakeBasicBatches(); @@ -260,7 +259,7 @@ TEST(ExecPlanExecution, UseSinkAfterExecution) { TEST(ExecPlanExecution, TableSourceSink) { for (int batch_size : {1, 4}) { ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; auto exp_batches = MakeBasicBatches(); ASSERT_OK_AND_ASSIGN(auto table, @@ -275,13 +274,13 @@ TEST(ExecPlanExecution, TableSourceSink) { ASSERT_FINISHES_OK_AND_ASSIGN(auto res, StartAndCollect(plan.get(), sink_gen)); ASSERT_OK_AND_ASSIGN(auto out_table, TableFromExecBatches(exp_batches.schema, res)); - AssertTablesEqual(table, out_table); + AssertTablesEqualIgnoringOrder(table, out_table); } } TEST(ExecPlanExecution, TableSourceSinkError) { ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; auto exp_batches = MakeBasicBatches(); ASSERT_OK_AND_ASSIGN(auto table, @@ -296,8 +295,128 @@ TEST(ExecPlanExecution, TableSourceSinkError) { Raises(StatusCode::Invalid, HasSubstr("batch_size > 0"))); } +template +void TestSourceSinkError( + std::string source_factory_name, + std::function>(const BatchesWithSchema&)> + to_elements) { + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + std::shared_ptr no_schema; + + auto exp_batches = MakeBasicBatches(); + ASSERT_OK_AND_ASSIGN(auto elements, to_elements(exp_batches)); + auto element_it_maker = [&elements]() { + return MakeVectorIterator(elements); + }; + + auto null_executor_options = OptionsType{exp_batches.schema, element_it_maker}; + ASSERT_OK(MakeExecNode(source_factory_name, plan.get(), {}, null_executor_options)); + + auto null_schema_options = OptionsType{no_schema, element_it_maker}; + ASSERT_THAT(MakeExecNode(source_factory_name, plan.get(), {}, null_schema_options), + Raises(StatusCode::Invalid, HasSubstr("not null"))); +} + +template +void TestSourceSink( + std::string source_factory_name, + std::function>(const BatchesWithSchema&)> + to_elements) { + ASSERT_OK_AND_ASSIGN(auto executor, arrow::internal::ThreadPool::Make(1)); + ExecContext exec_context(default_memory_pool(), executor.get()); + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(exec_context)); + AsyncGenerator> sink_gen; + + auto exp_batches = MakeBasicBatches(); + 
ASSERT_OK_AND_ASSIGN(auto elements, to_elements(exp_batches)); + auto element_it_maker = [&elements]() { + return MakeVectorIterator(elements); + }; + + ASSERT_OK(Declaration::Sequence({ + {source_factory_name, + OptionsType{exp_batches.schema, element_it_maker}}, + {"sink", SinkNodeOptions{&sink_gen}}, + }) + .AddToPlan(plan.get())); + + ASSERT_THAT(StartAndCollect(plan.get(), sink_gen), + Finishes(ResultWith(UnorderedElementsAreArray(exp_batches.batches)))); +} + +void TestRecordBatchReaderSourceSink( + std::function>(const BatchesWithSchema&)> + to_reader) { + for (bool parallel : {false, true}) { + SCOPED_TRACE(parallel ? "parallel/merged" : "serial"); + auto exp_batches = MakeBasicBatches(); + ASSERT_OK_AND_ASSIGN(std::shared_ptr reader, + to_reader(exp_batches)); + RecordBatchReaderSourceNodeOptions options{reader}; + Declaration plan("record_batch_reader_source", std::move(options)); + ASSERT_OK_AND_ASSIGN(auto result, DeclarationToExecBatches(plan, parallel)); + AssertExecBatchesEqualIgnoringOrder(result.schema, result.batches, + exp_batches.batches); + } +} + +void TestRecordBatchReaderSourceSinkError( + std::function>(const BatchesWithSchema&)> + to_reader) { + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + auto source_factory_name = "record_batch_reader_source"; + auto exp_batches = MakeBasicBatches(); + ASSERT_OK_AND_ASSIGN(std::shared_ptr reader, to_reader(exp_batches)); + + auto null_executor_options = RecordBatchReaderSourceNodeOptions{reader}; + ASSERT_OK(MakeExecNode(source_factory_name, plan.get(), {}, null_executor_options)); + + std::shared_ptr no_reader; + auto null_reader_options = RecordBatchReaderSourceNodeOptions{no_reader}; + ASSERT_THAT(MakeExecNode(source_factory_name, plan.get(), {}, null_reader_options), + Raises(StatusCode::Invalid, HasSubstr("not null"))); +} + +TEST(ExecPlanExecution, ArrayVectorSourceSink) { + TestSourceSink, ArrayVectorSourceNodeOptions>( + "array_vector_source", ToArrayVectors); +} + +TEST(ExecPlanExecution, ArrayVectorSourceSinkError) { + TestSourceSinkError, ArrayVectorSourceNodeOptions>( + "array_vector_source", ToArrayVectors); +} + +TEST(ExecPlanExecution, ExecBatchSourceSink) { + TestSourceSink, ExecBatchSourceNodeOptions>( + "exec_batch_source", ToExecBatches); +} + +TEST(ExecPlanExecution, ExecBatchSourceSinkError) { + TestSourceSinkError, ExecBatchSourceNodeOptions>( + "exec_batch_source", ToExecBatches); +} + +TEST(ExecPlanExecution, RecordBatchSourceSink) { + TestSourceSink, RecordBatchSourceNodeOptions>( + "record_batch_source", ToRecordBatches); +} + +TEST(ExecPlanExecution, RecordBatchSourceSinkError) { + TestSourceSinkError, RecordBatchSourceNodeOptions>( + "record_batch_source", ToRecordBatches); +} + +TEST(ExecPlanExecution, RecordBatchReaderSourceSink) { + TestRecordBatchReaderSourceSink(ToRecordBatchReader); +} + +TEST(ExecPlanExecution, RecordBatchReaderSourceSinkError) { + TestRecordBatchReaderSourceSinkError(ToRecordBatchReader); +} + TEST(ExecPlanExecution, SinkNodeBackpressure) { - util::optional batch = + std::optional batch = ExecBatchFromJSON({int32(), boolean()}, "[[4, false], [5, null], [6, false], [7, false], [null, true]]"); constexpr uint32_t kPauseIfAbove = 4; @@ -307,18 +426,19 @@ TEST(ExecPlanExecution, SinkNodeBackpressure) { uint32_t resume_if_below_bytes = kResumeIfBelow * static_cast(batch->TotalBufferSize()); EXPECT_OK_AND_ASSIGN(std::shared_ptr plan, ExecPlan::Make()); - PushGenerator> batch_producer; - AsyncGenerator> sink_gen; + PushGenerator> batch_producer; + AsyncGenerator> sink_gen; 
BackpressureMonitor* backpressure_monitor; BackpressureOptions backpressure_options(resume_if_below_bytes, pause_if_above_bytes); std::shared_ptr schema_ = schema({field("data", uint32())}); - ARROW_EXPECT_OK(compute::Declaration::Sequence( - { - {"source", SourceNodeOptions(schema_, batch_producer)}, - {"sink", SinkNodeOptions{&sink_gen, backpressure_options, - &backpressure_monitor}}, - }) - .AddToPlan(plan.get())); + ARROW_EXPECT_OK( + compute::Declaration::Sequence( + { + {"source", SourceNodeOptions(schema_, batch_producer)}, + {"sink", SinkNodeOptions{&sink_gen, /*schema=*/nullptr, + backpressure_options, &backpressure_monitor}}, + }) + .AddToPlan(plan.get())); ASSERT_TRUE(backpressure_monitor); ARROW_EXPECT_OK(plan->StartProducing()); @@ -349,14 +469,14 @@ TEST(ExecPlanExecution, SinkNodeBackpressure) { ASSERT_FALSE(backpressure_monitor->is_paused()); // Cleanup - batch_producer.producer().Push(IterationEnd>()); + batch_producer.producer().Push(IterationEnd>()); plan->StopProducing(); ASSERT_FINISHES_OK(plan->finished()); } TEST(ExecPlan, ToString) { auto basic_data = MakeBasicBatches(); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); ASSERT_OK(Declaration::Sequence( @@ -462,7 +582,7 @@ TEST(ExecPlanExecution, SourceOrderBy) { SCOPED_TRACE(parallel ? "parallel" : "single threaded"); ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; auto basic_data = MakeBasicBatches(); @@ -483,16 +603,16 @@ TEST(ExecPlanExecution, SourceOrderBy) { TEST(ExecPlanExecution, SourceSinkError) { ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; auto basic_data = MakeBasicBatches(); auto it = basic_data.batches.begin(); - AsyncGenerator> error_source_gen = - [&]() -> Result> { + AsyncGenerator> error_source_gen = + [&]() -> Result> { if (it == basic_data.batches.end()) { return Status::Invalid("Artificial error"); } - return util::make_optional(*it++); + return std::make_optional(*it++); }; ASSERT_OK(Declaration::Sequence( @@ -520,7 +640,7 @@ TEST(ExecPlanExecution, SourceConsumingSink) { : batches_seen(batches_seen), finish(std::move(finish)) {} Status Init(const std::shared_ptr& schema, - BackpressureControl* backpressure_control) override { + BackpressureControl* backpressure_control, ExecPlan* plan) override { return Status::OK(); } @@ -548,12 +668,12 @@ TEST(ExecPlanExecution, SourceConsumingSink) { // Source should finish fairly quickly ASSERT_FINISHES_OK(source->finished()); SleepABit(); - ASSERT_EQ(2, batches_seen); // Consumer isn't finished and so plan shouldn't have finished AssertNotFinished(plan->finished()); // Mark consumption complete, plan should finish finish.MarkFinished(); ASSERT_FINISHES_OK(plan->finished()); + ASSERT_EQ(2, batches_seen); } } } @@ -566,7 +686,7 @@ TEST(ExecPlanExecution, SourceTableConsumingSink) { SCOPED_TRACE(parallel ? "parallel" : "single threaded"); ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - std::shared_ptr
<Table> out; + std::shared_ptr<Table>
out = nullptr; auto basic_data = MakeBasicBatches(); @@ -581,11 +701,11 @@ TEST(ExecPlanExecution, SourceTableConsumingSink) { // Source should finish fairly quickly ASSERT_FINISHES_OK(source->finished()); SleepABit(); - ASSERT_OK_AND_ASSIGN(auto actual, + ASSERT_OK_AND_ASSIGN(auto expected, TableFromExecBatches(basic_data.schema, basic_data.batches)); - ASSERT_EQ(5, out->num_rows()); - AssertTablesEqual(*actual, *out); ASSERT_FINISHES_OK(plan->finished()); + ASSERT_EQ(5, out->num_rows()); + AssertTablesEqualIgnoringOrder(expected, out); } } } @@ -594,7 +714,7 @@ TEST(ExecPlanExecution, ConsumingSinkNames) { struct SchemaKeepingConsumer : public SinkNodeConsumer { std::shared_ptr schema_; Status Init(const std::shared_ptr& schema, - BackpressureControl* backpressure_control) override { + BackpressureControl* backpressure_control, ExecPlan* plan) override { schema_ = schema; return Status::OK(); } @@ -632,7 +752,7 @@ TEST(ExecPlanExecution, ConsumingSinkNames) { TEST(ExecPlanExecution, ConsumingSinkError) { struct InitErrorConsumer : public SinkNodeConsumer { Status Init(const std::shared_ptr& schema, - BackpressureControl* backpressure_control) override { + BackpressureControl* backpressure_control, ExecPlan* plan) override { return Status::Invalid("XYZ"); } Status Consume(ExecBatch batch) override { return Status::OK(); } @@ -640,7 +760,7 @@ TEST(ExecPlanExecution, ConsumingSinkError) { }; struct ConsumeErrorConsumer : public SinkNodeConsumer { Status Init(const std::shared_ptr& schema, - BackpressureControl* backpressure_control) override { + BackpressureControl* backpressure_control, ExecPlan* plan) override { return Status::OK(); } Status Consume(ExecBatch batch) override { return Status::Invalid("XYZ"); } @@ -648,7 +768,7 @@ TEST(ExecPlanExecution, ConsumingSinkError) { }; struct FinishErrorConsumer : public SinkNodeConsumer { Status Init(const std::shared_ptr& schema, - BackpressureControl* backpressure_control) override { + BackpressureControl* backpressure_control, ExecPlan* plan) override { return Status::OK(); } Status Consume(ExecBatch batch) override { return Status::OK(); } @@ -659,27 +779,12 @@ TEST(ExecPlanExecution, ConsumingSinkError) { std::make_shared()}; for (auto& consumer : consumers) { - ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); auto basic_data = MakeBasicBatches(); - ASSERT_OK(Declaration::Sequence( - {{"source", - SourceNodeOptions(basic_data.schema, basic_data.gen(false, false))}, - {"consuming_sink", ConsumingSinkNodeOptions(consumer)}}) - .AddToPlan(plan.get())); - ASSERT_OK_AND_ASSIGN( - auto source, - MakeExecNode("source", plan.get(), {}, - SourceNodeOptions(basic_data.schema, basic_data.gen(false, false)))); - ASSERT_OK(MakeExecNode("consuming_sink", plan.get(), {source}, - ConsumingSinkNodeOptions(consumer))); - // If we fail at init we see it during StartProducing. Other - // failures are not seen until we start running. 
- if (std::dynamic_pointer_cast(consumer)) { - ASSERT_RAISES(Invalid, plan->StartProducing()); - } else { - ASSERT_OK(plan->StartProducing()); - ASSERT_FINISHES_AND_RAISES(Invalid, plan->finished()); - } + Declaration plan = Declaration::Sequence( + {{"source", SourceNodeOptions(basic_data.schema, basic_data.gen(false, false))}, + {"consuming_sink", ConsumingSinkNodeOptions(consumer)}}); + // Since the source node is not parallel the entire plan is run during StartProducing + ASSERT_RAISES(Invalid, DeclarationToStatus(std::move(plan))); } } @@ -693,7 +798,7 @@ TEST(ExecPlanExecution, StressSourceSink) { int num_batches = (slow && !parallel) ? 30 : 300; ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; auto random_data = MakeRandomBatches( schema({field("a", int32()), field("b", boolean())}), num_batches); @@ -723,7 +828,7 @@ TEST(ExecPlanExecution, StressSourceOrderBy) { int num_batches = (slow && !parallel) ? 30 : 300; ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; auto random_data = MakeRandomBatches(input_schema, num_batches); @@ -744,7 +849,9 @@ TEST(ExecPlanExecution, StressSourceOrderBy) { TableFromExecBatches(input_schema, random_data.batches)); ASSERT_OK_AND_ASSIGN(auto sort_indices, SortIndices(original, options)); ASSERT_OK_AND_ASSIGN(auto expected, Take(original, sort_indices)); - AssertTablesEqual(*actual, *expected.table()); + AssertSchemaEqual(actual->schema(), expected.table()->schema()); + AssertArraysEqual(*actual->column(0)->chunk(0), + *expected.table()->column(0)->chunk(0)); } } } @@ -760,7 +867,7 @@ TEST(ExecPlanExecution, StressSourceGroupedSumStop) { int num_batches = (slow && !parallel) ? 30 : 300; ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; auto random_data = MakeRandomBatches(input_schema, num_batches); @@ -795,7 +902,7 @@ TEST(ExecPlanExecution, StressSourceSinkStopped) { int num_batches = (slow && !parallel) ? 30 : 300; ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; auto random_data = MakeRandomBatches( schema({field("a", int32()), field("b", boolean())}), num_batches); @@ -823,7 +930,7 @@ TEST(ExecPlanExecution, SourceFilterSink) { auto basic_data = MakeBasicBatches(); ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; ASSERT_OK(Declaration::Sequence( { @@ -845,7 +952,7 @@ TEST(ExecPlanExecution, SourceProjectSink) { auto basic_data = MakeBasicBatches(); ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; ASSERT_OK(Declaration::Sequence( { @@ -905,31 +1012,30 @@ BatchesWithSchema MakeGroupableBatches(int multiplicity = 1) { } // namespace TEST(ExecPlanExecution, SourceGroupedSum) { + std::shared_ptr out_schema = + schema({field("sum(i32)", int64()), field("str", utf8())}); + const std::shared_ptr
<Table> expected_parallel = + TableFromJSON(out_schema, {R"([[800, "alfa"], [1000, "beta"], [400, "gama"]])"}); + const std::shared_ptr<Table>
expected_single = + TableFromJSON(out_schema, {R"([[8, "alfa"], [10, "beta"], [4, "gama"]])"}); + for (bool parallel : {false, true}) { SCOPED_TRACE(parallel ? "parallel/merged" : "serial"); auto input = MakeGroupableBatches(/*multiplicity=*/parallel ? 100 : 1); - ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + Declaration plan = Declaration::Sequence( + {{"source", SourceNodeOptions{input.schema, input.gen(parallel, /*slow=*/false)}}, + {"aggregate", + AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, "i32", "sum(i32)"}}, + /*keys=*/{"str"}}}}); - ASSERT_OK( - Declaration::Sequence( - { - {"source", - SourceNodeOptions{input.schema, input.gen(parallel, /*slow=*/false)}}, - {"aggregate", AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, - "i32", "sum(i32)"}}, - /*keys=*/{"str"}}}, - {"sink", SinkNodeOptions{&sink_gen}}, - }) - .AddToPlan(plan.get())); + ASSERT_OK_AND_ASSIGN(std::shared_ptr
actual, + DeclarationToTable(std::move(plan), parallel)); - ASSERT_THAT(StartAndCollect(plan.get(), sink_gen), - Finishes(ResultWith(UnorderedElementsAreArray({ExecBatchFromJSON( - {int64(), utf8()}, - parallel ? R"([[800, "alfa"], [1000, "beta"], [400, "gama"]])" - : R"([[8, "alfa"], [10, "beta"], [4, "gama"]])")})))); + auto expected = parallel ? expected_parallel : expected_single; + + AssertTablesEqualIgnoringOrder(expected, actual); } } @@ -945,7 +1051,7 @@ TEST(ExecPlanExecution, SourceMinMaxScalar) { R"({"min": -8, "max": 12})")}); ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; // NOTE: Test `ScalarAggregateNode` by omitting `keys` attribute ASSERT_OK(Declaration::Sequence( @@ -976,7 +1082,7 @@ TEST(ExecPlanExecution, NestedSourceFilter) { ])"); ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; ASSERT_OK(Declaration::Sequence( { @@ -998,34 +1104,26 @@ TEST(ExecPlanExecution, NestedSourceProjectGroupedSum) { SCOPED_TRACE(parallel ? "parallel/merged" : "serial"); auto input = MakeNestedBatches(); - auto expected = ExecBatchFromJSON({int64(), boolean()}, R"([ + auto expected = + TableFromJSON(schema({field("x", int64()), field("y", boolean())}), {R"([ [null, true], [17, false], [5, null] -])"); - - ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; +])"}); - ASSERT_OK( - Declaration::Sequence( - { - {"source", - SourceNodeOptions{input.schema, input.gen(parallel, /*slow=*/false)}}, - {"project", ProjectNodeOptions{{ - field_ref(FieldRef("struct", "i32")), - field_ref(FieldRef("struct", "bool")), - }, - {"i32", "bool"}}}, - {"aggregate", AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, - "i32", "sum(i32)"}}, - /*keys=*/{"bool"}}}, - {"sink", SinkNodeOptions{&sink_gen}}, - }) - .AddToPlan(plan.get())); + Declaration plan = Declaration::Sequence( + {{"source", SourceNodeOptions{input.schema, input.gen(parallel, /*slow=*/false)}}, + {"project", ProjectNodeOptions{{ + field_ref(FieldRef("struct", "i32")), + field_ref(FieldRef("struct", "bool")), + }, + {"i32", "bool"}}}, + {"aggregate", + AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, "i32", "sum(i32)"}}, + /*keys=*/{"bool"}}}}); - ASSERT_THAT(StartAndCollect(plan.get(), sink_gen), - Finishes(ResultWith(UnorderedElementsAreArray({expected})))); + ASSERT_OK_AND_ASSIGN(auto actual, DeclarationToTable(std::move(plan), parallel)); + AssertTablesEqualIgnoringOrder(expected, actual); } } @@ -1036,35 +1134,25 @@ TEST(ExecPlanExecution, SourceFilterProjectGroupedSumFilter) { int batch_multiplicity = parallel ? 
100 : 1; auto input = MakeGroupableBatches(/*multiplicity=*/batch_multiplicity); - ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; - - ASSERT_OK( - Declaration::Sequence( - { - {"source", - SourceNodeOptions{input.schema, input.gen(parallel, /*slow=*/false)}}, - {"filter", - FilterNodeOptions{greater_equal(field_ref("i32"), literal(0))}}, - {"project", ProjectNodeOptions{{ - field_ref("str"), - call("multiply", {field_ref("i32"), literal(2)}), - }}}, - {"aggregate", - AggregateNodeOptions{ - /*aggregates=*/{{"hash_sum", nullptr, "multiply(i32, 2)", - "sum(multiply(i32, 2))"}}, - /*keys=*/{"str"}}}, - {"filter", FilterNodeOptions{greater(field_ref("sum(multiply(i32, 2))"), - literal(10 * batch_multiplicity))}}, - {"sink", SinkNodeOptions{&sink_gen}}, - }) - .AddToPlan(plan.get())); - - ASSERT_THAT(StartAndCollect(plan.get(), sink_gen), - Finishes(ResultWith(UnorderedElementsAreArray({ExecBatchFromJSON( - {int64(), utf8()}, parallel ? R"([[3600, "alfa"], [2000, "beta"]])" - : R"([[36, "alfa"], [20, "beta"]])")})))); + Declaration plan = Declaration::Sequence( + {{"source", SourceNodeOptions{input.schema, input.gen(parallel, /*slow=*/false)}}, + {"filter", FilterNodeOptions{greater_equal(field_ref("i32"), literal(0))}}, + {"project", ProjectNodeOptions{{ + field_ref("str"), + call("multiply", {field_ref("i32"), literal(2)}), + }}}, + {"aggregate", + AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, "multiply(i32, 2)", + "sum(multiply(i32, 2))"}}, + /*keys=*/{"str"}}}, + {"filter", FilterNodeOptions{greater(field_ref("sum(multiply(i32, 2))"), + literal(10 * batch_multiplicity))}}}); + + auto expected = TableFromJSON(schema({field("a", int64()), field("b", utf8())}), + {parallel ? R"([[3600, "alfa"], [2000, "beta"]])" + : R"([[36, "alfa"], [20, "beta"]])"}); + ASSERT_OK_AND_ASSIGN(auto actual, DeclarationToTable(std::move(plan), parallel)); + AssertTablesEqualIgnoringOrder(expected, actual); } } @@ -1076,7 +1164,7 @@ TEST(ExecPlanExecution, SourceFilterProjectGroupedSumOrderBy) { auto input = MakeGroupableBatches(/*multiplicity=*/batch_multiplicity); ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; SortOptions options({SortKey("str", SortOrder::Descending)}); ASSERT_OK( @@ -1116,7 +1204,7 @@ TEST(ExecPlanExecution, SourceFilterProjectGroupedSumTopK) { auto input = MakeGroupableBatches(/*multiplicity=*/batch_multiplicity); ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; SelectKOptions options = SelectKOptions::TopKDefault(/*k=*/1, {"str"}); ASSERT_OK(Declaration::Sequence( @@ -1145,7 +1233,7 @@ TEST(ExecPlanExecution, SourceFilterProjectGroupedSumTopK) { TEST(ExecPlanExecution, SourceScalarAggSink) { ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; auto basic_data = MakeBasicBatches(); @@ -1174,59 +1262,46 @@ TEST(ExecPlanExecution, AggregationPreservesOptions) { // ARROW-13638: aggregation nodes initialize per-thread kernel state lazily // and need to keep a copy/strong reference to function options { - ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; - auto basic_data = MakeBasicBatches(); - + Future> table_future; { auto options = std::make_shared(TDigestOptions::Defaults()); - ASSERT_OK(Declaration::Sequence( - { - {"source", SourceNodeOptions{basic_data.schema, - basic_data.gen(/*parallel=*/false, - /*slow=*/false)}}, - {"aggregate", - 
AggregateNodeOptions{ + Declaration plan = Declaration::Sequence( + {{"source", + SourceNodeOptions{basic_data.schema, basic_data.gen(/*parallel=*/false, + /*slow=*/false)}}, + {"aggregate", AggregateNodeOptions{ /*aggregates=*/{{"tdigest", options, "i32", "tdigest(i32)"}}, - }}, - {"sink", SinkNodeOptions{&sink_gen}}, - }) - .AddToPlan(plan.get())); + }}}); + table_future = DeclarationToTableAsync(std::move(plan)); } - ASSERT_THAT(StartAndCollect(plan.get(), sink_gen), - Finishes(ResultWith(UnorderedElementsAreArray({ - ExecBatchFromJSON({float64()}, "[[5.5]]"), - })))); + std::shared_ptr
<Table> expected = + TableFromJSON(schema({field("tdigest(i32)", float64())}), {"[[5.5]]"}); + + ASSERT_FINISHES_OK_AND_ASSIGN(std::shared_ptr<Table>
actual, table_future); + AssertTablesEqualIgnoringOrder(expected, actual); } { - ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; - auto data = MakeGroupableBatches(/*multiplicity=*/100); - + Future> table_future; { auto options = std::make_shared(CountOptions::Defaults()); - ASSERT_OK( - Declaration::Sequence( - { - {"source", SourceNodeOptions{data.schema, data.gen(/*parallel=*/false, - /*slow=*/false)}}, - {"aggregate", - AggregateNodeOptions{ - /*aggregates=*/{{"hash_count", options, "i32", "count(i32)"}}, - /*keys=*/{"str"}}}, - {"sink", SinkNodeOptions{&sink_gen}}, - }) - .AddToPlan(plan.get())); + Declaration plan = Declaration::Sequence( + {{"source", SourceNodeOptions{data.schema, data.gen(/*parallel=*/false, + /*slow=*/false)}}, + {"aggregate", AggregateNodeOptions{/*aggregates=*/{{"hash_count", options, + "i32", "count(i32)"}}, + /*keys=*/{"str"}}}}); + table_future = DeclarationToTableAsync(std::move(plan)); } - ASSERT_THAT(StartAndCollect(plan.get(), sink_gen), - Finishes(ResultWith(UnorderedElementsAreArray({ - ExecBatchFromJSON({int64(), utf8()}, - R"([[500, "alfa"], [200, "beta"], [200, "gama"]])"), - })))); + std::shared_ptr
<Table> expected = + TableFromJSON(schema({field("count(i32)", int64()), field("str", utf8())}), + {R"([[500, "alfa"], [200, "beta"], [200, "gama"]])"}); + + ASSERT_FINISHES_OK_AND_ASSIGN(std::shared_ptr<Table>
actual, table_future); + AssertTablesEqualIgnoringOrder(expected, actual); } } @@ -1234,7 +1309,7 @@ TEST(ExecPlanExecution, ScalarSourceScalarAggSink) { // ARROW-9056: scalar aggregation can be done over scalars, taking // into account batch.length > 1 (e.g. a partition column) ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; BatchesWithSchema scalar_data; scalar_data.batches = { @@ -1278,9 +1353,10 @@ TEST(ExecPlanExecution, ScalarSourceScalarAggSink) { } TEST(ExecPlanExecution, ScalarSourceGroupedSum) { - // ARROW-14630: ensure grouped aggregation with a scalar key/array input doesn't error + // ARROW-14630: ensure grouped aggregation with a scalar key/array input doesn't + // error ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; BatchesWithSchema scalar_data; scalar_data.batches = { @@ -1317,28 +1393,13 @@ TEST(ExecPlanExecution, SelfInnerHashJoinSink) { auto input = MakeGroupableBatches(); - auto exec_ctx = arrow::internal::make_unique( - default_memory_pool(), parallel ? arrow::internal::GetCpuThreadPool() : nullptr); + auto left = Declaration::Sequence( + {{"source", SourceNodeOptions{input.schema, input.gen(parallel, /*slow=*/false)}}, + {"filter", FilterNodeOptions{greater_equal(field_ref("i32"), literal(-1))}}}); - ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(exec_ctx.get())); - AsyncGenerator> sink_gen; - - ExecNode* left_source; - ExecNode* right_source; - for (auto source : {&left_source, &right_source}) { - ASSERT_OK_AND_ASSIGN( - *source, MakeExecNode("source", plan.get(), {}, - SourceNodeOptions{input.schema, - input.gen(parallel, /*slow=*/false)})); - } - ASSERT_OK_AND_ASSIGN( - auto left_filter, - MakeExecNode("filter", plan.get(), {left_source}, - FilterNodeOptions{greater_equal(field_ref("i32"), literal(-1))})); - ASSERT_OK_AND_ASSIGN( - auto right_filter, - MakeExecNode("filter", plan.get(), {right_source}, - FilterNodeOptions{less_equal(field_ref("i32"), literal(2))})); + auto right = Declaration::Sequence( + {{"source", SourceNodeOptions{input.schema, input.gen(parallel, /*slow=*/false)}}, + {"filter", FilterNodeOptions{less_equal(field_ref("i32"), literal(2))}}}); // left side: [3, "alfa"], [3, "alfa"], [12, "alfa"], [3, "beta"], [7, "beta"], // [-1, "gama"], [5, "gama"] @@ -1348,14 +1409,9 @@ TEST(ExecPlanExecution, SelfInnerHashJoinSink) { /*left_keys=*/{"str"}, /*right_keys=*/{"str"}, literal(true), "l_", "r_"}; - ASSERT_OK_AND_ASSIGN( - auto hashjoin, - MakeExecNode("hashjoin", plan.get(), {left_filter, right_filter}, join_opts)); - - ASSERT_OK_AND_ASSIGN(std::ignore, MakeExecNode("sink", plan.get(), {hashjoin}, - SinkNodeOptions{&sink_gen})); + auto plan = Declaration("hashjoin", {left, right}, std::move(join_opts)); - ASSERT_FINISHES_OK_AND_ASSIGN(auto result, StartAndCollect(plan.get(), sink_gen)); + ASSERT_OK_AND_ASSIGN(auto result, DeclarationToExecBatches(plan, parallel)); std::vector expected = { ExecBatchFromJSON({int32(), utf8(), int32(), utf8()}, R"([ @@ -1364,7 +1420,7 @@ TEST(ExecPlanExecution, SelfInnerHashJoinSink) { [12, "alfa", -2, "alfa"], [12, "alfa", -8, "alfa"], [-1, "gama", -1, "gama"], [5, "gama", -1, "gama"]])")}; - AssertExecBatchesEqual(hashjoin->output_schema(), result, expected); + AssertExecBatchesEqualIgnoringOrder(result.schema, result.batches, expected); } } @@ -1374,28 +1430,13 @@ TEST(ExecPlanExecution, SelfOuterHashJoinSink) { auto input = MakeGroupableBatches(); - auto exec_ctx = arrow::internal::make_unique( - 
default_memory_pool(), parallel ? arrow::internal::GetCpuThreadPool() : nullptr); + auto left = Declaration::Sequence( + {{"source", SourceNodeOptions{input.schema, input.gen(parallel, /*slow=*/false)}}, + {"filter", FilterNodeOptions{greater_equal(field_ref("i32"), literal(-1))}}}); - ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(exec_ctx.get())); - AsyncGenerator> sink_gen; - - ExecNode* left_source; - ExecNode* right_source; - for (auto source : {&left_source, &right_source}) { - ASSERT_OK_AND_ASSIGN( - *source, MakeExecNode("source", plan.get(), {}, - SourceNodeOptions{input.schema, - input.gen(parallel, /*slow=*/false)})); - } - ASSERT_OK_AND_ASSIGN( - auto left_filter, - MakeExecNode("filter", plan.get(), {left_source}, - FilterNodeOptions{greater_equal(field_ref("i32"), literal(-1))})); - ASSERT_OK_AND_ASSIGN( - auto right_filter, - MakeExecNode("filter", plan.get(), {right_source}, - FilterNodeOptions{less_equal(field_ref("i32"), literal(2))})); + auto right = Declaration::Sequence( + {{"source", SourceNodeOptions{input.schema, input.gen(parallel, /*slow=*/false)}}, + {"filter", FilterNodeOptions{less_equal(field_ref("i32"), literal(2))}}}); // left side: [3, "alfa"], [3, "alfa"], [12, "alfa"], [3, "beta"], [7, "beta"], // [-1, "gama"], [5, "gama"] @@ -1405,14 +1446,9 @@ TEST(ExecPlanExecution, SelfOuterHashJoinSink) { /*left_keys=*/{"str"}, /*right_keys=*/{"str"}, literal(true), "l_", "r_"}; - ASSERT_OK_AND_ASSIGN( - auto hashjoin, - MakeExecNode("hashjoin", plan.get(), {left_filter, right_filter}, join_opts)); - - ASSERT_OK_AND_ASSIGN(std::ignore, MakeExecNode("sink", plan.get(), {hashjoin}, - SinkNodeOptions{&sink_gen})); + auto plan = Declaration("hashjoin", {left, right}, std::move(join_opts)); - ASSERT_FINISHES_OK_AND_ASSIGN(auto result, StartAndCollect(plan.get(), sink_gen)); + ASSERT_OK_AND_ASSIGN(auto result, DeclarationToExecBatches(plan, parallel)); std::vector expected = { ExecBatchFromJSON({int32(), utf8(), int32(), utf8()}, R"([ @@ -1422,13 +1458,13 @@ TEST(ExecPlanExecution, SelfOuterHashJoinSink) { [3, "beta", null, null], [7, "beta", null, null], [-1, "gama", -1, "gama"], [5, "gama", -1, "gama"]])")}; - AssertExecBatchesEqual(hashjoin->output_schema(), result, expected); + AssertExecBatchesEqualIgnoringOrder(result.schema, result.batches, expected); } } TEST(ExecPlan, RecordBatchReaderSourceSink) { ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; // set up a RecordBatchReader: auto input = MakeBasicBatches(); @@ -1464,7 +1500,7 @@ TEST(ExecPlan, SourceEnforcesBatchLimit) { schema({field("a", int32()), field("b", boolean())}), /*num_batches=*/3, /*batch_size=*/static_cast(std::floor(ExecPlan::kMaxBatchSize * 3.5))); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; ASSERT_OK(Declaration::Sequence( { diff --git a/cpp/src/arrow/compute/exec/project_benchmark.cc b/cpp/src/arrow/compute/exec/project_benchmark.cc index cb4fdc4ffdf..9414fa89059 100644 --- a/cpp/src/arrow/compute/exec/project_benchmark.cc +++ b/cpp/src/arrow/compute/exec/project_benchmark.cc @@ -44,11 +44,10 @@ static void ProjectionOverhead(benchmark::State& state, Expression expr) { arrow::compute::BatchesWithSchema data = MakeRandomBatches( schema({field("i64", int64()), field("bool", boolean())}), num_batches, batch_size); - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); std::vector project_node_dec = { {"project", ProjectNodeOptions{{expr}}}}; ASSERT_OK( - BenchmarkNodeOverhead(state, ctx, num_batches, 
batch_size, data, project_node_dec)); + BenchmarkNodeOverhead(state, num_batches, batch_size, data, project_node_dec)); } static void ProjectionOverheadIsolated(benchmark::State& state, Expression expr) { @@ -57,9 +56,8 @@ static void ProjectionOverheadIsolated(benchmark::State& state, Expression expr) arrow::compute::BatchesWithSchema data = MakeRandomBatches( schema({field("i64", int64()), field("bool", boolean())}), num_batches, batch_size); - ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); ProjectNodeOptions options = ProjectNodeOptions{{expr}}; - ASSERT_OK(BenchmarkIsolatedNodeOverhead(state, ctx, expr, num_batches, batch_size, data, + ASSERT_OK(BenchmarkIsolatedNodeOverhead(state, expr, num_batches, batch_size, data, "project", options)); } diff --git a/cpp/src/arrow/compute/exec/project_node.cc b/cpp/src/arrow/compute/exec/project_node.cc index 76925eb6139..5e8c2245a2b 100644 --- a/cpp/src/arrow/compute/exec/project_node.cc +++ b/cpp/src/arrow/compute/exec/project_node.cc @@ -21,7 +21,9 @@ #include "arrow/compute/exec.h" #include "arrow/compute/exec/exec_plan.h" #include "arrow/compute/exec/expression.h" +#include "arrow/compute/exec/map_node.h" #include "arrow/compute/exec/options.h" +#include "arrow/compute/exec/query_context.h" #include "arrow/compute/exec/util.h" #include "arrow/datum.h" #include "arrow/result.h" @@ -40,9 +42,8 @@ namespace { class ProjectNode : public MapNode { public: ProjectNode(ExecPlan* plan, std::vector inputs, - std::shared_ptr output_schema, std::vector exprs, - bool async_mode) - : MapNode(plan, std::move(inputs), std::move(output_schema), async_mode), + std::shared_ptr output_schema, std::vector exprs) + : MapNode(plan, std::move(inputs), std::move(output_schema)), exprs_(std::move(exprs)) {} static Result Make(ExecPlan* plan, std::vector inputs, @@ -64,15 +65,14 @@ class ProjectNode : public MapNode { int i = 0; for (auto& expr : exprs) { if (!expr.IsBound()) { - ARROW_ASSIGN_OR_RAISE( - expr, expr.Bind(*inputs[0]->output_schema(), plan->exec_context())); + ARROW_ASSIGN_OR_RAISE(expr, expr.Bind(*inputs[0]->output_schema(), + plan->query_context()->exec_context())); } fields[i] = field(std::move(names[i]), expr.type()->GetSharedPtr()); ++i; } return plan->EmplaceNode(plan, std::move(inputs), - schema(std::move(fields)), std::move(exprs), - project_options.async_mode); + schema(std::move(fields)), std::move(exprs)); } const char* kind_name() const override { return "ProjectNode"; } @@ -88,8 +88,9 @@ class ProjectNode : public MapNode { ARROW_ASSIGN_OR_RAISE(Expression simplified_expr, SimplifyWithGuarantee(exprs_[i], target.guarantee)); - ARROW_ASSIGN_OR_RAISE(values[i], ExecuteScalarExpression(simplified_expr, target, - plan()->exec_context())); + ARROW_ASSIGN_OR_RAISE( + values[i], ExecuteScalarExpression(simplified_expr, target, + plan()->query_context()->exec_context())); } return ExecBatch{std::move(values), target.length}; } diff --git a/cpp/src/arrow/compute/exec/query_context.cc b/cpp/src/arrow/compute/exec/query_context.cc new file mode 100644 index 00000000000..a155c750a2a --- /dev/null +++ b/cpp/src/arrow/compute/exec/query_context.cc @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/exec/query_context.h" +#include "arrow/util/cpu_info.h" +#include "arrow/util/io_util.h" + +namespace arrow { +using internal::CpuInfo; +namespace compute { +QueryOptions::QueryOptions() : use_legacy_batching(false) {} + +QueryContext::QueryContext(QueryOptions opts, ExecContext exec_context) + : options_(opts), + exec_context_(exec_context), + io_context_(exec_context_.memory_pool()) {} + +const CpuInfo* QueryContext::cpu_info() const { return CpuInfo::GetInstance(); } +int64_t QueryContext::hardware_flags() const { return cpu_info()->hardware_flags(); } + +Status QueryContext::Init(size_t max_num_threads, util::AsyncTaskScheduler* scheduler) { + tld_.resize(max_num_threads); + async_scheduler_ = scheduler; + return Status::OK(); +} + +size_t QueryContext::GetThreadIndex() { return thread_indexer_(); } + +size_t QueryContext::max_concurrency() const { return thread_indexer_.Capacity(); } + +Result QueryContext::GetTempStack(size_t thread_index) { + if (!tld_[thread_index].is_init) { + RETURN_NOT_OK(tld_[thread_index].stack.Init( + memory_pool(), 8 * util::MiniBatch::kMiniBatchLength * sizeof(uint64_t))); + tld_[thread_index].is_init = true; + } + return &tld_[thread_index].stack; +} + +Result> QueryContext::BeginExternalTask() { + Future<> completion_future = Future<>::Make(); + if (async_scheduler_->AddSimpleTask( + [completion_future] { return completion_future; })) { + return completion_future; + } + return Future<>{}; +} + +Status QueryContext::ScheduleTask(std::function fn) { + ::arrow::internal::Executor* exec = executor(); + // Adds a task which submits fn to the executor and tracks its progress. If we're + // already stopping then the task is ignored and fn is not executed. 
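  // [Editor's note] Illustrative usage sketch, not part of this change: an
  // ExecNode now hands per-batch work to the query context rather than to the
  // plan. "MyNode" and ForwardAsync() below are assumptions for illustration;
  // source_node.cc in this diff uses the same pattern when slicing morsels.
  //
  //   Status MyNode::ForwardAsync(ExecBatch batch) {
  //     return plan_->query_context()->ScheduleTask(
  //         [this, batch]() -> Status {
  //           outputs_[0]->InputReceived(this, batch);  // runs on the executor
  //           return Status::OK();
  //         });
  //   }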
+ async_scheduler_->AddSimpleTask([exec, fn]() { return exec->Submit(std::move(fn)); }); + return Status::OK(); +} + +Status QueryContext::ScheduleTask(std::function fn) { + std::function indexed_fn = [this, fn]() { + size_t thread_index = GetThreadIndex(); + return fn(thread_index); + }; + return ScheduleTask(std::move(indexed_fn)); +} + +Status QueryContext::ScheduleIOTask(std::function fn) { + async_scheduler_->AddSimpleTask( + [this, fn]() { return io_context_.executor()->Submit(std::move(fn)); }); + return Status::OK(); +} + +int QueryContext::RegisterTaskGroup(std::function task, + std::function on_finished) { + return task_scheduler_->RegisterTaskGroup(std::move(task), std::move(on_finished)); +} + +Status QueryContext::StartTaskGroup(int task_group_id, int64_t num_tasks) { + return task_scheduler_->StartTaskGroup(GetThreadIndex(), task_group_id, num_tasks); +} +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/query_context.h b/cpp/src/arrow/compute/exec/query_context.h new file mode 100644 index 00000000000..12ddbc56fad --- /dev/null +++ b/cpp/src/arrow/compute/exec/query_context.h @@ -0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/exec.h" +#include "arrow/compute/exec/task_util.h" +#include "arrow/compute/exec/util.h" +#include "arrow/io/interfaces.h" +#include "arrow/util/async_util.h" + +#pragma once + +namespace arrow { + +using io::IOContext; +namespace compute { +struct ARROW_EXPORT QueryOptions { + QueryOptions(); + + /// \brief Should the plan use a legacy batching strategy + /// + /// This is currently in place only to support the Scanner::ToTable + /// method. This method relies on batch indices from the scanner + /// remaining consistent. This is impractical in the ExecPlan which + /// might slice batches as needed (e.g. for a join) + /// + /// However, it still works for simple plans and this is the only way + /// we have at the moment for maintaining implicit order. 
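  ///
  /// A minimal sketch of opting in (editor's addition, illustrative only):
  ///
  ///   QueryOptions opts;
  ///   opts.use_legacy_batching = true;  // forward source batches unsliced
  ///   QueryContext ctx(opts);           // batch indices then stay stable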
+ bool use_legacy_batching; +}; + +class ARROW_EXPORT QueryContext { + public: + QueryContext(QueryOptions opts = {}, + ExecContext exec_context = *default_exec_context()); + + Status Init(size_t max_num_threads, util::AsyncTaskScheduler* scheduler); + + const ::arrow::internal::CpuInfo* cpu_info() const; + int64_t hardware_flags() const; + const QueryOptions& options() const { return options_; } + MemoryPool* memory_pool() const { return exec_context_.memory_pool(); } + ::arrow::internal::Executor* executor() const { return exec_context_.executor(); } + ExecContext* exec_context() { return &exec_context_; } + IOContext* io_context() { return &io_context_; } + TaskScheduler* scheduler() { return task_scheduler_.get(); } + util::AsyncTaskScheduler* async_scheduler() { return async_scheduler_; } + + size_t GetThreadIndex(); + size_t max_concurrency() const; + Result GetTempStack(size_t thread_index); + + /// \brief Start an external task + /// + /// This should be avoided if possible. It is kept in for now for legacy + /// purposes. This should be called before the external task is started. If + /// a valid future is returned then it should be marked complete when the + /// external task has finished. + /// + /// \return an invalid future if the plan has already ended, otherwise this + /// returns a future that must be completed when the external task + /// finishes. + Result> BeginExternalTask(); + + /// \brief Add a single function as a task to the query's task group + /// on the compute threadpool. + /// + /// \param fn The task to run. Takes no arguments and returns a Status. + Status ScheduleTask(std::function fn); + /// \brief Add a single function as a task to the query's task group + /// on the compute threadpool. + /// + /// \param fn The task to run. Takes the thread index and returns a Status. + Status ScheduleTask(std::function fn); + /// \brief Add a single function as a task to the query's task group on + /// the IO thread pool + /// + /// \param fn The task to run. Returns a status. + Status ScheduleIOTask(std::function fn); + + // Register/Start TaskGroup is a way of performing a "Parallel For" pattern: + // - The task function takes the thread index and the index of the task + // - The on_finished function takes the thread index + // Returns an integer ID that will be used to reference the task group in + // StartTaskGroup. At runtime, call StartTaskGroup with the ID and the number of times + // you'd like the task to be executed. The need to register a task group before use will + // be removed after we rewrite the scheduler. + /// \brief Register a "parallel for" task group with the scheduler + /// + /// \param task The function implementing the task. Takes the thread_index and + /// the task index. + /// \param on_finished The function that gets run once all tasks have been completed. + /// Takes the thread_index. + /// + /// Must be called inside of ExecNode::Init. + int RegisterTaskGroup(std::function task, + std::function on_finished); + + /// \brief Start the task group with the specified ID. This can only + /// be called once per task_group_id. + /// + /// \param task_group_id The ID of the task group to run + /// \param num_tasks The number of times to run the task + Status StartTaskGroup(int task_group_id, int64_t num_tasks); + + // This is an RAII class for keeping track of in-flight file IO. Useful for getting + // an estimate of memory use, and how much memory we expect to be freed soon. + // Returned by ReportTempFileIO. 
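  // [Editor's note] Hedged usage sketch, not part of this diff; SpillBatch and
  // WriteToTempFile are hypothetical helpers:
  //
  //   Status SpillBatch(QueryContext* ctx, const ExecBatch& batch, size_t nbytes) {
  //     auto mark = ctx->ReportTempFileIO(nbytes);  // counted while in scope
  //     return WriteToTempFile(batch);              // dtor subtracts nbytes again
  //   }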
+ struct [[nodiscard]] TempFileIOMark { + QueryContext* ctx_; + size_t bytes_; + + TempFileIOMark(QueryContext* ctx, size_t bytes) : ctx_(ctx), bytes_(bytes) { + ctx_->in_flight_bytes_to_disk_.fetch_add(bytes_, std::memory_order_acquire); + } + + ARROW_DISALLOW_COPY_AND_ASSIGN(TempFileIOMark); + + ~TempFileIOMark() { + ctx_->in_flight_bytes_to_disk_.fetch_sub(bytes_, std::memory_order_release); + } + }; + + TempFileIOMark ReportTempFileIO(size_t bytes) { return {this, bytes}; } + + size_t GetCurrentTempFileIO() { return in_flight_bytes_to_disk_.load(); } + + private: + QueryOptions options_; + // To be replaced with Acero-specific context once scheduler is done and + // we don't need ExecContext for kernels + ExecContext exec_context_; + IOContext io_context_; + + util::AsyncTaskScheduler* async_scheduler_ = NULLPTR; + std::unique_ptr task_scheduler_ = TaskScheduler::Make(); + + ThreadIndexer thread_indexer_; + struct ThreadLocalData { + bool is_init = false; + util::TempVectorStack stack; + }; + std::vector tld_; + + std::atomic in_flight_bytes_to_disk_{0}; +}; +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/sink_node.cc b/cpp/src/arrow/compute/exec/sink_node.cc index a1426265cf9..2ecce751135 100644 --- a/cpp/src/arrow/compute/exec/sink_node.cc +++ b/cpp/src/arrow/compute/exec/sink_node.cc @@ -16,7 +16,9 @@ // specific language governing permissions and limitations // under the License. +#include #include +#include #include "arrow/compute/api_vector.h" #include "arrow/compute/exec.h" @@ -24,6 +26,7 @@ #include "arrow/compute/exec/expression.h" #include "arrow/compute/exec/options.h" #include "arrow/compute/exec/order_by_impl.h" +#include "arrow/compute/exec/query_context.h" #include "arrow/compute/exec/util.h" #include "arrow/compute/exec_internal.h" #include "arrow/datum.h" @@ -34,7 +37,6 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/future.h" #include "arrow/util/logging.h" -#include "arrow/util/optional.h" #include "arrow/util/thread_pool.h" #include "arrow/util/tracing_internal.h" #include "arrow/util/unreachable.h" @@ -54,8 +56,14 @@ class BackpressureReservoir : public BackpressureMonitor { resume_if_below_(resume_if_below), pause_if_above_(pause_if_above) {} - uint64_t bytes_in_use() const override { return bytes_used_; } - bool is_paused() const override { return state_change_counter_ % 2 == 1; } + uint64_t bytes_in_use() override { + std::lock_guard lg(mutex_); + return bytes_used_; + } + bool is_paused() override { + std::lock_guard lg(mutex_); + return state_change_counter_ % 2 == 1; + } bool enabled() const { return pause_if_above_ > 0; } int32_t RecordProduced(uint64_t num_bytes) { @@ -89,8 +97,8 @@ class BackpressureReservoir : public BackpressureMonitor { class SinkNode : public ExecNode { public: SinkNode(ExecPlan* plan, std::vector inputs, - AsyncGenerator>* generator, - BackpressureOptions backpressure, + AsyncGenerator>* generator, + std::shared_ptr* schema, BackpressureOptions backpressure, BackpressureMonitor** backpressure_monitor_out) : ExecNode(plan, std::move(inputs), {"collected"}, {}, /*num_outputs=*/0), @@ -102,12 +110,15 @@ class SinkNode : public ExecNode { *backpressure_monitor_out = &backpressure_queue_; } auto node_destroyed_capture = node_destroyed_; - *generator = [this, node_destroyed_capture]() -> Future> { + if (schema) { + *schema = inputs_[0]->output_schema(); + } + *generator = [this, node_destroyed_capture]() -> Future> { if (*node_destroyed_capture) { return Status::Invalid( "Attempt to consume 
data after the plan has been destroyed"); } - return push_gen_().Then([this](const util::optional& batch) { + return push_gen_().Then([this](const std::optional& batch) { if (batch) { RecordBackpressureBytesFreed(*batch); } @@ -125,7 +136,7 @@ class SinkNode : public ExecNode { const auto& sink_options = checked_cast(options); RETURN_NOT_OK(ValidateOptions(sink_options)); return plan->EmplaceNode(plan, std::move(inputs), sink_options.generator, - sink_options.backpressure, + sink_options.schema, sink_options.backpressure, sink_options.backpressure_monitor); } @@ -247,8 +258,8 @@ class SinkNode : public ExecNode { // Needs to be a shared_ptr as the push generator can technically outlive the node BackpressureReservoir backpressure_queue_; - PushGenerator> push_gen_; - PushGenerator>::Producer producer_; + PushGenerator> push_gen_; + PushGenerator>::Producer producer_; std::shared_ptr node_destroyed_; }; @@ -303,7 +314,7 @@ class ConsumingSinkNode : public ExecNode, public BackpressureControl { } output_schema = schema(std::move(fields)); } - RETURN_NOT_OK(consumer_->Init(output_schema, this)); + RETURN_NOT_OK(consumer_->Init(output_schema, this, plan_)); return Status::OK(); } @@ -325,8 +336,9 @@ class ConsumingSinkNode : public ExecNode, public BackpressureControl { void StopProducing() override { EVENT(span_, "StopProducing"); - Finish(Status::OK()); - inputs_[0]->StopProducing(this); + if (input_counter_.Cancel()) { + Finish(Status::OK()); + } } void InputReceived(ExecNode* input, ExecBatch batch) override { @@ -376,65 +388,24 @@ class ConsumingSinkNode : public ExecNode, public BackpressureControl { protected: void Finish(const Status& finish_st) { - consumer_->Finish().AddCallback([this, finish_st](const Status& st) { - // Prefer the plan error over the consumer error - Status final_status = finish_st & st; - finished_.MarkFinished(std::move(final_status)); - }); + if (finish_st.ok()) { + plan_->query_context()->async_scheduler()->AddSimpleTask( + [this] { return consumer_->Finish(); }); + } + finished_.MarkFinished(finish_st); } AtomicCounter input_counter_; std::shared_ptr consumer_; std::vector names_; - int32_t backpressure_counter_ = 0; + std::atomic backpressure_counter_ = 0; }; - -/** - * @brief This node is an extension on ConsumingSinkNode - * to facilitate to get the output from an execution plan - * as a table. We define a custom SinkNodeConsumer to - * enable this functionality. - */ - -struct TableSinkNodeConsumer : public SinkNodeConsumer { - public: - TableSinkNodeConsumer(std::shared_ptr
* out, MemoryPool* pool) - : out_(out), pool_(pool) {} - - Status Init(const std::shared_ptr& schema, - BackpressureControl* backpressure_control) override { - // If the user is collecting into a table then backpressure is meaningless - ARROW_UNUSED(backpressure_control); - schema_ = schema; - return Status::OK(); - } - - Status Consume(ExecBatch batch) override { - std::lock_guard guard(consume_mutex_); - ARROW_ASSIGN_OR_RAISE(auto rb, batch.ToRecordBatch(schema_, pool_)); - batches_.push_back(rb); - return Status::OK(); - } - - Future<> Finish() override { - ARROW_ASSIGN_OR_RAISE(*out_, Table::FromRecordBatches(batches_)); - return Status::OK(); - } - - private: - std::shared_ptr
* out_; - MemoryPool* pool_; - std::shared_ptr schema_; - std::vector> batches_; - std::mutex consume_mutex_; -}; - static Result MakeTableConsumingSinkNode( compute::ExecPlan* plan, std::vector inputs, const compute::ExecNodeOptions& options) { RETURN_NOT_OK(ValidateExecNodeInputs(plan, inputs, 1, "TableConsumingSinkNode")); const auto& sink_options = checked_cast(options); - MemoryPool* pool = plan->exec_context()->memory_pool(); + MemoryPool* pool = plan->query_context()->memory_pool(); auto tb_consumer = std::make_shared(sink_options.output_table, pool); auto consuming_sink_node_options = ConsumingSinkNodeOptions{tb_consumer}; @@ -445,8 +416,9 @@ static Result MakeTableConsumingSinkNode( struct OrderBySinkNode final : public SinkNode { OrderBySinkNode(ExecPlan* plan, std::vector inputs, std::unique_ptr impl, - AsyncGenerator>* generator) - : SinkNode(plan, std::move(inputs), generator, /*backpressure=*/{}, + AsyncGenerator>* generator) + : SinkNode(plan, std::move(inputs), generator, /*schema=*/nullptr, + /*backpressure=*/{}, /*backpressure_monitor_out=*/nullptr), impl_(std::move(impl)) {} @@ -464,8 +436,8 @@ struct OrderBySinkNode final : public SinkNode { RETURN_NOT_OK(ValidateOrderByOptions(sink_options)); ARROW_ASSIGN_OR_RAISE( std::unique_ptr impl, - OrderByImpl::MakeSort(plan->exec_context(), inputs[0]->output_schema(), - sink_options.sort_options)); + OrderByImpl::MakeSort(plan->query_context()->exec_context(), + inputs[0]->output_schema(), sink_options.sort_options)); return plan->EmplaceNode(plan, std::move(inputs), std::move(impl), sink_options.generator); } @@ -494,10 +466,10 @@ struct OrderBySinkNode final : public SinkNode { return Status::Invalid("Backpressure cannot be applied to an OrderBySinkNode"); } RETURN_NOT_OK(ValidateSelectKOptions(sink_options)); - ARROW_ASSIGN_OR_RAISE( - std::unique_ptr impl, - OrderByImpl::MakeSelectK(plan->exec_context(), inputs[0]->output_schema(), - sink_options.select_k_options)); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr impl, + OrderByImpl::MakeSelectK(plan->query_context()->exec_context(), + inputs[0]->output_schema(), + sink_options.select_k_options)); return plan->EmplaceNode(plan, std::move(inputs), std::move(impl), sink_options.generator); } @@ -519,7 +491,7 @@ struct OrderBySinkNode final : public SinkNode { DCHECK_EQ(input, inputs_[0]); auto maybe_batch = batch.ToRecordBatch(inputs_[0]->output_schema(), - plan()->exec_context()->memory_pool()); + plan()->query_context()->memory_pool()); if (ErrorIfNotOk(maybe_batch.status())) { StopProducing(); if (input_counter_.Cancel()) { diff --git a/cpp/src/arrow/compute/exec/source_node.cc b/cpp/src/arrow/compute/exec/source_node.cc index a640cf737ef..76c222f5b76 100644 --- a/cpp/src/arrow/compute/exec/source_node.cc +++ b/cpp/src/arrow/compute/exec/source_node.cc @@ -15,15 +15,19 @@ // specific language governing permissions and limitations // under the License. 
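// [Editor's note] Hedged usage sketch for the source factories registered
// further down in this file (illustrative only; it mirrors the plan_test.cc
// cases above, and the ReadAll helper name is an assumption):
//
//   arrow::Result<std::shared_ptr<arrow::Table>> ReadAll(
//       std::shared_ptr<arrow::RecordBatchReader> reader) {
//     namespace cp = arrow::compute;
//     cp::Declaration plan("record_batch_reader_source",
//                          cp::RecordBatchReaderSourceNodeOptions{std::move(reader)});
//     return cp::DeclarationToTable(std::move(plan));
//   }
//
// "record_batch_source", "exec_batch_source", and "array_vector_source" follow
// the same shape but take a schema and an iterator-maker through their
// respective *SourceNodeOptions.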
+#include #include +#include #include "arrow/compute/exec.h" #include "arrow/compute/exec/exec_plan.h" #include "arrow/compute/exec/expression.h" #include "arrow/compute/exec/options.h" +#include "arrow/compute/exec/query_context.h" #include "arrow/compute/exec/util.h" #include "arrow/compute/exec_internal.h" #include "arrow/datum.h" +#include "arrow/io/util_internal.h" #include "arrow/result.h" #include "arrow/table.h" #include "arrow/util/async_generator.h" @@ -31,7 +35,6 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/future.h" #include "arrow/util/logging.h" -#include "arrow/util/optional.h" #include "arrow/util/thread_pool.h" #include "arrow/util/tracing_internal.h" #include "arrow/util/unreachable.h" @@ -47,7 +50,7 @@ namespace { struct SourceNode : ExecNode { SourceNode(ExecPlan* plan, std::shared_ptr output_schema, - AsyncGenerator> generator) + AsyncGenerator> generator) : ExecNode(plan, {}, {}, std::move(output_schema), /*num_outputs=*/1), generator_(std::move(generator)) {} @@ -89,7 +92,7 @@ struct SourceNode : ExecNode { } CallbackOptions options; - auto executor = plan()->exec_context()->executor(); + auto executor = plan()->query_context()->executor(); if (executor) { // These options will transfer execution to the desired Executor if necessary. // This can happen for in-memory scans where batches didn't require @@ -98,7 +101,8 @@ struct SourceNode : ExecNode { options.executor = executor; options.should_schedule = ShouldSchedule::IfDifferentExecutor; } - ARROW_ASSIGN_OR_RAISE(Future<> scan_task, plan_->BeginExternalTask()); + ARROW_ASSIGN_OR_RAISE(Future<> scan_task, + plan_->query_context()->BeginExternalTask()); if (!scan_task.is_valid()) { finished_.MarkFinished(); // Plan has already been aborted, no need to start scanning @@ -112,14 +116,15 @@ struct SourceNode : ExecNode { lock.unlock(); return generator_().Then( - [=](const util::optional& maybe_morsel) + [this](const std::optional& maybe_morsel) -> Future> { std::unique_lock lock(mutex_); if (IsIterationEnd(maybe_morsel) || stop_requested_) { return Break(batch_count_); } lock.unlock(); - bool use_legacy_batching = plan_->UseLegacyBatching(); + bool use_legacy_batching = + plan_->query_context()->options().use_legacy_batching; ExecBatch morsel = std::move(*maybe_morsel); int64_t morsel_length = static_cast(morsel.length); if (use_legacy_batching || morsel_length == 0) { @@ -131,22 +136,24 @@ struct SourceNode : ExecNode { bit_util::CeilDiv(morsel_length, ExecPlan::kMaxBatchSize)); batch_count_ += num_batches; } - RETURN_NOT_OK(plan_->ScheduleTask([=]() { - int64_t offset = 0; - do { - int64_t batch_size = std::min( - morsel_length - offset, ExecPlan::kMaxBatchSize); - // In order for the legacy batching model to work we must - // not slice batches from the source - if (use_legacy_batching) { - batch_size = morsel_length; - } - ExecBatch batch = morsel.Slice(offset, batch_size); - offset += batch_size; - outputs_[0]->InputReceived(this, std::move(batch)); - } while (offset < morsel.length); - return Status::OK(); - })); + RETURN_NOT_OK(plan_->query_context()->ScheduleTask( + [this, morsel = std::move(morsel), morsel_length, + use_legacy_batching]() { + int64_t offset = 0; + do { + int64_t batch_size = std::min( + morsel_length - offset, ExecPlan::kMaxBatchSize); + // In order for the legacy batching model to work we must + // not slice batches from the source + if (use_legacy_batching) { + batch_size = morsel_length; + } + ExecBatch batch = morsel.Slice(offset, batch_size); + offset += batch_size; + 
outputs_[0]->InputReceived(this, std::move(batch)); + } while (offset < morsel.length); + return Status::OK(); + })); lock.lock(); if (!backpressure_future_.is_finished()) { EVENT(span_, "Source paused due to backpressure"); @@ -155,7 +162,7 @@ struct SourceNode : ExecNode { } return Future>::MakeFinished(Continue()); }, - [=](const Status& error) -> ControlFlow { + [this](const Status& error) -> ControlFlow { outputs_[0]->ErrorReceived(this, error); return Break(batch_count_); }, @@ -216,12 +223,12 @@ struct SourceNode : ExecNode { private: std::mutex mutex_; - int32_t backpressure_counter_{0}; + std::atomic backpressure_counter_{0}; Future<> backpressure_future_ = Future<>::MakeFinished(); bool stop_requested_{false}; bool started_ = false; int batch_count_{0}; - AsyncGenerator> generator_; + AsyncGenerator> generator_; }; struct TableSourceNode : public SourceNode { @@ -257,13 +264,13 @@ struct TableSourceNode : public SourceNode { return Status::OK(); } - static arrow::AsyncGenerator> TableGenerator( + static arrow::AsyncGenerator> TableGenerator( const Table& table, const int64_t batch_size) { auto batches = ConvertTableToExecBatches(table, batch_size); auto opt_batches = - MapVector([](ExecBatch batch) { return util::make_optional(std::move(batch)); }, + MapVector([](ExecBatch batch) { return std::make_optional(std::move(batch)); }, std::move(batches)); - AsyncGenerator> gen; + AsyncGenerator> gen; gen = MakeVectorGenerator(std::move(opt_batches)); return gen; } @@ -291,6 +298,192 @@ struct TableSourceNode : public SourceNode { } }; +template +struct SchemaSourceNode : public SourceNode { + SchemaSourceNode(ExecPlan* plan, std::shared_ptr schema, + arrow::AsyncGenerator> generator) + : SourceNode(plan, schema, generator) {} + + static Result Make(ExecPlan* plan, std::vector inputs, + const ExecNodeOptions& options) { + RETURN_NOT_OK(ValidateExecNodeInputs(plan, inputs, 0, This::kKindName)); + const auto& cast_options = checked_cast(options); + auto& it_maker = cast_options.it_maker; + auto& schema = cast_options.schema; + auto io_executor = cast_options.io_executor; + + if (io_executor == NULLPTR) { + io_executor = plan->query_context()->exec_context()->executor(); + } + auto it = it_maker(); + + if (schema == NULLPTR) { + return Status::Invalid(This::kKindName, " requires schema which is not null"); + } + if (io_executor == NULLPTR) { + io_executor = io::internal::GetIOThreadPool(); + } + + ARROW_ASSIGN_OR_RAISE(auto generator, This::MakeGenerator(it, io_executor, schema)); + return plan->EmplaceNode(plan, schema, generator); + } +}; + +struct RecordBatchReaderSourceNode : public SourceNode { + RecordBatchReaderSourceNode(ExecPlan* plan, std::shared_ptr schema, + arrow::AsyncGenerator> generator) + : SourceNode(plan, schema, generator) {} + + static Result Make(ExecPlan* plan, std::vector inputs, + const ExecNodeOptions& options) { + RETURN_NOT_OK(ValidateExecNodeInputs(plan, inputs, 0, kKindName)); + const auto& cast_options = + checked_cast(options); + auto& reader = cast_options.reader; + auto io_executor = cast_options.io_executor; + + if (reader == nullptr) { + return Status::Invalid(kKindName, " requires a reader which is not null"); + } + + if (io_executor == nullptr) { + io_executor = io::internal::GetIOThreadPool(); + } + + ARROW_ASSIGN_OR_RAISE(auto generator, MakeGenerator(reader, io_executor)); + return plan->EmplaceNode(plan, reader->schema(), + generator); + } + + static Result>> MakeGenerator( + const std::shared_ptr& reader, + arrow::internal::Executor* io_executor) 
{ + auto to_exec_batch = + [](const std::shared_ptr& batch) -> std::optional { + if (batch == NULLPTR) { + return std::nullopt; + } + return std::optional(ExecBatch(*batch)); + }; + Iterator> batch_it = MakeIteratorFromReader(reader); + auto exec_batch_it = MakeMapIterator(to_exec_batch, std::move(batch_it)); + return MakeBackgroundGenerator(std::move(exec_batch_it), io_executor); + } + + static const char kKindName[]; +}; + +const char RecordBatchReaderSourceNode::kKindName[] = "RecordBatchReaderSourceNode"; + +struct RecordBatchSourceNode + : public SchemaSourceNode { + using RecordBatchSchemaSourceNode = + SchemaSourceNode; + + using RecordBatchSchemaSourceNode::RecordBatchSchemaSourceNode; + + static Result Make(ExecPlan* plan, std::vector inputs, + const ExecNodeOptions& options) { + return RecordBatchSchemaSourceNode::Make(plan, inputs, options); + } + + const char* kind_name() const override { return kKindName; } + + static Result>> MakeGenerator( + Iterator>& batch_it, + arrow::internal::Executor* io_executor, const std::shared_ptr& schema) { + auto to_exec_batch = + [schema](const std::shared_ptr& batch) -> std::optional { + if (batch == NULLPTR || *batch->schema() != *schema) { + return std::nullopt; + } + return std::optional(ExecBatch(*batch)); + }; + auto exec_batch_it = MakeMapIterator(to_exec_batch, std::move(batch_it)); + return MakeBackgroundGenerator(std::move(exec_batch_it), io_executor); + } + + static const char kKindName[]; +}; + +const char RecordBatchSourceNode::kKindName[] = "RecordBatchSourceNode"; + +struct ExecBatchSourceNode + : public SchemaSourceNode { + using ExecBatchSchemaSourceNode = + SchemaSourceNode; + + using ExecBatchSchemaSourceNode::ExecBatchSchemaSourceNode; + + static Result Make(ExecPlan* plan, std::vector inputs, + const ExecNodeOptions& options) { + return ExecBatchSchemaSourceNode::Make(plan, inputs, options); + } + + const char* kind_name() const override { return kKindName; } + + static Result>> MakeGenerator( + Iterator>& batch_it, + arrow::internal::Executor* io_executor, const std::shared_ptr& schema) { + auto to_exec_batch = + [](const std::shared_ptr& batch) -> std::optional { + return batch == NULLPTR ? 
std::nullopt : std::optional(*batch); + }; + auto exec_batch_it = MakeMapIterator(to_exec_batch, std::move(batch_it)); + return MakeBackgroundGenerator(std::move(exec_batch_it), io_executor); + } + + static const char kKindName[]; +}; + +const char ExecBatchSourceNode::kKindName[] = "ExecBatchSourceNode"; + +struct ArrayVectorSourceNode + : public SchemaSourceNode { + using ArrayVectorSchemaSourceNode = + SchemaSourceNode; + + using ArrayVectorSchemaSourceNode::ArrayVectorSchemaSourceNode; + + static Result Make(ExecPlan* plan, std::vector inputs, + const ExecNodeOptions& options) { + return ArrayVectorSchemaSourceNode::Make(plan, inputs, options); + } + + const char* kind_name() const override { return kKindName; } + + static Result>> MakeGenerator( + Iterator>& arrayvec_it, + arrow::internal::Executor* io_executor, const std::shared_ptr& schema) { + auto to_exec_batch = + [](const std::shared_ptr& arrayvec) -> std::optional { + if (arrayvec == NULLPTR || arrayvec->size() == 0) { + return std::nullopt; + } + std::vector datumvec; + for (const auto& array : *arrayvec) { + datumvec.push_back(Datum(array)); + } + return std::optional( + ExecBatch(std::move(datumvec), (*arrayvec)[0]->length())); + }; + auto exec_batch_it = MakeMapIterator(to_exec_batch, std::move(arrayvec_it)); + return MakeBackgroundGenerator(std::move(exec_batch_it), io_executor); + } + + static const char kKindName[]; +}; + +const char ArrayVectorSourceNode::kKindName[] = "ArrayVectorSourceNode"; + +Result MakeNamedTableNode(compute::ExecPlan* plan, + std::vector inputs, + const compute::ExecNodeOptions& options) { + return Status::Invalid( + "The named table node is for serialization purposes only and can never be " + "converted into an exec plan or executed"); +} + } // namespace namespace internal { @@ -298,6 +491,12 @@ namespace internal { void RegisterSourceNode(ExecFactoryRegistry* registry) { DCHECK_OK(registry->AddFactory("source", SourceNode::Make)); DCHECK_OK(registry->AddFactory("table_source", TableSourceNode::Make)); + DCHECK_OK(registry->AddFactory("record_batch_source", RecordBatchSourceNode::Make)); + DCHECK_OK(registry->AddFactory("record_batch_reader_source", + RecordBatchReaderSourceNode::Make)); + DCHECK_OK(registry->AddFactory("exec_batch_source", ExecBatchSourceNode::Make)); + DCHECK_OK(registry->AddFactory("array_vector_source", ArrayVectorSourceNode::Make)); + DCHECK_OK(registry->AddFactory("named_table", MakeNamedTableNode)); } } // namespace internal diff --git a/cpp/src/arrow/compute/exec/subtree_internal.h b/cpp/src/arrow/compute/exec/subtree_internal.h index 72d419df225..9e55af6068f 100644 --- a/cpp/src/arrow/compute/exec/subtree_internal.h +++ b/cpp/src/arrow/compute/exec/subtree_internal.h @@ -18,13 +18,13 @@ #pragma once #include +#include #include #include #include #include #include "arrow/compute/exec/expression.h" -#include "arrow/util/optional.h" namespace arrow { namespace compute { @@ -64,7 +64,7 @@ struct SubtreeImpl { struct Encoded { // An external index identifying the corresponding object (e.g. a Fragment) of the // guarantee. - util::optional index; + std::optional index; // An encoded expression representing a guarantee. 
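  // [Editor's note] Illustrative sketch, not part of this diff: with the switch
  // from util::optional to std::optional, a guarantee-only (fragment-less) entry
  // is now written as
  //
  //   SubtreeImpl::Encoded subtree{/*index=*/std::nullopt,
  //                                SubtreeImpl::expression_codes({0, 1})};
  //
  // while fragment-backed entries use std::make_optional(i), as the updated
  // subtree_test.cc below shows.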
expression_codes guarantee; }; @@ -112,7 +112,7 @@ struct SubtreeImpl { void GenerateSubtrees(expression_codes guarantee, std::vector* encoded) { while (!guarantee.empty()) { if (subtree_exprs_.insert(guarantee).second) { - Encoded encoded_subtree{/*index=*/util::nullopt, guarantee}; + Encoded encoded_subtree{/*index=*/std::nullopt, guarantee}; encoded->push_back(std::move(encoded_subtree)); } guarantee.resize(guarantee.size() - 1); diff --git a/cpp/src/arrow/compute/exec/subtree_test.cc b/cpp/src/arrow/compute/exec/subtree_test.cc index 97213104454..908af3be7ef 100644 --- a/cpp/src/arrow/compute/exec/subtree_test.cc +++ b/cpp/src/arrow/compute/exec/subtree_test.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -26,9 +27,12 @@ #include "arrow/compute/exec/forest_internal.h" #include "arrow/compute/exec/subtree_internal.h" #include "arrow/testing/gtest_util.h" -#include "arrow/util/string_view.h" +#include "arrow/util/string.h" namespace arrow { + +using internal::StartsWith; + namespace compute { using testing::ContainerEq; @@ -94,18 +98,18 @@ struct TestPathTree { using PT = TestPathTree; -util::string_view RemoveTrailingSlash(util::string_view key) { +std::string_view RemoveTrailingSlash(std::string_view key) { while (!key.empty() && key.back() == '/') { key.remove_suffix(1); } return key; } -bool IsAncestorOf(util::string_view ancestor, util::string_view descendant) { +bool IsAncestorOf(std::string_view ancestor, std::string_view descendant) { // See filesystem/path_util.h ancestor = RemoveTrailingSlash(ancestor); if (ancestor == "") return true; descendant = RemoveTrailingSlash(descendant); - if (!descendant.starts_with(ancestor)) return false; + if (!StartsWith(descendant, ancestor)) return false; descendant.remove_prefix(ancestor.size()); if (descendant.empty()) return true; return descendant.front() == '/'; @@ -327,9 +331,9 @@ TEST(Subtree, GetSubtreeExpression) { const auto code_a = tree.GetOrInsert(expr_a); const auto code_b = tree.GetOrInsert(expr_b); ASSERT_EQ(expr_a, - tree.GetSubtreeExpression(SubtreeImpl::Encoded{util::nullopt, {code_a}})); + tree.GetSubtreeExpression(SubtreeImpl::Encoded{std::nullopt, {code_a}})); ASSERT_EQ(expr_b, tree.GetSubtreeExpression( - SubtreeImpl::Encoded{util::nullopt, {code_a, code_b}})); + SubtreeImpl::Encoded{std::nullopt, {code_a, code_b}})); } class FakeFragment { @@ -363,14 +367,14 @@ TEST(Subtree, EncodeFragments) { EXPECT_THAT( encoded, testing::UnorderedElementsAreArray({ - SubtreeImpl::Encoded{util::make_optional(0), + SubtreeImpl::Encoded{std::make_optional(0), SubtreeImpl::expression_codes({0, 1})}, - SubtreeImpl::Encoded{util::make_optional(1), + SubtreeImpl::Encoded{std::make_optional(1), SubtreeImpl::expression_codes({2, 3})}, - SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({0})}, - SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({2})}, - SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({0, 1})}, - SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({2, 3})}, + SubtreeImpl::Encoded{std::nullopt, SubtreeImpl::expression_codes({0})}, + SubtreeImpl::Encoded{std::nullopt, SubtreeImpl::expression_codes({2})}, + SubtreeImpl::Encoded{std::nullopt, SubtreeImpl::expression_codes({0, 1})}, + SubtreeImpl::Encoded{std::nullopt, SubtreeImpl::expression_codes({2, 3})}, })); } } // namespace compute diff --git a/cpp/src/arrow/compute/exec/swiss_join.cc b/cpp/src/arrow/compute/exec/swiss_join.cc index 5b01edb1198..fee3c5f79db 100644 --- 
a/cpp/src/arrow/compute/exec/swiss_join.cc +++ b/cpp/src/arrow/compute/exec/swiss_join.cc @@ -2022,7 +2022,7 @@ Status JoinProbeProcessor::OnFinished() { class SwissJoin : public HashJoinImpl { public: - Status Init(ExecContext* ctx, JoinType join_type, size_t num_threads, + Status Init(QueryContext* ctx, JoinType join_type, size_t num_threads, const HashJoinProjectionMaps* proj_map_left, const HashJoinProjectionMaps* proj_map_right, std::vector key_cmp, Expression filter, @@ -2067,8 +2067,6 @@ class SwissJoin : public HashJoinImpl { for (int i = 0; i < num_threads_; ++i) { local_states_[i].hash_table_ready = false; local_states_[i].num_output_batches = 0; - RETURN_NOT_OK(CancelIfNotOK(local_states_[i].temp_stack.Init( - pool_, 1024 + 64 * util::MiniBatch::kMiniBatchLength))); local_states_[i].materialize.Init(pool_, proj_map_left, proj_map_right); } @@ -2116,10 +2114,12 @@ class SwissJoin : public HashJoinImpl { ExecBatch keypayload_batch; ARROW_ASSIGN_OR_RAISE(keypayload_batch, KeyPayloadFromInput(/*side=*/0, &batch)); + ARROW_ASSIGN_OR_RAISE(util::TempVectorStack * temp_stack, + ctx_->GetTempStack(thread_index)); - return CancelIfNotOK(probe_processor_.OnNextBatch( - thread_index, keypayload_batch, &local_states_[thread_index].temp_stack, - &local_states_[thread_index].temp_column_arrays)); + return CancelIfNotOK( + probe_processor_.OnNextBatch(thread_index, keypayload_batch, temp_stack, + &local_states_[thread_index].temp_column_arrays)); } Status ProbingFinished(size_t thread_index) override { @@ -2225,9 +2225,11 @@ class SwissJoin : public HashJoinImpl { input_batch.values[schema->num_cols(HashJoinProjection::KEY) + icol]; } } + ARROW_ASSIGN_OR_RAISE(util::TempVectorStack * temp_stack, + ctx_->GetTempStack(thread_id)); RETURN_NOT_OK(CancelIfNotOK(hash_table_build_.PushNextBatch( static_cast(thread_id), key_batch, no_payload ? 
nullptr : &payload_batch, - &local_states_[thread_id].temp_stack))); + temp_stack))); // Release input batch // @@ -2259,7 +2261,9 @@ class SwissJoin : public HashJoinImpl { Status MergeFinished(size_t thread_id) { RETURN_NOT_OK(status()); - hash_table_build_.FinishPrtnMerge(&local_states_[thread_id].temp_stack); + ARROW_ASSIGN_OR_RAISE(util::TempVectorStack * temp_stack, + ctx_->GetTempStack(thread_id)); + hash_table_build_.FinishPrtnMerge(temp_stack); return CancelIfNotOK(OnBuildHashTableFinished(static_cast(thread_id))); } @@ -2311,7 +2315,8 @@ class SwissJoin : public HashJoinImpl { std::min((task_id + 1) * kNumRowsPerScanTask, hash_table_.num_rows()); // Get thread index and related temp vector stack // - util::TempVectorStack* temp_stack = &local_states_[thread_id].temp_stack; + ARROW_ASSIGN_OR_RAISE(util::TempVectorStack * temp_stack, + ctx_->GetTempStack(thread_id)); // Split into mini-batches // @@ -2467,7 +2472,7 @@ class SwissJoin : public HashJoinImpl { static constexpr int kNumRowsPerScanTask = 512 * 1024; - ExecContext* ctx_; + QueryContext* ctx_; int64_t hardware_flags_; MemoryPool* pool_; int num_threads_; @@ -2489,7 +2494,6 @@ class SwissJoin : public HashJoinImpl { struct ThreadLocalState { JoinResultMaterialize materialize; - util::TempVectorStack temp_stack; std::vector temp_column_arrays; int64_t num_output_batches; bool hash_table_ready; diff --git a/cpp/src/arrow/compute/exec/test_util.cc b/cpp/src/arrow/compute/exec/test_util.cc index cc26143179a..72ddbbeb0d4 100644 --- a/cpp/src/arrow/compute/exec/test_util.cc +++ b/cpp/src/arrow/compute/exec/test_util.cc @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -46,7 +47,6 @@ #include "arrow/util/async_generator.h" #include "arrow/util/iterator.h" #include "arrow/util/logging.h" -#include "arrow/util/optional.h" #include "arrow/util/unreachable.h" #include "arrow/util/vector.h" @@ -142,8 +142,7 @@ ExecNode* MakeDummyNode(ExecPlan* plan, std::string label, std::vector& types, - util::string_view json) { +ExecBatch ExecBatchFromJSON(const std::vector& types, std::string_view json) { auto fields = ::arrow::internal::MapVector( [](const TypeHolder& th) { return field("", th.GetSharedPtr()); }, types); @@ -153,7 +152,7 @@ ExecBatch ExecBatchFromJSON(const std::vector& types, } ExecBatch ExecBatchFromJSON(const std::vector& types, - const std::vector& shapes, util::string_view json) { + const std::vector& shapes, std::string_view json) { DCHECK_EQ(types.size(), shapes.size()); ExecBatch batch = ExecBatchFromJSON(types, json); @@ -180,17 +179,17 @@ Future<> StartAndFinish(ExecPlan* plan) { } Future> StartAndCollect( - ExecPlan* plan, AsyncGenerator> gen) { + ExecPlan* plan, AsyncGenerator> gen) { RETURN_NOT_OK(plan->Validate()); RETURN_NOT_OK(plan->StartProducing()); auto collected_fut = CollectAsyncGenerator(gen); - return AllComplete({plan->finished(), Future<>(collected_fut)}) + return AllFinished({plan->finished(), Future<>(collected_fut)}) .Then([collected_fut]() -> Result> { ARROW_ASSIGN_OR_RAISE(auto collected, collected_fut.result()); return ::arrow::internal::MapVector( - [](util::optional batch) { return std::move(*batch); }, + [](std::optional batch) { return batch.value_or(ExecBatch()); }, std::move(collected)); }); } @@ -219,25 +218,25 @@ BatchesWithSchema MakeNestedBatches() { } BatchesWithSchema MakeRandomBatches(const std::shared_ptr& schema, - int num_batches, int batch_size) { + int num_batches, int batch_size, int64_t alignment, + MemoryPool* memory_pool) { BatchesWithSchema 
out; random::RandomArrayGenerator rng(42); out.batches.resize(num_batches); for (int i = 0; i < num_batches; ++i) { - out.batches[i] = ExecBatch(*rng.BatchOf(schema->fields(), batch_size)); - // add a tag scalar to ensure the batches are unique - out.batches[i].values.emplace_back(i); + out.batches[i] = + ExecBatch(*rng.BatchOf(schema->fields(), batch_size, alignment, memory_pool)); } out.schema = schema; return out; } -BatchesWithSchema MakeBatchesFromString( - const std::shared_ptr& schema, - const std::vector& json_strings, int multiplicity) { +BatchesWithSchema MakeBatchesFromString(const std::shared_ptr& schema, + const std::vector& json_strings, + int multiplicity) { BatchesWithSchema out_batches{{}, schema}; std::vector types; @@ -259,18 +258,61 @@ BatchesWithSchema MakeBatchesFromString( return out_batches; } +Result>> ToArrayVectors( + const BatchesWithSchema& batches_with_schema) { + std::vector> arrayvecs; + for (auto batch : batches_with_schema.batches) { + ARROW_ASSIGN_OR_RAISE(auto record_batch, + batch.ToRecordBatch(batches_with_schema.schema)); + arrayvecs.push_back(std::make_shared(record_batch->columns())); + } + return arrayvecs; +} + +Result>> ToExecBatches( + const BatchesWithSchema& batches_with_schema) { + std::vector> exec_batches; + for (auto batch : batches_with_schema.batches) { + exec_batches.push_back(std::make_shared(batch)); + } + return exec_batches; +} + +Result>> ToRecordBatches( + const BatchesWithSchema& batches_with_schema) { + std::vector> record_batches; + for (auto batch : batches_with_schema.batches) { + ARROW_ASSIGN_OR_RAISE(auto record_batch, + batch.ToRecordBatch(batches_with_schema.schema)); + record_batches.push_back(std::move(record_batch)); + } + return record_batches; +} + +Result> ToRecordBatchReader( + const BatchesWithSchema& batches_with_schema) { + std::vector> record_batches; + for (auto batch : batches_with_schema.batches) { + ARROW_ASSIGN_OR_RAISE(auto record_batch, + batch.ToRecordBatch(batches_with_schema.schema)); + record_batches.push_back(std::move(record_batch)); + } + ARROW_ASSIGN_OR_RAISE(auto table, Table::FromRecordBatches(std::move(record_batches))); + return std::make_shared(std::move(table)); +} + Result> SortTableOnAllFields(const std::shared_ptr
& tab) { std::vector sort_keys; - for (auto&& f : tab->schema()->fields()) { - sort_keys.emplace_back(f->name()); + for (int i = 0; i < tab->num_columns(); i++) { + sort_keys.emplace_back(i); } ARROW_ASSIGN_OR_RAISE(auto sort_ids, SortIndices(tab, SortOptions(sort_keys))); ARROW_ASSIGN_OR_RAISE(auto tab_sorted, Take(tab, sort_ids)); return tab_sorted.table(); } -void AssertTablesEqual(const std::shared_ptr
<Table>& exp, - const std::shared_ptr<Table>
<Table>& act) { +void AssertTablesEqualIgnoringOrder(const std::shared_ptr<Table>
<Table>& exp, + const std::shared_ptr<Table>
<Table>& act) { ASSERT_EQ(exp->num_columns(), act->num_columns()); if (exp->num_rows() == 0) { ASSERT_EQ(exp->num_rows(), act->num_rows()); @@ -283,12 +325,12 @@ void AssertTablesEqual(const std::shared_ptr<Table>
& exp, } } -void AssertExecBatchesEqual(const std::shared_ptr& schema, - const std::vector& exp, - const std::vector& act) { +void AssertExecBatchesEqualIgnoringOrder(const std::shared_ptr& schema, + const std::vector& exp, + const std::vector& act) { ASSERT_OK_AND_ASSIGN(auto exp_tab, TableFromExecBatches(schema, exp)); ASSERT_OK_AND_ASSIGN(auto act_tab, TableFromExecBatches(schema, act)); - AssertTablesEqual(exp_tab, act_tab); + AssertTablesEqualIgnoringOrder(exp_tab, act_tab); } template @@ -424,7 +466,7 @@ void PrintTo(const Declaration& decl, std::ostream* os) { *os << "{"; for (const auto& input : decl.inputs) { - if (auto decl = util::get_if(&input)) { + if (auto decl = std::get_if(&input)) { PrintTo(*decl, os); } } diff --git a/cpp/src/arrow/compute/exec/test_util.h b/cpp/src/arrow/compute/exec/test_util.h index ac9a4ae4ced..1eb50223249 100644 --- a/cpp/src/arrow/compute/exec/test_util.h +++ b/cpp/src/arrow/compute/exec/test_util.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "arrow/compute/exec.h" @@ -31,7 +32,6 @@ #include "arrow/testing/visibility.h" #include "arrow/util/async_generator.h" #include "arrow/util/pcg_random.h" -#include "arrow/util/string_view.h" namespace arrow { namespace compute { @@ -45,7 +45,7 @@ ExecNode* MakeDummyNode(ExecPlan* plan, std::string label, std::vector& types, util::string_view json); +ExecBatch ExecBatchFromJSON(const std::vector& types, std::string_view json); /// \brief Shape qualifier for value types. In certain instances /// (e.g. "map_lookup" kernel), an argument may only be a scalar, where in @@ -54,17 +54,17 @@ enum class ArgShape { ANY, ARRAY, SCALAR }; ARROW_TESTING_EXPORT ExecBatch ExecBatchFromJSON(const std::vector& types, - const std::vector& shapes, util::string_view json); + const std::vector& shapes, std::string_view json); struct BatchesWithSchema { std::vector batches; std::shared_ptr schema; - AsyncGenerator> gen(bool parallel, bool slow) const { + AsyncGenerator> gen(bool parallel, bool slow) const { auto opt_batches = ::arrow::internal::MapVector( - [](ExecBatch batch) { return util::make_optional(std::move(batch)); }, batches); + [](ExecBatch batch) { return std::make_optional(std::move(batch)); }, batches); - AsyncGenerator> gen; + AsyncGenerator> gen; if (parallel) { // emulate batches completing initial decode-after-scan on a cpu thread @@ -81,7 +81,7 @@ struct BatchesWithSchema { if (slow) { gen = - MakeMappedGenerator(std::move(gen), [](const util::optional& batch) { + MakeMappedGenerator(std::move(gen), [](const std::optional& batch) { SleepABit(); return batch; }); @@ -96,7 +96,7 @@ Future<> StartAndFinish(ExecPlan* plan); ARROW_TESTING_EXPORT Future> StartAndCollect( - ExecPlan* plan, AsyncGenerator> gen); + ExecPlan* plan, AsyncGenerator> gen); ARROW_TESTING_EXPORT BatchesWithSchema MakeBasicBatches(); @@ -106,24 +106,54 @@ BatchesWithSchema MakeNestedBatches(); ARROW_TESTING_EXPORT BatchesWithSchema MakeRandomBatches(const std::shared_ptr& schema, - int num_batches = 10, int batch_size = 4); + int num_batches = 10, int batch_size = 4, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = nullptr); ARROW_TESTING_EXPORT -BatchesWithSchema MakeBatchesFromString( - const std::shared_ptr& schema, - const std::vector& json_strings, int multiplicity = 1); +BatchesWithSchema MakeBatchesFromString(const std::shared_ptr& schema, + const std::vector& json_strings, + int multiplicity = 1); + +ARROW_TESTING_EXPORT +Result>> ToArrayVectors( + const BatchesWithSchema& 
batches_with_schema); + +ARROW_TESTING_EXPORT +Result>> ToExecBatches( + const BatchesWithSchema& batches); + +ARROW_TESTING_EXPORT +Result>> ToRecordBatches( + const BatchesWithSchema& batches); + +ARROW_TESTING_EXPORT +Result> ToRecordBatchReader( + const BatchesWithSchema& batches_with_schema); + +ARROW_TESTING_EXPORT +Result>> ToArrayVectors( + const BatchesWithSchema& batches_with_schema); + +ARROW_TESTING_EXPORT +Result>> ToExecBatches( + const BatchesWithSchema& batches); + +ARROW_TESTING_EXPORT +Result>> ToRecordBatches( + const BatchesWithSchema& batches); ARROW_TESTING_EXPORT Result> SortTableOnAllFields(const std::shared_ptr
<Table>& tab); ARROW_TESTING_EXPORT -void AssertTablesEqual(const std::shared_ptr<Table>
<Table>& exp, - const std::shared_ptr<Table>
<Table>& act); +void AssertTablesEqualIgnoringOrder(const std::shared_ptr<Table>
<Table>& exp, + const std::shared_ptr<Table>
& act); ARROW_TESTING_EXPORT -void AssertExecBatchesEqual(const std::shared_ptr& schema, - const std::vector& exp, - const std::vector& act); +void AssertExecBatchesEqualIgnoringOrder(const std::shared_ptr& schema, + const std::vector& exp, + const std::vector& act); ARROW_TESTING_EXPORT bool operator==(const Declaration&, const Declaration&); diff --git a/cpp/src/arrow/compute/exec/tpch_benchmark.cc b/cpp/src/arrow/compute/exec/tpch_benchmark.cc index 54ac7cbdbf5..2adee26a425 100644 --- a/cpp/src/arrow/compute/exec/tpch_benchmark.cc +++ b/cpp/src/arrow/compute/exec/tpch_benchmark.cc @@ -22,17 +22,16 @@ #include "arrow/compute/exec/test_util.h" #include "arrow/compute/exec/tpch_node.h" #include "arrow/testing/future_util.h" -#include "arrow/util/make_unique.h" + +#include namespace arrow { namespace compute { namespace internal { -std::shared_ptr Plan_Q1(AsyncGenerator>* sink_gen, +std::shared_ptr Plan_Q1(AsyncGenerator>* sink_gen, int scale_factor) { - ExecContext* ctx = default_exec_context(); - *ctx = ExecContext(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - std::shared_ptr plan = *ExecPlan::Make(ctx); + std::shared_ptr plan = *ExecPlan::Make(); std::unique_ptr gen = *TpchGen::Make(plan.get(), static_cast(scale_factor)); @@ -109,7 +108,7 @@ std::shared_ptr Plan_Q1(AsyncGenerator>* sin static void BM_Tpch_Q1(benchmark::State& st) { for (auto _ : st) { st.PauseTiming(); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; std::shared_ptr plan = Plan_Q1(&sink_gen, static_cast(st.range(0))); st.ResumeTiming(); auto fut = StartAndCollect(plan.get(), sink_gen); diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc b/cpp/src/arrow/compute/exec/tpch_node.cc index d19f20eea7c..afff52beaf0 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -16,14 +16,6 @@ // under the License. 
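Several generators further down in tpch_node.cc (NextPartBatch, NextPartSuppBatch, NextOrdersBatch, NextLineItemBatch) now return Result<std::optional<ExecBatch>> and use std::nullopt to signal that no more batches will be produced. A self-contained sketch of that end-of-stream convention, with plain standard-library stand-ins instead of Arrow's Result and ExecBatch:

    #include <iostream>
    #include <optional>
    #include <vector>

    // Stand-in for a batch producer: std::nullopt signals exhaustion, mirroring
    // the Result<std::optional<ExecBatch>> convention used by the TPC-H generators.
    class CountingSource {
     public:
      explicit CountingSource(int total) : total_(total) {}

      std::optional<std::vector<int>> NextBatch() {
        if (produced_ == total_) return std::nullopt;  // no more batches
        std::vector<int> batch = {produced_, produced_ + 1};
        ++produced_;
        return batch;
      }

     private:
      int total_;
      int produced_ = 0;
    };

    int main() {
      CountingSource source(3);
      while (auto batch = source.NextBatch()) {
        std::cout << "batch of " << batch->size() << " values\n";
      }
      return 0;
    }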
#include "arrow/compute/exec/tpch_node.h" -#include "arrow/buffer.h" -#include "arrow/compute/exec/exec_plan.h" -#include "arrow/util/formatting.h" -#include "arrow/util/future.h" -#include "arrow/util/io_util.h" -#include "arrow/util/make_unique.h" -#include "arrow/util/pcg_random.h" -#include "arrow/util/unreachable.h" #include #include @@ -32,10 +24,25 @@ #include #include #include +#include #include #include +#include "arrow/buffer.h" +#include "arrow/compute/exec.h" +#include "arrow/compute/exec/exec_plan.h" +#include "arrow/compute/exec/query_context.h" +#include "arrow/datum.h" +#include "arrow/util/async_util.h" +#include "arrow/util/formatting.h" +#include "arrow/util/future.h" +#include "arrow/util/io_util.h" +#include "arrow/util/logging.h" +#include "arrow/util/pcg_random.h" +#include "arrow/util/unreachable.h" + namespace arrow { + using internal::checked_cast; using internal::GetRandomSeed; @@ -663,7 +670,7 @@ class PartAndPartSupplierGenerator { return SetOutputColumns(cols, kPartsuppTypes, kPartsuppNameMap, partsupp_cols_); } - Result> NextPartBatch(size_t thread_index) { + Result> NextPartBatch(size_t thread_index) { ThreadLocalData& tld = thread_local_data_[thread_index]; { std::lock_guard lock(part_output_queue_mutex_); @@ -672,7 +679,7 @@ class PartAndPartSupplierGenerator { part_output_queue_.pop(); return std::move(batch); } else if (part_rows_generated_ == part_rows_to_generate_) { - return util::nullopt; + return std::nullopt; } else { tld.partkey_start = part_rows_generated_; tld.part_to_generate = @@ -718,7 +725,7 @@ class PartAndPartSupplierGenerator { return ExecBatch::Make(std::move(part_result)); } - Result> NextPartSuppBatch(size_t thread_index) { + Result> NextPartSuppBatch(size_t thread_index) { ThreadLocalData& tld = thread_local_data_[thread_index]; { std::lock_guard lock(partsupp_output_queue_mutex_); @@ -731,7 +738,7 @@ class PartAndPartSupplierGenerator { { std::lock_guard lock(part_output_queue_mutex_); if (part_rows_generated_ == part_rows_to_generate_) { - return util::nullopt; + return std::nullopt; } else { tld.partkey_start = part_rows_generated_; tld.part_to_generate = @@ -1323,7 +1330,7 @@ class OrdersAndLineItemGenerator { return SetOutputColumns(cols, kLineitemTypes, kLineitemNameMap, lineitem_cols_); } - Result> NextOrdersBatch(size_t thread_index) { + Result> NextOrdersBatch(size_t thread_index) { ThreadLocalData& tld = thread_local_data_[thread_index]; { std::lock_guard lock(orders_output_queue_mutex_); @@ -1332,7 +1339,7 @@ class OrdersAndLineItemGenerator { orders_output_queue_.pop(); return std::move(batch); } else if (orders_rows_generated_ == orders_rows_to_generate_) { - return util::nullopt; + return std::nullopt; } else { tld.orderkey_start = orders_rows_generated_; tld.orders_to_generate = @@ -1378,7 +1385,7 @@ class OrdersAndLineItemGenerator { return ExecBatch::Make(std::move(orders_result)); } - Result> NextLineItemBatch(size_t thread_index) { + Result> NextLineItemBatch(size_t thread_index) { ThreadLocalData& tld = thread_local_data_[thread_index]; ExecBatch queued; bool from_queue = false; @@ -1400,7 +1407,7 @@ class OrdersAndLineItemGenerator { std::lock_guard lock(orders_output_queue_mutex_); if (orders_rows_generated_ == orders_rows_to_generate_) { if (from_queue) return std::move(queued); - return util::nullopt; + return std::nullopt; } tld.orderkey_start = orders_rows_generated_; @@ -2708,7 +2715,7 @@ class PartGenerator : public TpchTableGenerator { private: Status ProduceCallback(size_t thread_index) { if 
(done_.load()) return Status::OK(); - ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, + ARROW_ASSIGN_OR_RAISE(std::optional maybe_batch, gen_->NextPartBatch(thread_index)); if (!maybe_batch.has_value()) { int64_t batches_generated = gen_->part_batches_generated(); @@ -2770,7 +2777,7 @@ class PartSuppGenerator : public TpchTableGenerator { private: Status ProduceCallback(size_t thread_index) { if (done_.load()) return Status::OK(); - ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, + ARROW_ASSIGN_OR_RAISE(std::optional maybe_batch, gen_->NextPartSuppBatch(thread_index)); if (!maybe_batch.has_value()) { int64_t batches_generated = gen_->partsupp_batches_generated(); @@ -3089,7 +3096,7 @@ class OrdersGenerator : public TpchTableGenerator { private: Status ProduceCallback(size_t thread_index) { if (done_.load()) return Status::OK(); - ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, + ARROW_ASSIGN_OR_RAISE(std::optional maybe_batch, gen_->NextOrdersBatch(thread_index)); if (!maybe_batch.has_value()) { int64_t batches_generated = gen_->orders_batches_generated(); @@ -3151,7 +3158,7 @@ class LineitemGenerator : public TpchTableGenerator { private: Status ProduceCallback(size_t thread_index) { if (done_.load()) return Status::OK(); - ARROW_ASSIGN_OR_RAISE(util::optional maybe_batch, + ARROW_ASSIGN_OR_RAISE(std::optional maybe_batch, gen_->NextLineItemBatch(thread_index)); if (!maybe_batch.has_value()) { int64_t batches_generated = gen_->lineitem_batches_generated(); @@ -3374,13 +3381,18 @@ class TpchNode : public ExecNode { [[noreturn]] void InputFinished(ExecNode*, int) override { NoInputs(); } Status StartProducing() override { - return generator_->StartProducing( - plan_->max_concurrency(), + num_running_++; + RETURN_NOT_OK(generator_->StartProducing( + plan_->query_context()->max_concurrency(), [this](ExecBatch batch) { this->OutputBatchCallback(std::move(batch)); }, [this](int64_t num_batches) { this->FinishedCallback(num_batches); }, [this](std::function func) -> Status { return this->ScheduleTaskCallback(std::move(func)); - }); + })); + if (--num_running_ == 0) { + finished_.MarkFinished(Status::OK()); + } + return Status::OK(); } void PauseProducing(ExecNode* output, int32_t counter) override { @@ -3408,23 +3420,29 @@ class TpchNode : public ExecNode { void FinishedCallback(int64_t total_num_batches) { outputs_[0]->InputFinished(this, static_cast(total_num_batches)); - finished_.MarkFinished(); + finished_generating_.store(true); } Status ScheduleTaskCallback(std::function func) { - if (finished_.is_finished()) return Status::OK(); - return plan_->ScheduleTask([this, func](size_t thread_index) { + if (finished_generating_.load()) return Status::OK(); + num_running_++; + return plan_->query_context()->ScheduleTask([this, func](size_t thread_index) { Status status = func(thread_index); if (!status.ok()) { StopProducing(); ErrorIfNotOk(status); } + if (--num_running_ == 0) { + finished_.MarkFinished(Status::OK()); + } return status; }); } const char* name_; std::unique_ptr generator_; + std::atomic finished_generating_{false}; + std::atomic num_running_{0}; }; class TpchGenImpl : public TpchGen { @@ -3459,7 +3477,7 @@ class TpchGenImpl : public TpchGen { template Result TpchGenImpl::CreateNode(const char* name, std::vector columns) { - std::unique_ptr generator = arrow::internal::make_unique(); + std::unique_ptr generator = std::make_unique(); RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_, kSeedDist(seed_rng_))); return plan_->EmplaceNode(plan_, name, 
std::move(generator)); @@ -3474,7 +3492,7 @@ Result TpchGenImpl::Part(std::vector columns) { part_and_part_supp_generator_ = std::make_shared(); } std::unique_ptr generator = - arrow::internal::make_unique(part_and_part_supp_generator_); + std::make_unique(part_and_part_supp_generator_); RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_, kSeedDist(seed_rng_))); return plan_->EmplaceNode(plan_, "Part", std::move(generator)); @@ -3485,7 +3503,7 @@ Result TpchGenImpl::PartSupp(std::vector columns) { part_and_part_supp_generator_ = std::make_shared(); } std::unique_ptr generator = - arrow::internal::make_unique(part_and_part_supp_generator_); + std::make_unique(part_and_part_supp_generator_); RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_, kSeedDist(seed_rng_))); return plan_->EmplaceNode(plan_, "PartSupp", std::move(generator)); @@ -3500,7 +3518,7 @@ Result TpchGenImpl::Orders(std::vector columns) { orders_and_line_item_generator_ = std::make_shared(); } std::unique_ptr generator = - arrow::internal::make_unique(orders_and_line_item_generator_); + std::make_unique(orders_and_line_item_generator_); RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_, kSeedDist(seed_rng_))); return plan_->EmplaceNode(plan_, "Orders", std::move(generator)); @@ -3511,7 +3529,7 @@ Result TpchGenImpl::Lineitem(std::vector columns) { orders_and_line_item_generator_ = std::make_shared(); } std::unique_ptr generator = - arrow::internal::make_unique(orders_and_line_item_generator_); + std::make_unique(orders_and_line_item_generator_); RETURN_NOT_OK(generator->Init(std::move(columns), scale_factor_, batch_size_, kSeedDist(seed_rng_))); return plan_->EmplaceNode(plan_, "Lineitem", std::move(generator)); @@ -3529,9 +3547,9 @@ Result TpchGenImpl::Region(std::vector columns) { Result> TpchGen::Make(ExecPlan* plan, double scale_factor, int64_t batch_size, - util::optional seed) { + std::optional seed) { if (!seed.has_value()) seed = GetRandomSeed(); - return std::unique_ptr(new TpchGenImpl(plan, scale_factor, batch_size, *seed)); + return std::make_unique(plan, scale_factor, batch_size, *seed); } } // namespace internal diff --git a/cpp/src/arrow/compute/exec/tpch_node.h b/cpp/src/arrow/compute/exec/tpch_node.h index fb9376982b1..061b66ca436 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.h +++ b/cpp/src/arrow/compute/exec/tpch_node.h @@ -18,13 +18,13 @@ #pragma once #include +#include #include #include #include "arrow/compute/type_fwd.h" #include "arrow/result.h" #include "arrow/status.h" -#include "arrow/util/optional.h" namespace arrow { namespace compute { @@ -44,7 +44,7 @@ class ARROW_EXPORT TpchGen { */ static Result> Make( ExecPlan* plan, double scale_factor = 1.0, int64_t batch_size = 4096, - util::optional seed = util::nullopt); + std::optional seed = std::nullopt); // The below methods will create and add an ExecNode to the plan that generates // data for the desired table. If columns is empty, all columns will be generated. 
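TpchGen::Make above now takes std::optional<int64_t> seed = std::nullopt and only draws a random seed when the caller omits one, while the arrow::internal::make_unique calls become std::make_unique. A small sketch of that optional-seed factory pattern; ToyGen and MakeToyGen are invented names for illustration, not Arrow APIs:

    #include <cstdint>
    #include <iostream>
    #include <memory>
    #include <optional>
    #include <random>

    // Toy generator configured by a seed.
    struct ToyGen {
      explicit ToyGen(int64_t seed) : engine(static_cast<uint64_t>(seed)) {}
      std::mt19937_64 engine;
    };

    // Mirrors the "optional seed, default-randomized" signature style: callers may
    // omit the seed entirely, yet the object is always built from a concrete value.
    std::unique_ptr<ToyGen> MakeToyGen(std::optional<int64_t> seed = std::nullopt) {
      if (!seed.has_value()) {
        seed = static_cast<int64_t>(std::random_device{}());
      }
      return std::make_unique<ToyGen>(*seed);
    }

    int main() {
      auto reproducible = MakeToyGen(42);  // fixed seed
      auto randomized = MakeToyGen();      // seed drawn at call time
      std::cout << reproducible->engine() << " " << randomized->engine() << "\n";
      return 0;
    }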
diff --git a/cpp/src/arrow/compute/exec/tpch_node_test.cc b/cpp/src/arrow/compute/exec/tpch_node_test.cc index fc26ce90c2e..fb1b990c46a 100644 --- a/cpp/src/arrow/compute/exec/tpch_node_test.cc +++ b/cpp/src/arrow/compute/exec/tpch_node_test.cc @@ -17,6 +17,12 @@ #include +#include +#include +#include +#include +#include + #include "arrow/compute/exec/options.h" #include "arrow/compute/exec/test_util.h" #include "arrow/compute/exec/tpch_node.h" @@ -27,16 +33,14 @@ #include "arrow/testing/matchers.h" #include "arrow/testing/random.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/make_unique.h" #include "arrow/util/pcg_random.h" +#include "arrow/util/string.h" #include "arrow/util/thread_pool.h" -#include -#include -#include -#include - namespace arrow { + +using internal::StartsWith; + namespace compute { namespace internal { @@ -50,7 +54,7 @@ using TableNodeFn = Result (TpchGen::*)(std::vector); constexpr double kDefaultScaleFactor = 0.1; Status AddTableAndSinkToPlan(ExecPlan& plan, TpchGen& gen, - AsyncGenerator>& sink_gen, + AsyncGenerator>& sink_gen, TableNodeFn table) { ARROW_ASSIGN_OR_RAISE(ExecNode * table_node, ((gen.*table)({}))); Declaration sink("sink", {Declaration::Input(table_node)}, SinkNodeOptions{&sink_gen}); @@ -61,10 +65,10 @@ Status AddTableAndSinkToPlan(ExecPlan& plan, TpchGen& gen, Result> GenerateTable(TableNodeFn table, double scale_factor = kDefaultScaleFactor) { ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, ExecPlan::Make(&ctx)); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr plan, ExecPlan::Make(ctx)); ARROW_ASSIGN_OR_RAISE(std::unique_ptr gen, TpchGen::Make(plan.get(), scale_factor)); - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; ARROW_RETURN_NOT_OK(AddTableAndSinkToPlan(*plan, *gen, sink_gen, table)); auto fut = StartAndCollect(plan.get(), sink_gen); return fut.MoveResult(); @@ -94,10 +98,10 @@ void VerifyUniqueKey(std::unordered_set* seen, const Datum& d, int32_t } } -void VerifyStringAndNumber_Single(const util::string_view& row, - const util::string_view& prefix, const int64_t i, +void VerifyStringAndNumber_Single(const std::string_view& row, + const std::string_view& prefix, const int64_t i, const int32_t* nums, bool verify_padding) { - ASSERT_TRUE(row.starts_with(prefix)) << row << ", prefix=" << prefix << ", i=" << i; + ASSERT_TRUE(StartsWith(row, prefix)) << row << ", prefix=" << prefix << ", i=" << i; const char* num_str = row.data() + prefix.size(); const char* num_str_end = row.data() + row.size(); int64_t num = 0; @@ -124,7 +128,7 @@ void VerifyStringAndNumber_Single(const util::string_view& row, // corresponding row in numbers. Some TPC-H data is padded to 9 zeros, which this function // can optionally verify as well. This string function verifies fixed width columns. 
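The tpch_node_test.cc changes below swap util::string_view for std::string_view and replace the C++20-only starts_with member with arrow::internal::StartsWith from arrow/util/string.h. Assuming that helper has the usual prefix-test semantics, a C++17 equivalent looks roughly like this:

    #include <cassert>
    #include <string_view>

    // C++17 stand-in for the StartsWith helper: std::string_view::starts_with
    // only exists in C++20, so a free function is used instead.
    bool StartsWithSketch(std::string_view s, std::string_view prefix) {
      return s.size() >= prefix.size() && s.compare(0, prefix.size(), prefix) == 0;
    }

    int main() {
      assert(StartsWithSketch("Customer#000000001", "Customer#"));
      assert(!StartsWithSketch("Supplier#000000001", "Customer#"));
      return 0;
    }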
void VerifyStringAndNumber_FixedWidth(const Datum& strings, const Datum& numbers, - int byte_width, const util::string_view& prefix, + int byte_width, const std::string_view& prefix, bool verify_padding = true) { int64_t length = strings.length(); const char* str = reinterpret_cast(strings.array()->buffers[1]->data()); @@ -137,14 +141,14 @@ void VerifyStringAndNumber_FixedWidth(const Datum& strings, const Datum& numbers for (int64_t i = 0; i < length; i++) { const char* row = str + i * byte_width; - util::string_view view(row, byte_width); + std::string_view view(row, byte_width); VerifyStringAndNumber_Single(view, prefix, i, nums, verify_padding); } } // Same as above but for variable length columns void VerifyStringAndNumber_Varlen(const Datum& strings, const Datum& numbers, - const util::string_view& prefix, + const std::string_view& prefix, bool verify_padding = true) { int64_t length = strings.length(); const int32_t* offsets = @@ -160,7 +164,7 @@ void VerifyStringAndNumber_Varlen(const Datum& strings, const Datum& numbers, for (int64_t i = 0; i < length; i++) { int32_t start = offsets[i]; int32_t str_len = offsets[i + 1] - offsets[i]; - util::string_view view(str + start, str_len); + std::string_view view(str + start, str_len); VerifyStringAndNumber_Single(view, prefix, i, nums, verify_padding); } } @@ -253,7 +257,7 @@ void VerifyCorrectNumberOfWords_Varlen(const Datum& d, int num_words) { int32_t start = offsets[i]; int32_t end = offsets[i + 1]; int32_t str_len = end - start; - util::string_view view(str + start, str_len); + std::string_view view(str + start, str_len); bool is_only_alphas_or_spaces = true; for (const char& c : view) { bool is_space = c == ' '; @@ -300,14 +304,14 @@ void VerifyOneOf(const Datum& d, const std::unordered_set& possibilities) // Verifies that each fixed-width row is one of the possibilities void VerifyOneOf(const Datum& d, int32_t byte_width, - const std::unordered_set& possibilities) { + const std::unordered_set& possibilities) { int64_t length = d.length(); const char* col = reinterpret_cast(d.array()->buffers[1]->data()); for (int64_t i = 0; i < length; i++) { const char* row = col + i * byte_width; int32_t row_len = 0; while (row[row_len] && row_len < byte_width) row_len++; - util::string_view view(row, row_len); + std::string_view view(row, row_len); ASSERT_TRUE(possibilities.find(view) != possibilities.end()) << view << " is not a valid string."; } @@ -331,10 +335,10 @@ void CountModifiedComments(const Datum& d, int* good_count, int* bad_count) { for (int64_t i = 0; i < length; i++) { const char* row = str + offsets[i]; int32_t row_length = offsets[i + 1] - offsets[i]; - util::string_view view(row, row_length); - bool customer = view.find("Customer") != util::string_view::npos; - bool recommends = view.find("Recommends") != util::string_view::npos; - bool complaints = view.find("Complaints") != util::string_view::npos; + std::string_view view(row, row_length); + bool customer = view.find("Customer") != std::string_view::npos; + bool recommends = view.find("Recommends") != std::string_view::npos; + bool complaints = view.find("Complaints") != std::string_view::npos; if (customer) { ASSERT_TRUE(recommends ^ complaints); if (recommends) *good_count += 1; @@ -618,9 +622,9 @@ TEST(TpchNode, AllTables) { &VerifyOrders, &VerifyLineitem, &VerifyNation, &VerifyRegion, }; - std::array>, kNumTables> gens; + std::array>, kNumTables> gens; ExecContext ctx(default_memory_pool(), arrow::internal::GetCpuThreadPool()); - ASSERT_OK_AND_ASSIGN(std::shared_ptr plan, 
ExecPlan::Make(&ctx)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr plan, ExecPlan::Make(ctx)); ASSERT_OK_AND_ASSIGN(std::unique_ptr gen, TpchGen::Make(plan.get(), kScaleFactor)); for (int i = 0; i < kNumTables; i++) { diff --git a/cpp/src/arrow/compute/exec/union_node.cc b/cpp/src/arrow/compute/exec/union_node.cc index e5170c2bc91..096188f4799 100644 --- a/cpp/src/arrow/compute/exec/union_node.cc +++ b/cpp/src/arrow/compute/exec/union_node.cc @@ -17,7 +17,6 @@ #include -#include "arrow/api.h" #include "arrow/compute/api.h" #include "arrow/compute/exec/exec_plan.h" #include "arrow/compute/exec/options.h" @@ -26,12 +25,14 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/future.h" #include "arrow/util/logging.h" +#include "arrow/util/string.h" #include "arrow/util/thread_pool.h" #include "arrow/util/tracing_internal.h" namespace arrow { using internal::checked_cast; +using internal::ToChars; namespace compute { diff --git a/cpp/src/arrow/compute/exec/union_node_test.cc b/cpp/src/arrow/compute/exec/union_node_test.cc index 41aaac26d2b..d14bfe16e5f 100644 --- a/cpp/src/arrow/compute/exec/union_node_test.cc +++ b/cpp/src/arrow/compute/exec/union_node_test.cc @@ -90,7 +90,7 @@ struct TestUnionNode : public ::testing::Test { "source", SourceNodeOptions{batch.schema, batch.gen(parallel, /*slow=*/false)}}); } - AsyncGenerator> sink_gen; + AsyncGenerator> sink_gen; // Test UnionNode::Make with zero inputs if (batches.size() == 0) { diff --git a/cpp/src/arrow/compute/exec/util.cc b/cpp/src/arrow/compute/exec/util.cc index ae70cfcd46f..2dd1398b981 100644 --- a/cpp/src/arrow/compute/exec/util.cc +++ b/cpp/src/arrow/compute/exec/util.cc @@ -383,5 +383,26 @@ size_t ThreadIndexer::Check(size_t thread_index) { return thread_index; } +Status TableSinkNodeConsumer::Init(const std::shared_ptr& schema, + BackpressureControl* backpressure_control, + ExecPlan* plan) { + // If the user is collecting into a table then backpressure is meaningless + ARROW_UNUSED(backpressure_control); + schema_ = schema; + return Status::OK(); +} + +Status TableSinkNodeConsumer::Consume(ExecBatch batch) { + auto guard = consume_mutex_.Lock(); + ARROW_ASSIGN_OR_RAISE(auto rb, batch.ToRecordBatch(schema_, pool_)); + batches_.push_back(std::move(rb)); + return Status::OK(); +} + +Future<> TableSinkNodeConsumer::Finish() { + ARROW_ASSIGN_OR_RAISE(*out_, Table::FromRecordBatches(schema_, batches_)); + return Status::OK(); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/util.h b/cpp/src/arrow/compute/exec/util.h index 30526cb835a..ea0c8cf36f1 100644 --- a/cpp/src/arrow/compute/exec/util.h +++ b/cpp/src/arrow/compute/exec/util.h @@ -19,11 +19,14 @@ #include #include +#include #include #include #include #include "arrow/buffer.h" +#include "arrow/compute/exec/expression.h" +#include "arrow/compute/exec/options.h" #include "arrow/compute/type_fwd.h" #include "arrow/memory_pool.h" #include "arrow/result.h" @@ -32,7 +35,6 @@ #include "arrow/util/cpu_info.h" #include "arrow/util/logging.h" #include "arrow/util/mutex.h" -#include "arrow/util/optional.h" #include "arrow/util/thread_pool.h" #if defined(__clang__) || defined(__GNUC__) @@ -245,7 +247,7 @@ class ARROW_EXPORT AtomicCounter { int count() const { return count_.load(); } - util::optional total() const { + std::optional total() const { int total = total_.load(); if (total == -1) return {}; return total; @@ -342,5 +344,85 @@ class TailSkipForSIMD { } }; +/// \brief A consumer that collects results into an in-memory table +struct ARROW_EXPORT 
TableSinkNodeConsumer : public SinkNodeConsumer { + public: + TableSinkNodeConsumer(std::shared_ptr
<Table>* out, MemoryPool* pool) + : out_(out), pool_(pool) {} + Status Init(const std::shared_ptr<Schema>& schema, + BackpressureControl* backpressure_control, ExecPlan* plan) override; + Status Consume(ExecBatch batch) override; + Future<> Finish() override; + + private: + std::shared_ptr<Table>
* out_; + MemoryPool* pool_; + std::shared_ptr schema_; + std::vector> batches_; + util::Mutex consume_mutex_; +}; + +class ARROW_EXPORT NullSinkNodeConsumer : public SinkNodeConsumer { + public: + Status Init(const std::shared_ptr&, BackpressureControl*, + ExecPlan* plan) override { + return Status::OK(); + } + Status Consume(ExecBatch exec_batch) override { return Status::OK(); } + Future<> Finish() override { return Status::OK(); } + + public: + static std::shared_ptr Make() { + return std::make_shared(); + } +}; + +/// Modify an Expression with pre-order and post-order visitation. +/// `pre` will be invoked on each Expression. `pre` will visit Calls before their +/// arguments, `post_call` will visit Calls (and no other Expressions) after their +/// arguments. Visitors should return the Identical expression to indicate no change; this +/// will prevent unnecessary construction in the common case where a modification is not +/// possible/necessary/... +/// +/// If an argument was modified, `post_call` visits a reconstructed Call with the modified +/// arguments but also receives a pointer to the unmodified Expression as a second +/// argument. If no arguments were modified the unmodified Expression* will be nullptr. +template +Result ModifyExpression(Expression expr, const PreVisit& pre, + const PostVisitCall& post_call) { + ARROW_ASSIGN_OR_RAISE(expr, Result(pre(std::move(expr)))); + + auto call = expr.call(); + if (!call) return expr; + + bool at_least_one_modified = false; + std::vector modified_arguments; + + for (size_t i = 0; i < call->arguments.size(); ++i) { + ARROW_ASSIGN_OR_RAISE(auto modified_argument, + ModifyExpression(call->arguments[i], pre, post_call)); + + if (Identical(modified_argument, call->arguments[i])) { + continue; + } + + if (!at_least_one_modified) { + modified_arguments = call->arguments; + at_least_one_modified = true; + } + + modified_arguments[i] = std::move(modified_argument); + } + + if (at_least_one_modified) { + // reconstruct the call expression with the modified arguments + auto modified_call = *call; + modified_call.arguments = std::move(modified_arguments); + return post_call(Expression(std::move(modified_call)), &expr); + } + + return post_call(std::move(expr), NULLPTR); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/util_test.cc b/cpp/src/arrow/compute/exec/util_test.cc index 3861446bb3c..ca5118dc1aa 100644 --- a/cpp/src/arrow/compute/exec/util_test.cc +++ b/cpp/src/arrow/compute/exec/util_test.cc @@ -17,6 +17,7 @@ #include "arrow/compute/exec/hash_join_node.h" #include "arrow/compute/exec/schema_util.h" +#include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" @@ -128,5 +129,60 @@ TEST(FieldMap, TwoKeyFields) { }))); } +TEST(FieldMap, ExtensionTypeSwissJoin) { + // For simpler types swiss join will be used. 
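The ModifyExpression template moved into util.h above performs a pre-order visit, recurses into call arguments, and rebuilds a call only when at least one argument actually changed, handing the unmodified node to post_call for comparison. A toy re-implementation of that rewrite pattern over an invented Expr tree (not Arrow's Expression API), just to make the visitation order and copy-on-modify behaviour concrete:

    #include <functional>
    #include <iostream>
    #include <memory>
    #include <string>
    #include <vector>

    // Toy expression: either a literal or a named call over sub-expressions.
    struct Expr {
      std::string name;  // e.g. "add" or "lit"
      int value = 0;     // only meaningful for literals
      std::vector<std::shared_ptr<const Expr>> args;
    };
    using ExprPtr = std::shared_ptr<const Expr>;

    // Pre-order/post-order rewrite in the spirit of ModifyExpression: `pre` sees
    // every node before its arguments, `post_call` sees only calls after their
    // arguments, and nodes are copied only when a child actually changed.
    ExprPtr Modify(ExprPtr expr,
                   const std::function<ExprPtr(ExprPtr)>& pre,
                   const std::function<ExprPtr(ExprPtr, const Expr*)>& post_call) {
      expr = pre(expr);
      if (expr->args.empty()) return expr;  // literal: nothing more to do

      bool modified = false;
      std::vector<ExprPtr> new_args = expr->args;
      for (auto& arg : new_args) {
        ExprPtr rewritten = Modify(arg, pre, post_call);
        if (rewritten != arg) {
          arg = rewritten;
          modified = true;
        }
      }
      if (modified) {
        auto copy = std::make_shared<Expr>(*expr);
        copy->args = std::move(new_args);
        return post_call(copy, expr.get());  // second argument: the original node
      }
      return post_call(expr, nullptr);  // unchanged: no original pointer
    }

    int main() {
      auto lit = [](int v) { return std::make_shared<Expr>(Expr{"lit", v, {}}); };
      auto add = std::make_shared<Expr>(Expr{"add", 0, {lit(1), lit(2)}});

      // Rewrite every literal 1 into a literal 10, leave calls untouched.
      auto pre = [&](ExprPtr e) -> ExprPtr {
        if (e->name == "lit" && e->value == 1) return lit(10);
        return e;
      };
      auto post_call = [](ExprPtr e, const Expr* old) -> ExprPtr {
        std::cout << e->name << (old ? " (arguments changed)" : " (unchanged)") << "\n";
        return e;
      };
      Modify(add, pre, post_call);
      return 0;
    }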
+ HashJoinSchema schema_mgr; + + auto left = schema({field("i32", int32()), field("ext", uuid())}); + auto right = schema({field("i32", int32())}); + + ASSERT_OK(schema_mgr.Init(JoinType::INNER, *left, {"i32"}, *right, {"i32"}, + literal(true), kLeftSuffix, kRightSuffix)); + + EXPECT_EQ(schema_mgr.proj_maps[0].num_cols(HashJoinProjection::INPUT), 2); + EXPECT_EQ(schema_mgr.proj_maps[0].num_cols(HashJoinProjection::KEY), 1); + EXPECT_EQ(schema_mgr.proj_maps[1].num_cols(HashJoinProjection::KEY), 1); + EXPECT_EQ(schema_mgr.proj_maps[0].num_cols(HashJoinProjection::OUTPUT), 2); + + auto output = schema_mgr.MakeOutputSchema(kLeftSuffix, kRightSuffix); + EXPECT_THAT(*output, Eq(Schema({field("i32.left", int32()), field("ext", uuid()), + field("i32.right", int32())}))); + + auto i = + schema_mgr.proj_maps[0].map(HashJoinProjection::INPUT, HashJoinProjection::OUTPUT); + EXPECT_EQ(i.get(0), 0); +} + +TEST(FieldMap, ExtensionTypeHashJoin) { + // Swiss join doesn't support dictionaries so HashJoin will be used. + HashJoinSchema schema_mgr; + + auto dict_type = dictionary(int64(), int8()); + auto left = schema({field("i32", int32()), field("ext", uuid())}); + auto right = schema({field("i32", int32()), field("dict_type", dict_type)}); + + ASSERT_OK(schema_mgr.Init(JoinType::INNER, *left, {"i32"}, *right, {"i32"}, + literal(true), kLeftSuffix, kRightSuffix)); + + EXPECT_EQ(schema_mgr.proj_maps[0].num_cols(HashJoinProjection::INPUT), 2); + EXPECT_EQ(schema_mgr.proj_maps[1].num_cols(HashJoinProjection::INPUT), 2); + EXPECT_EQ(schema_mgr.proj_maps[0].num_cols(HashJoinProjection::KEY), 1); + EXPECT_EQ(schema_mgr.proj_maps[1].num_cols(HashJoinProjection::KEY), 1); + EXPECT_EQ(schema_mgr.proj_maps[0].num_cols(HashJoinProjection::OUTPUT), 2); + EXPECT_EQ(schema_mgr.proj_maps[1].num_cols(HashJoinProjection::OUTPUT), 2); + + auto output = schema_mgr.MakeOutputSchema(kLeftSuffix, kRightSuffix); + EXPECT_THAT(*output, Eq(Schema({ + field("i32.left", int32()), + field("ext", uuid()), + field("i32.right", int32()), + field("dict_type", dict_type), + }))); + + auto i = + schema_mgr.proj_maps[0].map(HashJoinProjection::INPUT, HashJoinProjection::OUTPUT); + EXPECT_EQ(i.get(0), 0); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec_test.cc b/cpp/src/arrow/compute/exec_test.cc index c31309da931..cab9bd6a1d6 100644 --- a/cpp/src/arrow/compute/exec_test.cc +++ b/cpp/src/arrow/compute/exec_test.cc @@ -35,6 +35,7 @@ #include "arrow/compute/kernel.h" #include "arrow/compute/registry.h" #include "arrow/memory_pool.h" +#include "arrow/record_batch.h" #include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/type.h" @@ -43,7 +44,6 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/cpu_info.h" #include "arrow/util/logging.h" -#include "arrow/util/make_unique.h" namespace arrow { @@ -56,6 +56,68 @@ using ::arrow::internal::BitmapEquals; using ::arrow::internal::CopyBitmap; using ::arrow::internal::CountSetBits; +TEST(ExecBatch, SliceBasics) { + int64_t length = 4, cut_length = 2, left_length = length - cut_length; + ExecBatch batch{{Int32Scalar(0), ArrayFromJSON(utf8(), R"(["a", "b", "c", "d"])"), + ChunkedArrayFromJSON(float64(), {"[1.1]", "[2.2]", "[3.3]", "[4.4]"})}, + length}; + std::vector expected_sliced{ + {{Int32Scalar(0), ArrayFromJSON(utf8(), R"(["a", "b"])"), + ChunkedArrayFromJSON(float64(), {"[1.1]", "[2.2]"})}, + cut_length}, + {{Int32Scalar(0), ArrayFromJSON(utf8(), R"(["c", "d"])"), + ChunkedArrayFromJSON(float64(), {"[3.3]", "[4.4]"})}, + left_length}}; + 
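  // Expectation encoded above: Slice() trims each array and chunked-array value
  // to the requested window and shrinks `length` accordingly, while scalar
  // values (the Int32Scalar here) are carried into every slice unchanged.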
std::vector actual_sliced = {batch.Slice(0, cut_length), + batch.Slice(cut_length, left_length)}; + for (size_t i = 0; i < expected_sliced.size(); i++) { + ASSERT_EQ(expected_sliced[i].length, actual_sliced[i].length); + ASSERT_EQ(expected_sliced[i].values.size(), actual_sliced[i].values.size()); + for (size_t j = 0; j < expected_sliced[i].values.size(); j++) { + AssertDatumsEqual(expected_sliced[i].values[j], actual_sliced[i].values[j]); + } + ASSERT_EQ(expected_sliced[i].ToString(), actual_sliced[i].ToString()); + } +} + +TEST(ExecBatch, ToRecordBatch) { + auto i32_array = ArrayFromJSON(int32(), "[0, 1, 2]"); + auto utf8_array = ArrayFromJSON(utf8(), R"(["a", "b", "c"])"); + ExecBatch exec_batch({Datum(i32_array), Datum(utf8_array)}, 3); + + auto right_schema = schema({field("a", int32()), field("b", utf8())}); + ASSERT_OK_AND_ASSIGN(auto right_record_batch, exec_batch.ToRecordBatch(right_schema)); + ASSERT_OK(right_record_batch->ValidateFull()); + auto expected_batch = RecordBatchFromJSON(right_schema, R"([ + {"a": 0, "b": "a"}, + {"a": 1, "b": "b"}, + {"a": 2, "b": "c"} + ])"); + AssertBatchesEqual(*right_record_batch, *expected_batch); + + // With a scalar column + auto utf8_scalar = ScalarFromJSON(utf8(), R"("z")"); + exec_batch = ExecBatch({Datum(i32_array), Datum(utf8_scalar)}, 3); + ASSERT_OK_AND_ASSIGN(right_record_batch, exec_batch.ToRecordBatch(right_schema)); + ASSERT_OK(right_record_batch->ValidateFull()); + expected_batch = RecordBatchFromJSON(right_schema, R"([ + {"a": 0, "b": "z"}, + {"a": 1, "b": "z"}, + {"a": 2, "b": "z"} + ])"); + AssertBatchesEqual(*right_record_batch, *expected_batch); + + // Wrong number of fields in schema + auto reject_schema = + schema({field("a", int32()), field("b", utf8()), field("c", float64())}); + ASSERT_RAISES(Invalid, exec_batch.ToRecordBatch(reject_schema)); + + // Wrong-kind exec batch (not really valid, but test it here anyway) + ExecBatch miskinded_batch({Datum()}, 0); + auto null_schema = schema({field("a", null())}); + ASSERT_RAISES(TypeError, miskinded_batch.ToRecordBatch(null_schema)); +} + TEST(ExecContext, BasicWorkings) { { ExecContext ctx; @@ -766,7 +828,7 @@ TEST_F(TestExecSpanIterator, ChunkedArrays) { } TEST_F(TestExecSpanIterator, ZeroLengthInputs) { - auto carr = std::shared_ptr(new ChunkedArray({}, int32())); + auto carr = std::make_shared(ArrayVector{}, int32()); auto CheckArgs = [&](const ExecBatch& batch) { ExecSpanIterator iterator; @@ -883,7 +945,7 @@ class ExampleOptionsType : public FunctionOptionsType { } std::unique_ptr Copy(const FunctionOptions& options) const override { const auto& opts = static_cast(options); - return arrow::internal::make_unique(opts.value); + return std::make_unique(opts.value); } }; ExampleOptions::ExampleOptions(std::shared_ptr value) @@ -897,7 +959,7 @@ struct ExampleState : public KernelState { Result> InitStateful(KernelContext*, const KernelInitArgs& args) { auto func_options = static_cast(args.options); - return std::unique_ptr(new ExampleState{func_options->value}); + return std::make_unique(func_options ? 
func_options->value : nullptr); } Status ExecStateful(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { @@ -1011,36 +1073,134 @@ class TestCallScalarFunction : public TestComputeInternals { bool TestCallScalarFunction::initialized_ = false; -TEST_F(TestCallScalarFunction, ArgumentValidation) { +class FunctionCaller { + public: + virtual ~FunctionCaller() = default; + + virtual Result Call(const std::vector& args, + const FunctionOptions* options, + ExecContext* ctx = NULLPTR) = 0; + virtual Result Call(const std::vector& args, + ExecContext* ctx = NULLPTR) = 0; +}; + +using FunctionCallerMaker = std::function>( + const std::string& func_name, std::vector in_types)>; + +class SimpleFunctionCaller : public FunctionCaller { + public: + explicit SimpleFunctionCaller(const std::string& func_name) : func_name(func_name) {} + + static Result> Make(const std::string& func_name) { + return std::make_shared(func_name); + } + + static Result> Maker(const std::string& func_name, + std::vector in_types) { + return Make(func_name); + } + + Result Call(const std::vector& args, const FunctionOptions* options, + ExecContext* ctx) override { + return CallFunction(func_name, args, options, ctx); + } + Result Call(const std::vector& args, ExecContext* ctx) override { + return CallFunction(func_name, args, ctx); + } + + std::string func_name; +}; + +class ExecFunctionCaller : public FunctionCaller { + public: + explicit ExecFunctionCaller(std::shared_ptr func_exec) + : func_exec(std::move(func_exec)) {} + + static Result> Make( + const std::string& func_name, const std::vector& args, + const FunctionOptions* options = nullptr, + FunctionRegistry* func_registry = nullptr) { + ARROW_ASSIGN_OR_RAISE(auto func_exec, + GetFunctionExecutor(func_name, args, options, func_registry)); + return std::make_shared(std::move(func_exec)); + } + + static Result> Make( + const std::string& func_name, std::vector in_types, + const FunctionOptions* options = nullptr, + FunctionRegistry* func_registry = nullptr) { + ARROW_ASSIGN_OR_RAISE( + auto func_exec, GetFunctionExecutor(func_name, in_types, options, func_registry)); + return std::make_shared(std::move(func_exec)); + } + + static Result> Maker(const std::string& func_name, + std::vector in_types) { + return Make(func_name, std::move(in_types)); + } + + Result Call(const std::vector& args, const FunctionOptions* options, + ExecContext* ctx) override { + ARROW_RETURN_NOT_OK(func_exec->Init(options, ctx)); + return func_exec->Execute(args); + } + Result Call(const std::vector& args, ExecContext* ctx) override { + return Call(args, nullptr, ctx); + } + + std::shared_ptr func_exec; +}; + +class TestCallScalarFunctionArgumentValidation : public TestCallScalarFunction { + protected: + void DoTest(FunctionCallerMaker caller_maker); +}; + +void TestCallScalarFunctionArgumentValidation::DoTest(FunctionCallerMaker caller_maker) { + ASSERT_OK_AND_ASSIGN(auto test_copy, caller_maker("test_copy", {int32()})); + // Copy accepts only a single array argument Datum d1(GetInt32Array(10)); // Too many args std::vector args = {d1, d1}; - ASSERT_RAISES(Invalid, CallFunction("test_copy", args)); + ASSERT_RAISES(Invalid, test_copy->Call(args)); // Too few args = {}; - ASSERT_RAISES(Invalid, CallFunction("test_copy", args)); + ASSERT_RAISES(Invalid, test_copy->Call(args)); // Cannot do scalar Datum d1_scalar(std::make_shared(5)); - ASSERT_OK_AND_ASSIGN(auto result, CallFunction("test_copy", {d1})); - ASSERT_OK_AND_ASSIGN(result, CallFunction("test_copy", {d1_scalar})); + 
ASSERT_OK_AND_ASSIGN(auto result, test_copy->Call({d1})); + ASSERT_OK_AND_ASSIGN(result, test_copy->Call({d1_scalar})); } -TEST_F(TestCallScalarFunction, PreallocationCases) { +TEST_F(TestCallScalarFunctionArgumentValidation, SimpleCall) { + TestCallScalarFunctionArgumentValidation::DoTest(SimpleFunctionCaller::Maker); +} + +TEST_F(TestCallScalarFunctionArgumentValidation, ExecCall) { + TestCallScalarFunctionArgumentValidation::DoTest(ExecFunctionCaller::Maker); +} + +class TestCallScalarFunctionPreallocationCases : public TestCallScalarFunction { + protected: + void DoTest(FunctionCallerMaker caller_maker); +}; + +void TestCallScalarFunctionPreallocationCases::DoTest(FunctionCallerMaker caller_maker) { double null_prob = 0.2; auto arr = GetUInt8Array(100, null_prob); - auto CheckFunction = [&](std::string func_name) { + auto CheckFunction = [&](std::shared_ptr test_copy) { ResetContexts(); // The default should be a single array output { std::vector args = {Datum(arr)}; - ASSERT_OK_AND_ASSIGN(Datum result, CallFunction(func_name, args)); + ASSERT_OK_AND_ASSIGN(Datum result, test_copy->Call(args)); ASSERT_EQ(Datum::ARRAY, result.kind()); AssertArraysEqual(*arr, *result.make_array()); } @@ -1050,7 +1210,7 @@ TEST_F(TestCallScalarFunction, PreallocationCases) { { std::vector args = {Datum(arr)}; exec_ctx_->set_exec_chunksize(80); - ASSERT_OK_AND_ASSIGN(Datum result, CallFunction(func_name, args, exec_ctx_.get())); + ASSERT_OK_AND_ASSIGN(Datum result, test_copy->Call(args, exec_ctx_.get())); AssertArraysEqual(*arr, *result.make_array()); } @@ -1058,16 +1218,16 @@ TEST_F(TestCallScalarFunction, PreallocationCases) { // Chunksize not multiple of 8 std::vector args = {Datum(arr)}; exec_ctx_->set_exec_chunksize(11); - ASSERT_OK_AND_ASSIGN(Datum result, CallFunction(func_name, args, exec_ctx_.get())); + ASSERT_OK_AND_ASSIGN(Datum result, test_copy->Call(args, exec_ctx_.get())); AssertArraysEqual(*arr, *result.make_array()); } // Input is chunked, output has one big chunk { - auto carr = std::shared_ptr( - new ChunkedArray({arr->Slice(0, 10), arr->Slice(10)})); + auto carr = + std::make_shared(ArrayVector{arr->Slice(0, 10), arr->Slice(10)}); std::vector args = {Datum(carr)}; - ASSERT_OK_AND_ASSIGN(Datum result, CallFunction(func_name, args, exec_ctx_.get())); + ASSERT_OK_AND_ASSIGN(Datum result, test_copy->Call(args, exec_ctx_.get())); std::shared_ptr actual = result.chunked_array(); ASSERT_EQ(1, actual->num_chunks()); AssertChunkedEquivalent(*carr, *actual); @@ -1078,7 +1238,7 @@ TEST_F(TestCallScalarFunction, PreallocationCases) { std::vector args = {Datum(arr)}; exec_ctx_->set_preallocate_contiguous(false); exec_ctx_->set_exec_chunksize(40); - ASSERT_OK_AND_ASSIGN(Datum result, CallFunction(func_name, args, exec_ctx_.get())); + ASSERT_OK_AND_ASSIGN(Datum result, test_copy->Call(args, exec_ctx_.get())); ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); const ChunkedArray& carr = *result.chunked_array(); ASSERT_EQ(3, carr.num_chunks()); @@ -1088,11 +1248,28 @@ TEST_F(TestCallScalarFunction, PreallocationCases) { } }; - CheckFunction("test_copy"); - CheckFunction("test_copy_computed_bitmap"); + ASSERT_OK_AND_ASSIGN(auto test_copy, caller_maker("test_copy", {uint8()})); + CheckFunction(test_copy); + ASSERT_OK_AND_ASSIGN(auto test_copy_computed_bitmap, + caller_maker("test_copy_computed_bitmap", {uint8()})); + CheckFunction(test_copy_computed_bitmap); } -TEST_F(TestCallScalarFunction, BasicNonStandardCases) { +TEST_F(TestCallScalarFunctionPreallocationCases, SimpleCaller) { + 
TestCallScalarFunctionPreallocationCases::DoTest(SimpleFunctionCaller::Maker); +} + +TEST_F(TestCallScalarFunctionPreallocationCases, ExecCaller) { + TestCallScalarFunctionPreallocationCases::DoTest(ExecFunctionCaller::Maker); +} + +class TestCallScalarFunctionBasicNonStandardCases : public TestCallScalarFunction { + protected: + void DoTest(FunctionCallerMaker caller_maker); +}; + +void TestCallScalarFunctionBasicNonStandardCases::DoTest( + FunctionCallerMaker caller_maker) { // Test a handful of cases // // * Validity bitmap computed by kernel rather than using PropagateNulls @@ -1104,19 +1281,19 @@ TEST_F(TestCallScalarFunction, BasicNonStandardCases) { auto arr = GetUInt8Array(1000, null_prob); std::vector args = {Datum(arr)}; - auto CheckFunction = [&](std::string func_name) { + auto CheckFunction = [&](std::shared_ptr test_nopre) { ResetContexts(); // The default should be a single array output { - ASSERT_OK_AND_ASSIGN(Datum result, CallFunction(func_name, args)); + ASSERT_OK_AND_ASSIGN(Datum result, test_nopre->Call(args)); AssertArraysEqual(*arr, *result.make_array(), true); } // Split execution into 3 chunks { exec_ctx_->set_exec_chunksize(400); - ASSERT_OK_AND_ASSIGN(Datum result, CallFunction(func_name, args, exec_ctx_.get())); + ASSERT_OK_AND_ASSIGN(Datum result, test_nopre->Call(args, exec_ctx_.get())); ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); const ChunkedArray& carr = *result.chunked_array(); ASSERT_EQ(3, carr.num_chunks()); @@ -1126,31 +1303,73 @@ TEST_F(TestCallScalarFunction, BasicNonStandardCases) { } }; - CheckFunction("test_nopre_data"); - CheckFunction("test_nopre_validity_or_data"); + ASSERT_OK_AND_ASSIGN(auto test_nopre_data, caller_maker("test_nopre_data", {uint8()})); + CheckFunction(test_nopre_data); + ASSERT_OK_AND_ASSIGN(auto test_nopre_validity_or_data, + caller_maker("test_nopre_validity_or_data", {uint8()})); + CheckFunction(test_nopre_validity_or_data); +} + +TEST_F(TestCallScalarFunctionBasicNonStandardCases, SimpleCall) { + TestCallScalarFunctionBasicNonStandardCases::DoTest(SimpleFunctionCaller::Maker); } -TEST_F(TestCallScalarFunction, StatefulKernel) { +TEST_F(TestCallScalarFunctionBasicNonStandardCases, ExecCall) { + TestCallScalarFunctionBasicNonStandardCases::DoTest(ExecFunctionCaller::Maker); +} + +class TestCallScalarFunctionStatefulKernel : public TestCallScalarFunction { + protected: + void DoTest(FunctionCallerMaker caller_maker); +}; + +void TestCallScalarFunctionStatefulKernel::DoTest(FunctionCallerMaker caller_maker) { + ASSERT_OK_AND_ASSIGN(auto test_stateful, caller_maker("test_stateful", {int32()})); + auto input = ArrayFromJSON(int32(), "[1, 2, 3, null, 5]"); auto multiplier = std::make_shared(2); auto expected = ArrayFromJSON(int32(), "[2, 4, 6, null, 10]"); ExampleOptions options(multiplier); std::vector args = {Datum(input)}; - ASSERT_OK_AND_ASSIGN(Datum result, CallFunction("test_stateful", args, &options)); + ASSERT_OK_AND_ASSIGN(Datum result, test_stateful->Call(args, &options)); AssertArraysEqual(*expected, *result.make_array()); } -TEST_F(TestCallScalarFunction, ScalarFunction) { +TEST_F(TestCallScalarFunctionStatefulKernel, Simplecall) { + TestCallScalarFunctionStatefulKernel::DoTest(SimpleFunctionCaller::Maker); +} + +TEST_F(TestCallScalarFunctionStatefulKernel, ExecCall) { + TestCallScalarFunctionStatefulKernel::DoTest(ExecFunctionCaller::Maker); +} + +class TestCallScalarFunctionScalarFunction : public TestCallScalarFunction { + protected: + void DoTest(FunctionCallerMaker caller_maker); +}; + +void 
TestCallScalarFunctionScalarFunction::DoTest(FunctionCallerMaker caller_maker) { + ASSERT_OK_AND_ASSIGN(auto test_scalar_add_int32, + caller_maker("test_scalar_add_int32", {int32(), int32()})); + std::vector args = {Datum(std::make_shared(5)), Datum(std::make_shared(7))}; - ASSERT_OK_AND_ASSIGN(Datum result, CallFunction("test_scalar_add_int32", args)); + ASSERT_OK_AND_ASSIGN(Datum result, test_scalar_add_int32->Call(args)); ASSERT_EQ(Datum::SCALAR, result.kind()); auto expected = std::make_shared(12); ASSERT_TRUE(expected->Equals(*result.scalar())); } +TEST_F(TestCallScalarFunctionScalarFunction, SimpleCall) { + TestCallScalarFunctionScalarFunction::DoTest(SimpleFunctionCaller::Maker); +} + +TEST_F(TestCallScalarFunctionScalarFunction, ExecCall) { + TestCallScalarFunctionScalarFunction::DoTest(ExecFunctionCaller::Maker); +} + } // namespace detail } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc index 12d80a8c9ae..90e754f6150 100644 --- a/cpp/src/arrow/compute/function.cc +++ b/cpp/src/arrow/compute/function.cc @@ -97,6 +97,18 @@ Status Function::CheckArity(size_t num_args) const { return CheckArityImpl(*this, static_cast(num_args)); } +namespace { + +Status CheckOptions(const Function& function, const FunctionOptions* options) { + if (options == nullptr && function.doc().options_required) { + return Status::Invalid("Function '", function.name(), + "' cannot be called without options"); + } + return Status::OK(); +} + +} // namespace + namespace detail { Status NoMatchingKernel(const Function* func, const std::vector& types) { @@ -167,6 +179,118 @@ const Kernel* DispatchExactImpl(const Function* func, return nullptr; } +struct FunctionExecutorImpl : public FunctionExecutor { + FunctionExecutorImpl(std::vector in_types, const Kernel* kernel, + std::unique_ptr executor, + const Function& func) + : in_types(std::move(in_types)), + kernel(kernel), + kernel_ctx(default_exec_context(), kernel), + executor(std::move(executor)), + func(func), + state(), + options(NULLPTR), + inited(false) {} + virtual ~FunctionExecutorImpl() {} + + Status KernelInit(const FunctionOptions* options) { + RETURN_NOT_OK(CheckOptions(func, options)); + if (options == NULLPTR) { + options = func.default_options(); + } + if (kernel->init) { + ARROW_ASSIGN_OR_RAISE(state, + kernel->init(&kernel_ctx, {kernel, in_types, options})); + kernel_ctx.SetState(state.get()); + } + + RETURN_NOT_OK(executor->Init(&kernel_ctx, {kernel, in_types, options})); + this->options = options; + inited = true; + return Status::OK(); + } + + Status Init(const FunctionOptions* options, ExecContext* exec_ctx) override { + if (exec_ctx == NULLPTR) { + exec_ctx = default_exec_context(); + } + kernel_ctx = KernelContext{exec_ctx, kernel}; + return KernelInit(options); + } + + Result Execute(const std::vector& args, int64_t passed_length) override { + util::tracing::Span span; + + auto func_kind = func.kind(); + const auto& func_name = func.name(); + START_COMPUTE_SPAN(span, func_name, + {{"function.name", func_name}, + {"function.options", options ? 
options->ToString() : ""}, + {"function.kind", func_kind}}); + + if (in_types.size() != args.size()) { + return Status::Invalid("Execution of '", func_name, "' expected ", in_types.size(), + " arguments but got ", args.size()); + } + if (!inited) { + ARROW_RETURN_NOT_OK(Init(NULLPTR, default_exec_context())); + } + ExecContext* ctx = kernel_ctx.exec_context(); + // Cast arguments if necessary + std::vector args_with_cast(args.size()); + for (size_t i = 0; i != args.size(); ++i) { + const auto& in_type = in_types[i]; + auto arg = args[i]; + if (in_type != args[i].type()) { + ARROW_ASSIGN_OR_RAISE(arg, Cast(args[i], CastOptions::Safe(in_type), ctx)); + } + args_with_cast[i] = std::move(arg); + } + + detail::DatumAccumulator listener; + + ExecBatch input(std::move(args_with_cast), /*length=*/0); + if (input.num_values() == 0) { + if (passed_length != -1) { + input.length = passed_length; + } + } else { + bool all_same_length = false; + int64_t inferred_length = detail::InferBatchLength(input.values, &all_same_length); + input.length = inferred_length; + if (func_kind == Function::SCALAR) { + if (passed_length != -1 && passed_length != inferred_length) { + return Status::Invalid( + "Passed batch length for execution did not match actual" + " length of values for execution of scalar function '", + func_name, "'"); + } + } else if (func_kind == Function::VECTOR) { + auto vkernel = static_cast(kernel); + if (!all_same_length && vkernel->can_execute_chunkwise) { + return Status::Invalid("Arguments for execution of vector kernel function '", + func_name, "' must all be the same length"); + } + } + } + RETURN_NOT_OK(executor->Execute(input, &listener)); + const auto out = executor->WrapResults(input.values, listener.values()); +#ifndef NDEBUG + DCHECK_OK(executor->CheckResultType(out, func_name.c_str())); +#endif + return out; + } + + std::vector in_types; + const Kernel* kernel; + KernelContext kernel_ctx; + std::unique_ptr executor; + const Function& func; + std::unique_ptr state; + const FunctionOptions* options; + bool inited; +}; + } // namespace detail Result Function::DispatchExact( @@ -187,114 +311,34 @@ Result Function::DispatchBest(std::vector* values) co return DispatchExact(*values); } -namespace { - -Status CheckAllArrayOrScalar(const std::vector& values) { - for (const auto& value : values) { - if (!value.is_value()) { - return Status::Invalid("Tried executing function with non-value type: ", - value.ToString()); - } - } - return Status::OK(); -} - -Status CheckOptions(const Function& function, const FunctionOptions* options) { - if (options == nullptr && function.doc().options_required) { - return Status::Invalid("Function '", function.name(), - "' cannot be called without options"); - } - return Status::OK(); -} - -Result ExecuteInternal(const Function& func, std::vector args, - int64_t passed_length, const FunctionOptions* options, - ExecContext* ctx) { - std::unique_ptr default_ctx; - if (options == nullptr) { - RETURN_NOT_OK(CheckOptions(func, options)); - options = func.default_options(); - } - if (ctx == nullptr) { - default_ctx.reset(new ExecContext()); - ctx = default_ctx.get(); - } - - util::tracing::Span span; - - START_COMPUTE_SPAN(span, func.name(), - {{"function.name", func.name()}, - {"function.options", options ? options->ToString() : ""}, - {"function.kind", func.kind()}}); - - // type-check Datum arguments here. 
Really we'd like to avoid this as much as - // possible - RETURN_NOT_OK(CheckAllArrayOrScalar(args)); - std::vector in_types(args.size()); - for (size_t i = 0; i != args.size(); ++i) { - in_types[i] = args[i].type().get(); - } - +Result> Function::GetBestExecutor( + std::vector inputs) const { std::unique_ptr executor; - if (func.kind() == Function::SCALAR) { + if (kind() == Function::SCALAR) { executor = detail::KernelExecutor::MakeScalar(); - } else if (func.kind() == Function::VECTOR) { + } else if (kind() == Function::VECTOR) { executor = detail::KernelExecutor::MakeVector(); - } else if (func.kind() == Function::SCALAR_AGGREGATE) { + } else if (kind() == Function::SCALAR_AGGREGATE) { executor = detail::KernelExecutor::MakeScalarAggregate(); } else { return Status::NotImplemented("Direct execution of HASH_AGGREGATE functions"); } - ARROW_ASSIGN_OR_RAISE(const Kernel* kernel, func.DispatchBest(&in_types)); - - // Cast arguments if necessary - for (size_t i = 0; i != args.size(); ++i) { - if (in_types[i] != args[i].type()) { - ARROW_ASSIGN_OR_RAISE(args[i], Cast(args[i], CastOptions::Safe(in_types[i]), ctx)); - } - } - - KernelContext kernel_ctx{ctx, kernel}; + ARROW_ASSIGN_OR_RAISE(const Kernel* kernel, DispatchBest(&inputs)); - std::unique_ptr state; - if (kernel->init) { - ARROW_ASSIGN_OR_RAISE(state, kernel->init(&kernel_ctx, {kernel, in_types, options})); - kernel_ctx.SetState(state.get()); - } - - RETURN_NOT_OK(executor->Init(&kernel_ctx, {kernel, in_types, options})); + return std::make_shared(std::move(inputs), kernel, + std::move(executor), *this); +} - detail::DatumAccumulator listener; +namespace { - ExecBatch input(std::move(args), /*length=*/0); - if (input.num_values() == 0) { - if (passed_length != -1) { - input.length = passed_length; - } - } else { - bool all_same_length = false; - int64_t inferred_length = detail::InferBatchLength(input.values, &all_same_length); - input.length = inferred_length; - if (func.kind() == Function::SCALAR) { - if (passed_length != -1 && passed_length != inferred_length) { - return Status::Invalid( - "Passed batch length for execution did not match actual" - " length of values for scalar function execution"); - } - } else if (func.kind() == Function::VECTOR) { - auto vkernel = static_cast(kernel); - if (!(all_same_length || !vkernel->can_execute_chunkwise)) { - return Status::Invalid("Vector kernel arguments must all be the same length"); - } - } - } - RETURN_NOT_OK(executor->Execute(input, &listener)); - const auto out = executor->WrapResults(input.values, listener.values()); -#ifndef NDEBUG - DCHECK_OK(executor->CheckResultType(out, func.name().c_str())); -#endif - return out; +Result ExecuteInternal(const Function& func, std::vector args, + int64_t passed_length, const FunctionOptions* options, + ExecContext* ctx) { + ARROW_ASSIGN_OR_RAISE(auto inputs, internal::GetFunctionArgumentTypes(args)); + ARROW_ASSIGN_OR_RAISE(auto func_exec, func.GetBestExecutor(inputs)); + ARROW_RETURN_NOT_OK(func_exec->Init(options, ctx)); + return func_exec->Execute(args, passed_length); } } // namespace diff --git a/cpp/src/arrow/compute/function.h b/cpp/src/arrow/compute/function.h index 7f2fba68caf..8a1b0da424a 100644 --- a/cpp/src/arrow/compute/function.h +++ b/cpp/src/arrow/compute/function.h @@ -159,6 +159,29 @@ struct ARROW_EXPORT FunctionDoc { static const FunctionDoc& Empty(); }; +/// \brief An executor of a function with a preconfigured kernel +class ARROW_EXPORT FunctionExecutor { + public: + virtual ~FunctionExecutor() = default; + /// \brief 
Initialize or re-initialize the preconfigured kernel + /// + /// This method may be called zero or more times. Depending on how + /// the FunctionExecutor was obtained, it may already have been initialized. + virtual Status Init(const FunctionOptions* options = NULLPTR, + ExecContext* exec_ctx = NULLPTR) = 0; + /// \brief Execute the preconfigured kernel with arguments that must fit it + /// + /// The method requires the arguments be castable to the preconfigured types. + /// + /// \param[in] args Arguments to execute the function on + /// \param[in] length Length of arguments batch or -1 to default it. If the + /// function has no parameters, this determines the batch length, defaulting + /// to 0. Otherwise, if the function is scalar, this must equal the argument + /// batch's inferred length or be -1 to default to it. This is ignored for + /// vector functions. + virtual Result Execute(const std::vector& args, int64_t length = -1) = 0; +}; + /// \brief Base class for compute functions. Function implementations contain a /// collection of "kernels" which are implementations of the function for /// specific argument types. Selecting a viable kernel for executing a function @@ -225,6 +248,13 @@ class ARROW_EXPORT Function { /// required by the kernel. virtual Result DispatchBest(std::vector* values) const; + /// \brief Get a function executor with a best-matching kernel + /// + /// The returned executor will by default work with the default FunctionOptions + /// and KernelContext. If you want to change that, call `FunctionExecutor::Init`. + virtual Result> GetBestExecutor( + std::vector inputs) const; + /// \brief Execute the function eagerly with the passed input arguments with /// kernel dispatch, batch iteration, and memory allocation details taken /// care of. diff --git a/cpp/src/arrow/compute/function_internal.cc b/cpp/src/arrow/compute/function_internal.cc index 0a926e0a39c..cd73462e953 100644 --- a/cpp/src/arrow/compute/function_internal.cc +++ b/cpp/src/arrow/compute/function_internal.cc @@ -108,6 +108,27 @@ Result> DeserializeFunctionOptions( return FunctionOptionsFromStructScalar(scalar); } +Status CheckAllArrayOrScalar(const std::vector& values) { + for (const auto& value : values) { + if (!value.is_value()) { + return Status::TypeError( + "Tried executing function with non-array, non-scalar type: ", value.ToString()); + } + } + return Status::OK(); +} + +Result> GetFunctionArgumentTypes(const std::vector& args) { + // type-check Datum arguments here. 
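// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: intended usage of the
// FunctionExecutor API documented above. The function name "add", the int32
// scalar inputs, and the helper name ExampleFunctionExecutorUsage are
// assumptions for illustration only.
Result<Datum> ExampleFunctionExecutorUsage() {
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Function> func,
                        GetFunctionRegistry()->GetFunction("add"));
  std::vector<TypeHolder> in_types = {int32(), int32()};
  // Dispatch once; the returned executor caches the selected kernel.
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<FunctionExecutor> executor,
                        func->GetBestExecutor(in_types));
  // Optional: re-initialize with explicit options and exec context. If Init()
  // is never called, Execute() falls back to the defaults.
  ARROW_RETURN_NOT_OK(executor->Init(/*options=*/NULLPTR, default_exec_context()));
  std::vector<Datum> args = {Datum(std::make_shared<Int32Scalar>(1)),
                             Datum(std::make_shared<Int32Scalar>(2))};
  // Repeated Execute() calls reuse the dispatched kernel (no re-dispatch).
  return executor->Execute(args);
}
// ---------------------------------------------------------------------------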
Really we'd like to avoid this as much as + // possible + RETURN_NOT_OK(CheckAllArrayOrScalar(args)); + std::vector inputs(args.size()); + for (size_t i = 0; i != args.size(); ++i) { + inputs[i] = TypeHolder(args[i].type()); + } + return inputs; +} + } // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/function_internal.h b/cpp/src/arrow/compute/function_internal.h index 17261332619..cbf9d828741 100644 --- a/cpp/src/arrow/compute/function_internal.h +++ b/cpp/src/arrow/compute/function_internal.h @@ -74,7 +74,7 @@ Result ValidateEnumValue(CType raw) { return Status::Invalid("Invalid value for ", EnumTraits::name(), ": ", raw); } -class GenericOptionsType : public FunctionOptionsType { +class ARROW_EXPORT GenericOptionsType : public FunctionOptionsType { public: Result> Serialize(const FunctionOptions&) const override; Result> Deserialize( @@ -103,6 +103,12 @@ static inline enable_if_t::value, std::string> GenericToStri return ss.str(); } +template +static inline enable_if_t::value, std::string> GenericToString( + const std::optional& value) { + return value.has_value() ? GenericToString(value.value()) : "nullopt"; +} + static inline std::string GenericToString(bool value) { return value ? "true" : "false"; } static inline std::string GenericToString(const std::string& value) { @@ -277,6 +283,12 @@ static inline Result()))> GenericToScalar( return MakeScalar(value); } +template +static inline Result()))> GenericToScalar( + const std::optional& value) { + return value.has_value() ? MakeScalar(value.value()) : MakeScalar(nullptr); +} + // For Clang/libc++: when iterating through vector, we can't // pass it by reference so the overload above doesn't apply static inline Result> GenericToScalar(bool value) { @@ -392,6 +404,26 @@ GenericFromScalar(const std::shared_ptr& value) { return ValidateEnumValue(raw_val); } +template +constexpr bool is_optional_impl = false; +template +constexpr bool is_optional_impl> = true; + +template +using is_optional = + std::integral_constant> || + std::is_same::value>; + +template +using enable_if_optional = enable_if_t::value, Result>; + +template +static inline enable_if_optional GenericFromScalar( + const std::shared_ptr& value) { + using value_type = typename T::value_type; + return GenericFromScalar(value); +} + template using enable_if_same_result = enable_if_same>; @@ -647,13 +679,13 @@ const FunctionOptionsType* GetFunctionOptionsType(const Properties&... propertie } Result> FromStructScalar( const StructScalar& scalar) const override { - auto options = std::unique_ptr(new Options()); + auto options = std::make_unique(); RETURN_NOT_OK( FromStructScalarImpl(options.get(), scalar, properties_).status_); return std::move(options); } std::unique_ptr Copy(const FunctionOptions& options) const override { - auto out = std::unique_ptr(new Options()); + auto out = std::make_unique(); CopyImpl(out.get(), checked_cast(options), properties_); return std::move(out); } @@ -664,6 +696,11 @@ const FunctionOptionsType* GetFunctionOptionsType(const Properties&... 
propertie return &instance; } +Status CheckAllArrayOrScalar(const std::vector& values); + +ARROW_EXPORT +Result> GetFunctionArgumentTypes(const std::vector& args); + } // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc index ea151e81f0b..b71e5a12b50 100644 --- a/cpp/src/arrow/compute/function_test.cc +++ b/cpp/src/arrow/compute/function_test.cc @@ -23,16 +23,20 @@ #include #include +#include "arrow/array/builder_primitive.h" #include "arrow/compute/api_aggregate.h" #include "arrow/compute/api_scalar.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/cast.h" +#include "arrow/compute/function_internal.h" #include "arrow/compute/kernel.h" #include "arrow/datum.h" #include "arrow/status.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" #include "arrow/type.h" #include "arrow/util/key_value_metadata.h" +#include "arrow/util/logging.h" namespace arrow { namespace compute { @@ -351,5 +355,106 @@ TEST(ScalarAggregateFunction, DispatchExact) { ASSERT_TRUE(selected_kernel->signature->MatchesInputs(dispatch_args)); } +namespace { + +struct TestFunctionOptions : public FunctionOptions { + TestFunctionOptions(); + + static const char* kTypeName; + + int value; +}; + +static auto kTestFunctionOptionsType = + internal::GetFunctionOptionsType(); + +TestFunctionOptions::TestFunctionOptions() : FunctionOptions(kTestFunctionOptionsType) {} + +const char* TestFunctionOptions::kTypeName = "test_options"; + +} // namespace + +TEST(FunctionExecutor, Basics) { + VectorFunction func("vector_test", Arity::Binary(), /*doc=*/FunctionDoc::Empty()); + int init_calls = 0; + int expected_optval = 0; + ExecContext exec_ctx; + TestFunctionOptions options; + options.value = 1; + auto init = + [&](KernelContext* kernel_ctx, + const KernelInitArgs& init_args) -> Result> { + if (&exec_ctx != kernel_ctx->exec_context()) { + return Status::Invalid("expected exec context not found in kernel context"); + } + if (init_args.options != nullptr) { + const auto* test_opts = checked_cast(init_args.options); + if (test_opts->value != expected_optval) { + return Status::Invalid("bad options value"); + } + } + if (&options != init_args.options) { + return Status::Invalid("expected options not found in kernel init args"); + } + ++init_calls; + return nullptr; + }; + auto exec = [](KernelContext* ctx, const ExecSpan& args, ExecResult* out) -> Status { + [&]() { // gtest ASSERT macros require a void function + ASSERT_EQ(2, args.values.size()); + const int32_t* vals[2]; + for (size_t i = 0; i < 2; i++) { + ASSERT_TRUE(args.values[i].is_array()); + const ArraySpan& array = args.values[i].array; + ASSERT_EQ(array.type->id(), Type::INT32); + vals[i] = array.GetValues(1); + } + ASSERT_TRUE(out->is_array_data()); + auto out_data = out->array_data(); + Int32Builder builder; + for (int64_t i = 0; i < args.length; i++) { + ASSERT_OK(builder.Append(vals[0][i] + vals[1][i])); + } + ASSERT_OK_AND_ASSIGN(auto array, builder.Finish()); + *out_data.get() = *array->data(); + }(); + return Status::OK(); + }; + std::vector in_types = {int32(), int32()}; + OutputType out_type = int32(); + ASSERT_OK(func.AddKernel(in_types, out_type, exec, init)); + + ASSERT_OK_AND_ASSIGN(const Kernel* dispatched, func.DispatchExact({int32(), int32()})); + ASSERT_EQ(exec, static_cast(dispatched)->exec); + std::vector inputs = {int32(), int32()}; + + ASSERT_OK_AND_ASSIGN(auto func_exec, func.GetBestExecutor(inputs)); + ASSERT_EQ(0, 
init_calls); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("options not found"), + func_exec->Init(nullptr, &exec_ctx)); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("bad options value"), + func_exec->Init(&options, &exec_ctx)); + ExecContext other_exec_ctx; + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("exec context not found"), + func_exec->Init(&options, &other_exec_ctx)); + + ArrayVector arrays = {ArrayFromJSON(int32(), "[1]"), ArrayFromJSON(int32(), "[2]"), + ArrayFromJSON(int32(), "[3]"), ArrayFromJSON(int32(), "[4]")}; + ArrayVector expected = {ArrayFromJSON(int32(), "[3]"), ArrayFromJSON(int32(), "[5]"), + ArrayFromJSON(int32(), "[7]")}; + for (int n = 1; n <= 3; n++) { + expected_optval = options.value = n; + ASSERT_OK(func_exec->Init(&options, &exec_ctx)); + ASSERT_EQ(n, init_calls); + for (int32_t i = 1; i <= 3; i++) { + std::vector values = {arrays[i - 1], arrays[i]}; + ASSERT_OK_AND_ASSIGN(auto result, func_exec->Execute(values, 1)); + ASSERT_TRUE(result.is_array()); + auto actual = result.make_array(); + AssertArraysEqual(*expected[i - 1], *actual); + } + } +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 780699886d2..5eadf5d0ea0 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -27,6 +27,7 @@ add_arrow_compute_test(scalar_test scalar_if_else_test.cc scalar_nested_test.cc scalar_random_test.cc + scalar_round_arithmetic_test.cc scalar_set_lookup_test.cc scalar_string_test.cc scalar_temporal_test.cc @@ -39,6 +40,7 @@ add_arrow_benchmark(scalar_cast_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_compare_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_if_else_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_random_benchmark PREFIX "arrow-compute") +add_arrow_benchmark(scalar_round_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_set_lookup_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_string_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_temporal_benchmark PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 400ccbdf9f6..c2ea04d492b 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -22,7 +22,8 @@ #include "arrow/compute/kernels/util_internal.h" #include "arrow/util/cpu_info.h" #include "arrow/util/hashing.h" -#include "arrow/util/make_unique.h" + +#include namespace arrow { namespace compute { @@ -119,8 +120,7 @@ struct CountImpl : public ScalarAggregator { Result> CountInit(KernelContext*, const KernelInitArgs& args) { - return ::arrow::internal::make_unique( - static_cast(*args.options)); + return std::make_unique(static_cast(*args.options)); } // ---------------------------------------------------------------------- @@ -194,7 +194,7 @@ struct CountDistinctImpl : public ScalarAggregator { template Result> CountDistinctInit(KernelContext* ctx, const KernelInitArgs& args) { - return ::arrow::internal::make_unique>( + return std::make_unique>( ctx->memory_pool(), static_cast(*args.options)); } @@ -233,11 +233,11 @@ void AddCountDistinctKernels(ScalarAggregateFunction* func) { AddCountDistinctKernel(day_time_interval(), func); AddCountDistinctKernel(month_day_nano_interval(), func); // Binary & String - 
AddCountDistinctKernel(match::BinaryLike(), func); - AddCountDistinctKernel(match::LargeBinaryLike(), - func); + AddCountDistinctKernel(match::BinaryLike(), func); + AddCountDistinctKernel(match::LargeBinaryLike(), + func); // Fixed binary & Decimal - AddCountDistinctKernel( + AddCountDistinctKernel( match::FixedSizeBinaryLike(), func); } @@ -516,7 +516,7 @@ struct BooleanAnyImpl : public ScalarAggregator { Result> AnyInit(KernelContext*, const KernelInitArgs& args) { const ScalarAggregateOptions options = static_cast(*args.options); - return ::arrow::internal::make_unique( + return std::make_unique( static_cast(*args.options)); } @@ -586,7 +586,7 @@ struct BooleanAllImpl : public ScalarAggregator { }; Result> AllInit(KernelContext*, const KernelInitArgs& args) { - return ::arrow::internal::make_unique( + return std::make_unique( static_cast(*args.options)); } diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index bd2fe534608..aa89f8dc3b4 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -360,7 +360,7 @@ struct MinMaxState> { return *this; } - void MergeOne(util::string_view value) { + void MergeOne(std::string_view value) { MergeOne(T(reinterpret_cast(value.data()))); } @@ -398,14 +398,14 @@ struct MinMaxStatemin = std::string(value); this->max = std::string(value); } else { - if (value < util::string_view(this->min)) { + if (value < std::string_view(this->min)) { this->min = std::string(value); - } else if (value > util::string_view(this->max)) { + } else if (value > std::string_view(this->max)) { this->max = std::string(value); } } diff --git a/cpp/src/arrow/compute/kernels/aggregate_internal.h b/cpp/src/arrow/compute/kernels/aggregate_internal.h index 8db74bfe0cd..8fd67485d7f 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_internal.h @@ -164,7 +164,8 @@ enable_if_t::value, SumType> SumArray( // reduce summation of one block (may be smaller than kBlockSize) from leaf node // continue reducing to upper level if two summations are ready for non-leaf node - auto reduce = [&](SumType block_sum) { + // (capture `levels` by value because of ARROW-17567) + auto reduce = [&, levels](SumType block_sum) { int cur_level = 0; uint64_t cur_level_mask = 1ULL; sum[cur_level] += block_sum; diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 8f400b2d249..c7ae70e2108 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -942,12 +942,12 @@ class TestCountDistinctKernel : public ::testing::Test { CheckScalar("count_distinct", {input}, Expected(expected_all), &all); } - void Check(const std::shared_ptr& type, util::string_view json, + void Check(const std::shared_ptr& type, std::string_view json, int64_t expected_all, bool has_nulls = true) { Check(ArrayFromJSON(type, json), expected_all, has_nulls); } - void Check(const std::shared_ptr& type, util::string_view json) { + void Check(const std::shared_ptr& type, std::string_view json) { auto input = ScalarFromJSON(type, json); auto zero = ResultWith(Expected(0)); auto one = ResultWith(Expected(1)); diff --git a/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h b/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h index f416881ccb8..1cccdca1481 100644 --- 
a/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h +++ b/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h @@ -485,6 +485,14 @@ struct NegateChecked { } }; +struct Exp { + template + static T Call(KernelContext*, Arg exp, Status*) { + static_assert(std::is_same::value, ""); + return std::exp(exp); + } +}; + struct Power { ARROW_NOINLINE static uint64_t IntegerPower(uint64_t base, uint64_t exp) { diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index a6ede14176c..dd40b7ae2b0 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -20,7 +20,9 @@ #include #include #include +#include #include +#include #include #include @@ -45,9 +47,6 @@ #include "arrow/util/decimal.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" -#include "arrow/util/make_unique.h" -#include "arrow/util/optional.h" -#include "arrow/util/string_view.h" #include "arrow/visit_data_inline.h" namespace arrow { @@ -75,7 +74,7 @@ struct OptionsWrapper : public KernelState { static Result> Init(KernelContext* ctx, const KernelInitArgs& args) { if (auto options = static_cast(args.options)) { - return ::arrow::internal::make_unique(*options); + return std::make_unique(*options); } return Status::Invalid( @@ -101,8 +100,7 @@ struct KernelStateFromFunctionOptions : public KernelState { static Result> Init(KernelContext* ctx, const KernelInitArgs& args) { if (auto options = static_cast(args.options)) { - return ::arrow::internal::make_unique(ctx, - *options); + return std::make_unique(ctx, *options); } return Status::Invalid( @@ -136,7 +134,7 @@ struct GetViewType> { template struct GetViewType::value || is_fixed_size_binary_type::value>> { - using T = util::string_view; + using T = std::string_view; using PhysicalType = T; static T LogicalValue(PhysicalType value) { return value; } @@ -145,7 +143,7 @@ struct GetViewType::value || template <> struct GetViewType { using T = Decimal128; - using PhysicalType = util::string_view; + using PhysicalType = std::string_view; static T LogicalValue(PhysicalType value) { return Decimal128(reinterpret_cast(value.data())); @@ -157,7 +155,7 @@ struct GetViewType { template <> struct GetViewType { using T = Decimal256; - using PhysicalType = util::string_view; + using PhysicalType = std::string_view; static T LogicalValue(PhysicalType value) { return Decimal256(reinterpret_cast(value.data())); @@ -271,9 +269,9 @@ struct ArrayIterator> { data(reinterpret_cast(arr.buffers[2].data)), position(0) {} - util::string_view operator()() { + std::string_view operator()() { offset_type next_offset = offsets[++position]; - auto result = util::string_view(data + cur_offset, next_offset - cur_offset); + auto result = std::string_view(data + cur_offset, next_offset - cur_offset); cur_offset = next_offset; return result; } @@ -292,8 +290,8 @@ struct ArrayIterator { width(arr.type->byte_width()), position(arr.offset) {} - util::string_view operator()() { - auto result = util::string_view(data + position * width, width); + std::string_view operator()() { + auto result = std::string_view(data + position * width, width); position++; return result; } @@ -331,7 +329,7 @@ template struct UnboxScalar> { using T = typename Type::c_type; static T Unbox(const Scalar& val) { - util::string_view view = + std::string_view view = checked_cast(val).view(); DCHECK_EQ(view.size(), sizeof(T)); return *reinterpret_cast(view.data()); @@ -340,9 +338,9 @@ struct UnboxScalar> { 
template struct UnboxScalar> { - using T = util::string_view; + using T = std::string_view; static T Unbox(const Scalar& val) { - if (!val.is_valid) return util::string_view(); + if (!val.is_valid) return std::string_view(); return checked_cast(val).view(); } }; @@ -401,7 +399,7 @@ struct BoxScalar { }; // A VisitArraySpanInline variant that calls its visitor function with logical -// values, such as Decimal128 rather than util::string_view. +// values, such as Decimal128 rather than std::string_view. template static typename ::arrow::internal::call_traits::enable_if_return::type diff --git a/cpp/src/arrow/compute/kernels/common.h b/cpp/src/arrow/compute/kernels/common.h index 21244320f38..bf90d114512 100644 --- a/cpp/src/arrow/compute/kernels/common.h +++ b/cpp/src/arrow/compute/kernels/common.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -42,7 +43,6 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" -#include "arrow/util/string_view.h" // IWYU pragma: end_exports diff --git a/cpp/src/arrow/compute/kernels/copy_data_internal.h b/cpp/src/arrow/compute/kernels/copy_data_internal.h index 2e13563980c..a4083e7e065 100644 --- a/cpp/src/arrow/compute/kernels/copy_data_internal.h +++ b/cpp/src/arrow/compute/kernels/copy_data_internal.h @@ -58,7 +58,7 @@ struct CopyDataUtils { if (!scalar.is_valid) { std::memset(begin, 0x00, width * length); } else { - const util::string_view buffer = scalar.view(); + const std::string_view buffer = scalar.view(); DCHECK_GE(buffer.size(), static_cast(width)); for (int i = 0; i < length; i++) { std::memcpy(begin, buffer.data(), width); diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 4537c32eb38..fe2b4af2059 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -49,7 +49,6 @@ #include "arrow/util/cpu_info.h" #include "arrow/util/int128_internal.h" #include "arrow/util/int_util_overflow.h" -#include "arrow/util/make_unique.h" #include "arrow/util/task_group.h" #include "arrow/util/tdigest.h" #include "arrow/util/thread_pool.h" @@ -84,7 +83,7 @@ struct GroupedAggregator : KernelState { template Result> HashAggregateInit(KernelContext* ctx, const KernelInitArgs& args) { - auto impl = ::arrow::internal::make_unique(); + auto impl = std::make_unique(); RETURN_NOT_OK(impl->Init(ctx->exec_context(), args)); return std::move(impl); } @@ -972,7 +971,7 @@ struct GroupedVarStdImpl : public GroupedAggregator { template Result> VarStdInit(KernelContext* ctx, const KernelInitArgs& args) { - auto impl = ::arrow::internal::make_unique>(); + auto impl = std::make_unique>(); impl->result_type_ = result_type; RETURN_NOT_OK(impl->Init(ctx->exec_context(), args)); return std::move(impl); @@ -1373,7 +1372,7 @@ struct GroupedMinMaxImpl( batch, - [&](uint32_t g, util::string_view val) { + [&](uint32_t g, std::string_view val) { if (!mins_[g] || val < *mins_[g]) { mins_[g].emplace(val.data(), val.size(), allocator_); } @@ -1435,7 +1434,7 @@ struct GroupedMinMaxImpl enable_if_base_binary MakeOffsetsValues( - ArrayData* array, const std::vector>& values) { + ArrayData* array, const std::vector>& values) { using offset_type = typename T::offset_type; ARROW_ASSIGN_OR_RAISE( auto raw_offsets, @@ -1447,7 +1446,7 @@ struct GroupedMinMaxImpl& value = values[i]; + const std::optional& value = values[i]; DCHECK(value.has_value()); if (value->size() > 
static_cast(std::numeric_limits::max()) || @@ -1463,7 +1462,7 @@ struct GroupedMinMaxImpl& value = values[i]; + const std::optional& value = values[i]; DCHECK(value.has_value()); std::memcpy(data->mutable_data() + offset, value->data(), value->size()); offset += value->size(); @@ -1476,7 +1475,7 @@ struct GroupedMinMaxImpl enable_if_same MakeOffsetsValues( - ArrayData* array, const std::vector>& values) { + ArrayData* array, const std::vector>& values) { const uint8_t* null_bitmap = array->buffers[0]->data(); const int32_t slot_width = checked_cast(*array->type).byte_width(); @@ -1485,7 +1484,7 @@ struct GroupedMinMaxImpl& value = values[i]; + const std::optional& value = values[i]; DCHECK(value.has_value()); std::memcpy(data->mutable_data() + offset, value->data(), slot_width); } else { @@ -1504,7 +1503,7 @@ struct GroupedMinMaxImpl> mins_, maxes_; + std::vector> mins_, maxes_; TypedBufferBuilder has_values_, has_nulls_; std::shared_ptr type_; ScalarAggregateOptions options_; @@ -2092,7 +2091,7 @@ struct GroupedOneImpl::value || Status Consume(const ExecSpan& batch) override { return VisitGroupedValues( batch, - [&](uint32_t g, util::string_view val) -> Status { + [&](uint32_t g, std::string_view val) -> Status { if (!bit_util::GetBit(has_one_.data(), g)) { ones_[g].emplace(val.data(), val.size(), allocator_); bit_util::SetBit(has_one_.mutable_data(), g); @@ -2128,7 +2127,7 @@ struct GroupedOneImpl::value || template enable_if_base_binary MakeOffsetsValues( - ArrayData* array, const std::vector>& values) { + ArrayData* array, const std::vector>& values) { using offset_type = typename T::offset_type; ARROW_ASSIGN_OR_RAISE( auto raw_offsets, @@ -2140,7 +2139,7 @@ struct GroupedOneImpl::value || offset_type total_length = 0; for (size_t i = 0; i < values.size(); i++) { if (bit_util::GetBit(null_bitmap, i)) { - const util::optional& value = values[i]; + const std::optional& value = values[i]; DCHECK(value.has_value()); if (value->size() > static_cast(std::numeric_limits::max()) || @@ -2156,7 +2155,7 @@ struct GroupedOneImpl::value || int64_t offset = 0; for (size_t i = 0; i < values.size(); i++) { if (bit_util::GetBit(null_bitmap, i)) { - const util::optional& value = values[i]; + const std::optional& value = values[i]; DCHECK(value.has_value()); std::memcpy(data->mutable_data() + offset, value->data(), value->size()); offset += value->size(); @@ -2169,7 +2168,7 @@ struct GroupedOneImpl::value || template enable_if_same MakeOffsetsValues( - ArrayData* array, const std::vector>& values) { + ArrayData* array, const std::vector>& values) { const uint8_t* null_bitmap = array->buffers[0]->data(); const int32_t slot_width = checked_cast(*array->type).byte_width(); @@ -2178,7 +2177,7 @@ struct GroupedOneImpl::value || int64_t offset = 0; for (size_t i = 0; i < values.size(); i++) { if (bit_util::GetBit(null_bitmap, i)) { - const util::optional& value = values[i]; + const std::optional& value = values[i]; DCHECK(value.has_value()); std::memcpy(data->mutable_data() + offset, value->data(), slot_width); } else { @@ -2195,7 +2194,7 @@ struct GroupedOneImpl::value || ExecContext* ctx_; Allocator allocator_; int64_t num_groups_; - std::vector> ones_; + std::vector> ones_; TypedBufferBuilder has_one_; std::shared_ptr out_type_; }; @@ -2419,7 +2418,7 @@ struct GroupedListImpl::value || num_args_ += num_values; return VisitGroupedValues( batch, - [&](uint32_t group, util::string_view val) -> Status { + [&](uint32_t group, std::string_view val) -> Status { values_.emplace_back(StringType(val.data(), 
val.size(), allocator_)); return Status::OK(); }, @@ -2467,7 +2466,7 @@ struct GroupedListImpl::value || template enable_if_base_binary MakeOffsetsValues( - ArrayData* array, const std::vector>& values) { + ArrayData* array, const std::vector>& values) { using offset_type = typename T::offset_type; ARROW_ASSIGN_OR_RAISE( auto raw_offsets, @@ -2479,7 +2478,7 @@ struct GroupedListImpl::value || offset_type total_length = 0; for (size_t i = 0; i < values.size(); i++) { if (bit_util::GetBit(null_bitmap, i)) { - const util::optional& value = values[i]; + const std::optional& value = values[i]; DCHECK(value.has_value()); if (value->size() > static_cast(std::numeric_limits::max()) || @@ -2495,7 +2494,7 @@ struct GroupedListImpl::value || int64_t offset = 0; for (size_t i = 0; i < values.size(); i++) { if (bit_util::GetBit(null_bitmap, i)) { - const util::optional& value = values[i]; + const std::optional& value = values[i]; DCHECK(value.has_value()); std::memcpy(data->mutable_data() + offset, value->data(), value->size()); offset += value->size(); @@ -2508,7 +2507,7 @@ struct GroupedListImpl::value || template enable_if_same MakeOffsetsValues( - ArrayData* array, const std::vector>& values) { + ArrayData* array, const std::vector>& values) { const uint8_t* null_bitmap = array->buffers[0]->data(); const int32_t slot_width = checked_cast(*array->type).byte_width(); @@ -2517,7 +2516,7 @@ struct GroupedListImpl::value || int64_t offset = 0; for (size_t i = 0; i < values.size(); i++) { if (bit_util::GetBit(null_bitmap, i)) { - const util::optional& value = values[i]; + const std::optional& value = values[i]; DCHECK(value.has_value()); std::memcpy(data->mutable_data() + offset, value->data(), slot_width); } else { @@ -2534,7 +2533,7 @@ struct GroupedListImpl::value || ExecContext* ctx_; Allocator allocator_; int64_t num_groups_, num_args_ = 0; - std::vector> values_; + std::vector> values_; TypedBufferBuilder groups_; TypedBufferBuilder values_bitmap_; std::shared_ptr out_type_; diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc index f599f9abb60..50d8cd49aba 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc @@ -124,14 +124,14 @@ Result NaiveGroupBy(std::vector arguments, std::vector keys Result GroupByUsingExecPlan(const BatchesWithSchema& input, const std::vector& key_names, const std::vector& aggregates, - bool use_threads, ExecContext* ctx) { + bool use_threads) { std::vector keys(key_names.size()); for (size_t i = 0; i < key_names.size(); ++i) { keys[i] = FieldRef(key_names[i]); } - ARROW_ASSIGN_OR_RAISE(auto plan, ExecPlan::Make(ctx)); - AsyncGenerator> sink_gen; + ARROW_ASSIGN_OR_RAISE(auto plan, ExecPlan::Make(*threaded_exec_context())); + AsyncGenerator> sink_gen; RETURN_NOT_OK( Declaration::Sequence( { @@ -148,11 +148,13 @@ Result GroupByUsingExecPlan(const BatchesWithSchema& input, auto collected_fut = CollectAsyncGenerator(sink_gen); auto start_and_collect = - AllComplete({plan->finished(), Future<>(collected_fut)}) + AllFinished({plan->finished(), Future<>(collected_fut)}) .Then([collected_fut]() -> Result> { ARROW_ASSIGN_OR_RAISE(auto collected, collected_fut.result()); return ::arrow::internal::MapVector( - [](util::optional batch) { return std::move(*batch); }, + [](std::optional batch) { + return batch.value_or(ExecBatch()); + }, std::move(collected)); }); @@ -176,14 +178,37 @@ Result GroupByUsingExecPlan(const BatchesWithSchema& input, } } - 
return StructArray::Make(std::move(out_arrays), output_schema->fields()); + // The exec plan may reorder the output rows. The tests are all set up to expect output + // in ascending order of keys. So we need to sort the result by the key columns. To do + // that we create a table using the key columns, calculate the sort indices from that + // table (sorting on all fields) and then use those indices to calculate our result. + std::vector<std::shared_ptr<Field>> key_fields; + std::vector<std::shared_ptr<Array>> key_columns; + std::vector<SortKey> sort_keys; + for (std::size_t i = 0; i < key_names.size(); i++) { + const std::shared_ptr<Array>& arr = out_arrays[i + aggregates.size()]; + key_columns.push_back(arr); + key_fields.push_back(field("name_does_not_matter", arr->type())); + sort_keys.emplace_back(static_cast<int>(i)); + } + std::shared_ptr<Schema> key_schema = schema(std::move(key_fields)); + std::shared_ptr<Table>
key_table = Table::Make(std::move(key_schema), key_columns); + SortOptions sort_options(std::move(sort_keys)); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr sort_indices, + SortIndices(key_table, sort_options)); + + ARROW_ASSIGN_OR_RAISE( + std::shared_ptr struct_arr, + StructArray::Make(std::move(out_arrays), output_schema->fields())); + + return Take(struct_arr, sort_indices); } /// Simpler overload where you can give the columns as datums Result GroupByUsingExecPlan(const std::vector& arguments, const std::vector& keys, const std::vector& aggregates, - bool use_threads, ExecContext* ctx) { + bool use_threads) { using arrow::compute::detail::ExecSpanIterator; FieldVector scan_fields(arguments.size() + keys.size()); @@ -204,7 +229,7 @@ Result GroupByUsingExecPlan(const std::vector& arguments, ExecSpanIterator span_iterator; ARROW_ASSIGN_OR_RAISE(auto batch, ExecBatch::Make(inputs)); - RETURN_NOT_OK(span_iterator.Init(batch, ctx->exec_chunksize())); + RETURN_NOT_OK(span_iterator.Init(batch)); BatchesWithSchema input; input.schema = schema(std::move(scan_fields)); ExecSpan span; @@ -213,7 +238,7 @@ Result GroupByUsingExecPlan(const std::vector& arguments, input.batches.push_back(span.ToExecBatch()); } - return GroupByUsingExecPlan(input, key_names, aggregates, use_threads, ctx); + return GroupByUsingExecPlan(input, key_names, aggregates, use_threads); } void ValidateGroupBy(const std::vector& aggregates, @@ -253,8 +278,7 @@ Result GroupByTest(const std::vector& arguments, idx = idx + 1; } if (use_exec_plan) { - return GroupByUsingExecPlan(arguments, keys, internal_aggregates, use_threads, - small_chunksize_context(use_threads)); + return GroupByUsingExecPlan(arguments, keys, internal_aggregates, use_threads); } else { return internal::GroupBy(arguments, keys, internal_aggregates, use_threads, default_exec_context()); @@ -880,7 +904,7 @@ TEST(GroupBy, CountScalar) { {"hash_count", keep_nulls, "argument", "hash_count"}, {"hash_count", count_all, "argument", "hash_count"}, }, - use_threads, default_exec_context())); + use_threads)); Datum expected = ArrayFromJSON(struct_({ field("hash_count", int64()), field("hash_count", int64()), @@ -1093,7 +1117,7 @@ TEST(GroupBy, SumMeanProductScalar) { {"hash_mean", nullptr, "argument", "hash_mean"}, {"hash_product", nullptr, "argument", "hash_product"}, }, - use_threads, default_exec_context())); + use_threads)); Datum expected = ArrayFromJSON(struct_({ field("hash_sum", int64()), field("hash_mean", float64()), @@ -1490,7 +1514,7 @@ TEST(GroupBy, StddevVarianceTDigestScalar) { {"hash_variance", nullptr, "argument1", "hash_variance"}, {"hash_tdigest", nullptr, "argument1", "hash_tdigest"}, }, - use_threads, default_exec_context())); + use_threads)); Datum expected = ArrayFromJSON(struct_({ field("hash_stddev", float64()), @@ -1552,7 +1576,7 @@ TEST(GroupBy, VarianceOptions) { {"hash_variance", min_count, "argument", "hash_variance"}, {"hash_variance", keep_nulls_min_count, "argument", "hash_variance"}, }, - use_threads, default_exec_context())); + use_threads)); Datum expected = ArrayFromJSON(struct_({ field("hash_stddev", float64()), field("hash_stddev", float64()), @@ -1583,7 +1607,7 @@ TEST(GroupBy, VarianceOptions) { {"hash_variance", min_count, "argument1", "hash_variance"}, {"hash_variance", keep_nulls_min_count, "argument1", "hash_variance"}, }, - use_threads, default_exec_context())); + use_threads)); expected = ArrayFromJSON(struct_({ field("hash_stddev", float64()), field("hash_stddev", float64()), @@ -2012,7 +2036,7 @@ TEST(GroupBy, MinMaxScalar) { 
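// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the sort-indices-then-take
// pattern used by GroupByUsingExecPlan above to put exec-plan group-by output
// into a deterministic key order. The helper name and its parameters are
// hypothetical.
Result<Datum> SortStructResultByKeys(const std::shared_ptr<Table>& key_table,
                                     const std::shared_ptr<StructArray>& result) {
  // Sort on every key column, in ascending order.
  std::vector<SortKey> sort_keys;
  for (int i = 0; i < key_table->num_columns(); i++) {
    sort_keys.emplace_back(i);
  }
  SortOptions sort_options(std::move(sort_keys));
  // Compute the permutation that sorts the key columns...
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> sort_indices,
                        SortIndices(Datum(key_table), sort_options));
  // ...and apply that permutation to the aggregated result.
  return Take(Datum(result), Datum(sort_indices));
}
// ---------------------------------------------------------------------------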
Datum actual, GroupByUsingExecPlan(input, {"key"}, {{"hash_min_max", nullptr, "argument", "hash_min_max"}}, - use_threads, default_exec_context())); + use_threads)); Datum expected = ArrayFromJSON(struct_({ field("hash_min_max", @@ -2142,7 +2166,7 @@ TEST(GroupBy, AnyAllScalar) { {"hash_any", keep_nulls, "argument", "hash_any"}, {"hash_all", keep_nulls, "argument", "hash_all"}, }, - use_threads, default_exec_context())); + use_threads)); Datum expected = ArrayFromJSON(struct_({ field("hash_any", boolean()), field("hash_all", boolean()), @@ -2763,7 +2787,7 @@ TEST(GroupBy, OneScalar) { ASSERT_OK_AND_ASSIGN( Datum actual, GroupByUsingExecPlan( input, {"key"}, {{"hash_one", nullptr, "argument", "hash_one"}}, - use_threads, default_exec_context())); + use_threads)); const auto& struct_arr = actual.array_as(); // Check the key column diff --git a/cpp/src/arrow/compute/kernels/row_encoder.cc b/cpp/src/arrow/compute/kernels/row_encoder.cc index beff3436100..f553708cca5 100644 --- a/cpp/src/arrow/compute/kernels/row_encoder.cc +++ b/cpp/src/arrow/compute/kernels/row_encoder.cc @@ -19,7 +19,8 @@ #include "arrow/util/bitmap_writer.h" #include "arrow/util/logging.h" -#include "arrow/util/make_unique.h" + +#include namespace arrow { @@ -145,7 +146,7 @@ Status FixedWidthKeyEncoder::Encode(const ExecValue& data, int64_t batch_length, viewed.type = view_ty.get(); VisitArraySpanInline( viewed, - [&](util::string_view bytes) { + [&](std::string_view bytes) { auto& encoded_ptr = *encoded_bytes++; *encoded_ptr++ = kValidByte; memcpy(encoded_ptr, bytes.data(), byte_width_); @@ -160,7 +161,7 @@ Status FixedWidthKeyEncoder::Encode(const ExecValue& data, int64_t batch_length, } else { const auto& scalar = data.scalar_as(); if (scalar.is_valid) { - const util::string_view data = scalar.view(); + const std::string_view data = scalar.view(); DCHECK_EQ(data.size(), static_cast(byte_width_)); for (int64_t i = 0; i < batch_length; i++) { auto& encoded_ptr = *encoded_bytes++; @@ -257,9 +258,20 @@ Result> DictionaryKeyEncoder::Decode(uint8_t** encode void RowEncoder::Init(const std::vector& column_types, ExecContext* ctx) { ctx_ = ctx; encoders_.resize(column_types.size()); + extension_types_.resize(column_types.size()); for (size_t i = 0; i < column_types.size(); ++i) { - const TypeHolder& type = column_types[i]; + const bool is_extension = column_types[i].id() == Type::EXTENSION; + const TypeHolder& type = is_extension + ? 
arrow::internal::checked_pointer_cast( + column_types[i].GetSharedPtr()) + ->storage_type() + : column_types[i]; + + if (is_extension) { + extension_types_[i] = arrow::internal::checked_pointer_cast( + column_types[i].GetSharedPtr()); + } if (type.id() == Type::BOOL) { encoders_[i] = std::make_shared(); continue; @@ -354,9 +366,16 @@ Result RowEncoder::Decode(int64_t num_rows, const int32_t* row_ids) { out.values.resize(encoders_.size()); for (size_t i = 0; i < encoders_.size(); ++i) { ARROW_ASSIGN_OR_RAISE( - out.values[i], + auto column_array_data, encoders_[i]->Decode(buf_ptrs.data(), static_cast(num_rows), ctx_->memory_pool())); + + if (extension_types_[i] != nullptr) { + ARROW_ASSIGN_OR_RAISE(out.values[i], ::arrow::internal::GetArrayView( + column_array_data, extension_types_[i])) + } else { + out.values[i] = column_array_data; + } } return out; diff --git a/cpp/src/arrow/compute/kernels/row_encoder.h b/cpp/src/arrow/compute/kernels/row_encoder.h index 57240172488..5fe80e0f506 100644 --- a/cpp/src/arrow/compute/kernels/row_encoder.h +++ b/cpp/src/arrow/compute/kernels/row_encoder.h @@ -121,7 +121,7 @@ struct VarLengthKeyEncoder : KeyEncoder { int64_t i = 0; VisitArraySpanInline( data.array, - [&](util::string_view bytes) { + [&](std::string_view bytes) { lengths[i++] += kExtraByteForNull + sizeof(Offset) + static_cast(bytes.size()); }, @@ -146,7 +146,7 @@ struct VarLengthKeyEncoder : KeyEncoder { if (data.is_array()) { VisitArraySpanInline( data.array, - [&](util::string_view bytes) { + [&](std::string_view bytes) { auto& encoded_ptr = *encoded_bytes++; *encoded_ptr++ = kValidByte; util::SafeStore(encoded_ptr, static_cast(bytes.size())); @@ -280,6 +280,7 @@ class ARROW_EXPORT RowEncoder { std::vector offsets_; std::vector bytes_; std::vector encoded_nulls_; + std::vector> extension_types_; }; } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc index 984c3b56538..4de7755ef07 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc @@ -56,32 +56,6 @@ using applicator::ScalarUnaryNotNullStateful; namespace { -// Convenience visitor to detect if a numeric Scalar is positive. -struct IsPositiveVisitor { - bool result = false; - - template - Status Visit(const NumericScalar& scalar) { - result = scalar.value > 0; - return Status::OK(); - } - template - Status Visit(const DecimalScalar& scalar) { - result = scalar.value > 0; - return Status::OK(); - } - Status Visit(const Scalar& scalar) { return Status::OK(); } -}; - -bool IsPositive(const Scalar& scalar) { - IsPositiveVisitor visitor{}; - std::ignore = VisitScalarInline(scalar, &visitor); - return visitor.result; -} - -// N.B. take care not to conflict with type_traits.h as that can cause surprises in a -// unity build - // Bitwise operations struct BitWiseNot { @@ -452,556 +426,6 @@ struct LogbChecked { } }; -struct RoundUtil { - // Calculate powers of ten with arbitrary integer exponent - template - static enable_if_floating_value Pow10(int64_t power) { - static constexpr T lut[] = {1e0F, 1e1F, 1e2F, 1e3F, 1e4F, 1e5F, 1e6F, 1e7F, - 1e8F, 1e9F, 1e10F, 1e11F, 1e12F, 1e13F, 1e14F, 1e15F}; - int64_t lut_size = (sizeof(lut) / sizeof(*lut)); - int64_t abs_power = std::abs(power); - auto pow10 = lut[std::min(abs_power, lut_size - 1)]; - while (abs_power-- >= lut_size) { - pow10 *= 1e1F; - } - return (power >= 0) ? 
pow10 : (1 / pow10); - } -}; - -// Specializations of rounding implementations for round kernels -template -struct RoundImpl; - -template -struct RoundImpl { - template - static constexpr enable_if_floating_value Round(const T val) { - return std::floor(val); - } - - template - static enable_if_decimal_value Round(T* val, const T& remainder, - const T& pow10, const int32_t scale) { - (*val) -= remainder; - if (remainder.Sign() < 0) { - (*val) -= pow10; - } - } -}; - -template -struct RoundImpl { - template - static constexpr enable_if_floating_value Round(const T val) { - return std::ceil(val); - } - - template - static enable_if_decimal_value Round(T* val, const T& remainder, - const T& pow10, const int32_t scale) { - (*val) -= remainder; - if (remainder.Sign() > 0 && remainder != 0) { - (*val) += pow10; - } - } -}; - -template -struct RoundImpl { - template - static constexpr enable_if_floating_value Round(const T val) { - return std::trunc(val); - } - - template - static enable_if_decimal_value Round(T* val, const T& remainder, - const T& pow10, const int32_t scale) { - (*val) -= remainder; - } -}; - -template -struct RoundImpl { - template - static constexpr enable_if_floating_value Round(const T val) { - return std::signbit(val) ? std::floor(val) : std::ceil(val); - } - - template - static enable_if_decimal_value Round(T* val, const T& remainder, - const T& pow10, const int32_t scale) { - (*val) -= remainder; - if (remainder.Sign() < 0) { - (*val) -= pow10; - } else if (remainder.Sign() > 0 && remainder != 0) { - (*val) += pow10; - } - } -}; - -// NOTE: RoundImpl variants for the HALF_* rounding modes are only -// invoked when the fractional part is equal to 0.5 (std::round is invoked -// otherwise). - -template -struct RoundImpl { - template - static constexpr enable_if_floating_value Round(const T val) { - return RoundImpl::Round(val); - } - - template - static enable_if_decimal_value Round(T* val, const T& remainder, - const T& pow10, const int32_t scale) { - RoundImpl::Round(val, remainder, pow10, scale); - } -}; - -template -struct RoundImpl { - template - static constexpr enable_if_floating_value Round(const T val) { - return RoundImpl::Round(val); - } - - template - static enable_if_decimal_value Round(T* val, const T& remainder, - const T& pow10, const int32_t scale) { - RoundImpl::Round(val, remainder, pow10, scale); - } -}; - -template -struct RoundImpl { - template - static constexpr enable_if_floating_value Round(const T val) { - return RoundImpl::Round(val); - } - - template - static enable_if_decimal_value Round(T* val, const T& remainder, - const T& pow10, const int32_t scale) { - RoundImpl::Round(val, remainder, pow10, scale); - } -}; - -template -struct RoundImpl { - template - static constexpr enable_if_floating_value Round(const T val) { - return RoundImpl::Round(val); - } - - template - static enable_if_decimal_value Round(T* val, const T& remainder, - const T& pow10, const int32_t scale) { - RoundImpl::Round(val, remainder, pow10, scale); - } -}; - -template -struct RoundImpl { - template - static constexpr enable_if_floating_value Round(const T val) { - return std::round(val * T(0.5)) * 2; - } - - template - static enable_if_decimal_value Round(T* val, const T& remainder, - const T& pow10, const int32_t scale) { - auto scaled = val->ReduceScaleBy(scale, /*round=*/false); - if (scaled.low_bits() % 2 != 0) { - scaled += remainder.Sign() >= 0 ? 
1 : -1; - } - *val = scaled.IncreaseScaleBy(scale); - } -}; - -template -struct RoundImpl { - template - static constexpr enable_if_floating_value Round(const T val) { - return std::floor(val * T(0.5)) + std::ceil(val * T(0.5)); - } - - template - static enable_if_decimal_value Round(T* val, const T& remainder, - const T& pow10, const int32_t scale) { - auto scaled = val->ReduceScaleBy(scale, /*round=*/false); - if (scaled.low_bits() % 2 == 0) { - scaled += remainder.Sign() ? 1 : -1; - } - *val = scaled.IncreaseScaleBy(scale); - } -}; - -// Specializations of kernel state for round kernels -template -struct RoundOptionsWrapper; - -template <> -struct RoundOptionsWrapper : public OptionsWrapper { - using OptionsType = RoundOptions; - double pow10; - - explicit RoundOptionsWrapper(OptionsType options) : OptionsWrapper(std::move(options)) { - // Only positive exponents for powers of 10 are used because combining - // multiply and division operations produced more stable rounding than - // using multiply-only. Refer to NumPy's round implementation: - // https://github.com/numpy/numpy/blob/7b2f20b406d27364c812f7a81a9c901afbd3600c/numpy/core/src/multiarray/calculation.c#L589 - pow10 = RoundUtil::Pow10(std::abs(options.ndigits)); - } - - static Result> Init(KernelContext* ctx, - const KernelInitArgs& args) { - if (auto options = static_cast(args.options)) { - return ::arrow::internal::make_unique(*options); - } - return Status::Invalid( - "Attempted to initialize KernelState from null FunctionOptions"); - } -}; - -template <> -struct RoundOptionsWrapper - : public OptionsWrapper { - using OptionsType = RoundToMultipleOptions; - using OptionsWrapper::OptionsWrapper; - - static Result> Init(KernelContext* ctx, - const KernelInitArgs& args) { - auto options = static_cast(args.options); - if (!options) { - return Status::Invalid( - "Attempted to initialize KernelState from null FunctionOptions"); - } - - const auto& multiple = options->multiple; - if (!multiple || !multiple->is_valid) { - return Status::Invalid("Rounding multiple must be non-null and valid"); - } - - if (!IsPositive(*multiple)) { - return Status::Invalid("Rounding multiple must be positive"); - } - - // Ensure the rounding multiple option matches the kernel's output type. - // The output type is not available here so we use the following rule: - // If `multiple` is neither a floating-point nor a decimal type, then - // cast to float64, else cast to the kernel's input type. - std::shared_ptr to_type = - (!is_floating(multiple->type->id()) && !is_decimal(multiple->type->id())) - ? float64() - : args.inputs[0].GetSharedPtr(); - if (!multiple->type->Equals(to_type)) { - ARROW_ASSIGN_OR_RAISE( - auto casted_multiple, - Cast(Datum(multiple), to_type, CastOptions::Safe(), ctx->exec_context())); - - // Create a new option object if the rounding multiple was casted. - auto new_options = OptionsType(casted_multiple.scalar(), options->round_mode); - return ::arrow::internal::make_unique(new_options); - } - - return ::arrow::internal::make_unique(*options); - } -}; - -template -struct Round { - using CType = typename TypeTraits::CType; - using State = RoundOptionsWrapper; - - CType pow10; - int64_t ndigits; - - explicit Round(const State& state, const DataType& out_ty) - : pow10(static_cast(state.pow10)), ndigits(state.options.ndigits) {} - - template ::CType> - enable_if_floating_value Call(KernelContext* ctx, CType arg, Status* st) const { - // Do not process Inf or NaN because they will trigger the overflow error at end of - // function. 
-    if (!std::isfinite(arg)) {
-      return arg;
-    }
-    auto round_val = ndigits >= 0 ? (arg * pow10) : (arg / pow10);
-    auto frac = round_val - std::floor(round_val);
-    if (frac != T(0)) {
-      // Use std::round() if in tie-breaking mode and scaled value is not 0.5.
-      if ((RndMode >= RoundMode::HALF_DOWN) && (frac != T(0.5))) {
-        round_val = std::round(round_val);
-      } else {
-        round_val = RoundImpl<CType, RndMode>::Round(round_val);
-      }
-      // Equality check is omitted so that the common case of 10^0 (integer rounding)
-      // uses multiply-only
-      round_val = ndigits > 0 ? (round_val / pow10) : (round_val * pow10);
-      if (!std::isfinite(round_val)) {
-        *st = Status::Invalid("overflow occurred during rounding");
-        return arg;
-      }
-    } else {
-      // If scaled value is an integer, then no rounding is needed.
-      round_val = arg;
-    }
-    return round_val;
-  }
-};
-
-template <typename ArrowType, RoundMode kRoundMode>
-struct Round<ArrowType, kRoundMode, enable_if_decimal<ArrowType>> {
-  using CType = typename TypeTraits<ArrowType>::CType;
-  using State = RoundOptionsWrapper<RoundOptions>;
-
-  const ArrowType& ty;
-  int64_t ndigits;
-  int32_t pow;
-  // pow10 is "1" for the given decimal scale. Similarly half_pow10 is "0.5".
-  CType pow10, half_pow10, neg_half_pow10;
-
-  explicit Round(const State& state, const DataType& out_ty)
-      : Round(state.options.ndigits, out_ty) {}
-
-  explicit Round(int64_t ndigits, const DataType& out_ty)
-      : ty(checked_cast<const ArrowType&>(out_ty)),
-        ndigits(ndigits),
-        pow(static_cast<int32_t>(ty.scale() - ndigits)) {
-    if (pow >= ty.precision() || pow < 0) {
-      pow10 = half_pow10 = neg_half_pow10 = 0;
-    } else {
-      pow10 = CType::GetScaleMultiplier(pow);
-      half_pow10 = CType::GetHalfScaleMultiplier(pow);
-      neg_half_pow10 = -half_pow10;
-    }
-  }
-
-  template <typename T = ArrowType, typename CType = typename TypeTraits<T>::CType>
-  enable_if_decimal_value<CType, CType> Call(KernelContext* ctx, CType arg, Status* st) const {
-    if (pow >= ty.precision()) {
-      *st = Status::Invalid("Rounding to ", ndigits,
-                            " digits will not fit in precision of ", ty);
-      return 0;
-    } else if (pow < 0) {
-      // no-op, copy output to input
-      return arg;
-    }
-
-    std::pair<CType, CType> pair;
-    *st = arg.Divide(pow10).Value(&pair);
-    if (!st->ok()) return arg;
-    // The remainder is effectively the scaled fractional part after division.
-    const auto& remainder = pair.second;
-    if (remainder == 0) return arg;
-    if (kRoundMode >= RoundMode::HALF_DOWN) {
-      if (remainder == half_pow10 || remainder == neg_half_pow10) {
-        // On the halfway point, use tiebreaker
-        RoundImpl<CType, kRoundMode>::Round(&arg, remainder, pow10, pow);
-      } else if (remainder.Sign() >= 0) {
-        // Positive, round up/down
-        arg -= remainder;
-        if (remainder > half_pow10) {
-          arg += pow10;
-        }
-      } else {
-        // Negative, round up/down
-        arg -= remainder;
-        if (remainder < neg_half_pow10) {
-          arg -= pow10;
-        }
-      }
-    } else {
-      RoundImpl<CType, kRoundMode>::Round(&arg, remainder, pow10, pow);
-    }
-    if (!arg.FitsInPrecision(ty.precision())) {
-      *st = Status::Invalid("Rounded value ", arg.ToString(ty.scale()),
-                            " does not fit in precision of ", ty);
-      return 0;
-    }
-    return arg;
-  }
-};
-
-template <typename ArrowType, RoundMode kRoundMode, int64_t kDigits>
-Status FixedRoundDecimalExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
-  using Op = Round<ArrowType, kRoundMode>;
-  return ScalarUnaryNotNullStateful<ArrowType, ArrowType, Op>(
-             Op(kDigits, *out->type()))
-      .Exec(ctx, batch, out);
-}
-
-template <typename ArrowType, RoundMode kRoundMode, typename Enable = void>
-struct RoundToMultiple {
-  using CType = typename TypeTraits<ArrowType>::CType;
-  using State = RoundOptionsWrapper<RoundToMultipleOptions>;
-
-  CType multiple;
-
-  explicit RoundToMultiple(const State& state, const DataType& out_ty)
-      : multiple(UnboxScalar<ArrowType>::Unbox(*state.options.multiple)) {
-    const auto& options = state.options;
-    DCHECK(options.multiple);
-    DCHECK(options.multiple->is_valid);
-    DCHECK(is_floating(options.multiple->type->id()));
-  }
-
-  template <typename T = ArrowType, typename CType = typename TypeTraits<T>::CType>
-  enable_if_floating_value<CType> Call(KernelContext* ctx, CType arg, Status* st) const {
-    // Do not process Inf or NaN because they will trigger the overflow error at end of
-    // function.
-    if (!std::isfinite(arg)) {
-      return arg;
-    }
-    auto round_val = arg / multiple;
-    auto frac = round_val - std::floor(round_val);
-    if (frac != T(0)) {
-      // Use std::round() if in tie-breaking mode and scaled value is not 0.5.
-      if ((kRoundMode >= RoundMode::HALF_DOWN) && (frac != T(0.5))) {
-        round_val = std::round(round_val);
-      } else {
-        round_val = RoundImpl<CType, kRoundMode>::Round(round_val);
-      }
-      round_val *= multiple;
-      if (!std::isfinite(round_val)) {
-        *st = Status::Invalid("overflow occurred during rounding");
-        return arg;
-      }
-    } else {
-      // If scaled value is an integer, then no rounding is needed.
-      round_val = arg;
-    }
-    return round_val;
-  }
-};
-
-template <typename ArrowType, RoundMode kRoundMode>
-struct RoundToMultiple<ArrowType, kRoundMode, enable_if_decimal<ArrowType>> {
-  using CType = typename TypeTraits<ArrowType>::CType;
-  using State = RoundOptionsWrapper<RoundToMultipleOptions>;
-
-  const ArrowType& ty;
-  CType multiple, half_multiple, neg_half_multiple;
-  bool has_halfway_point;
-
-  explicit RoundToMultiple(const State& state, const DataType& out_ty)
-      : ty(checked_cast<const ArrowType&>(out_ty)),
-        multiple(UnboxScalar<ArrowType>::Unbox(*state.options.multiple)),
-        half_multiple(multiple / 2),
-        neg_half_multiple(-half_multiple),
-        has_halfway_point(multiple.low_bits() % 2 == 0) {
-    const auto& options = state.options;
-    DCHECK(options.multiple);
-    DCHECK(options.multiple->is_valid);
-    DCHECK(options.multiple->type->Equals(out_ty));
-  }
-
-  template <typename T = ArrowType, typename CType = typename TypeTraits<T>::CType>
-  enable_if_decimal_value<CType, CType> Call(KernelContext* ctx, CType arg, Status* st) const {
-    std::pair<CType, CType> pair;
-    *st = arg.Divide(multiple).Value(&pair);
-    if (!st->ok()) return arg;
-    const auto& remainder = pair.second;
-    if (remainder == 0) return arg;
-    if (kRoundMode >= RoundMode::HALF_DOWN) {
-      if (has_halfway_point &&
-          (remainder == half_multiple || remainder == neg_half_multiple)) {
-        // On the halfway point, use tiebreaker
-        // Manually implement rounding since we're not actually rounding a
-        // decimal value, but rather manipulating the multiple
-        switch (kRoundMode) {
-          case RoundMode::HALF_DOWN:
-            if (remainder.Sign() < 0) pair.first -= 1;
-            break;
-          case RoundMode::HALF_UP:
-            if (remainder.Sign() >= 0) pair.first += 1;
-            break;
-          case RoundMode::HALF_TOWARDS_ZERO:
-            // Do nothing
-            break;
-          case RoundMode::HALF_TOWARDS_INFINITY:
-            pair.first += remainder.Sign() >= 0 ? 1 : -1;
-            break;
-          case RoundMode::HALF_TO_EVEN:
-            if (pair.first.low_bits() % 2 != 0) {
-              pair.first += remainder.Sign() >= 0 ? 1 : -1;
-            }
-            break;
-          case RoundMode::HALF_TO_ODD:
-            if (pair.first.low_bits() % 2 == 0) {
-              pair.first += remainder.Sign() >= 0 ? 1 : -1;
-            }
-            break;
-          default:
-            DCHECK(false);
-        }
-      } else if (remainder.Sign() >= 0) {
-        // Positive, round up/down
-        if (remainder > half_multiple) {
-          pair.first += 1;
-        }
-      } else {
-        // Negative, round up/down
-        if (remainder < neg_half_multiple) {
-          pair.first -= 1;
-        }
-      }
-    } else {
-      // Manually implement rounding since we're not actually rounding a
-      // decimal value, but rather manipulating the multiple
-      switch (kRoundMode) {
-        case RoundMode::DOWN:
-          if (remainder.Sign() < 0) pair.first -= 1;
-          break;
-        case RoundMode::UP:
-          if (remainder.Sign() >= 0) pair.first += 1;
-          break;
-        case RoundMode::TOWARDS_ZERO:
-          // Do nothing
-          break;
-        case RoundMode::TOWARDS_INFINITY:
-          pair.first += remainder.Sign() >= 0 ? 1 : -1;
-          break;
-        default:
-          DCHECK(false);
-      }
-    }
-    CType round_val = pair.first * multiple;
-    if (!round_val.FitsInPrecision(ty.precision())) {
-      *st = Status::Invalid("Rounded value ", round_val.ToString(ty.scale()),
-                            " does not fit in precision of ", ty);
-      return 0;
-    }
-    return round_val;
-  }
-};
-
-struct Floor {
-  template <typename T, typename Arg>
-  static constexpr enable_if_floating_value<Arg> Call(KernelContext*, Arg arg,
-                                                      Status*) {
-    static_assert(std::is_same<T, Arg>::value, "");
-    return RoundImpl<T, RoundMode::DOWN>::Round(arg);
-  }
-};
-
-struct Ceil {
-  template <typename T, typename Arg>
-  static constexpr enable_if_floating_value<Arg> Call(KernelContext*, Arg arg,
-                                                      Status*) {
-    static_assert(std::is_same<T, Arg>::value, "");
-    return RoundImpl<T, RoundMode::UP>::Round(arg);
-  }
-};
-
-struct Trunc {
-  template <typename T, typename Arg>
-  static constexpr enable_if_floating_value<Arg> Call(KernelContext*, Arg arg,
-                                                      Status*) {
-    static_assert(std::is_same<T, Arg>::value, "");
-    return RoundImpl<T, RoundMode::TOWARDS_ZERO>::Round(arg);
-  }
-};
-
 // Generate a kernel given a bitwise arithmetic functor. Assumes the
 // functor treats all integer types of equal width identically
 template <template <typename... Args> class KernelGenerator, typename Op>