diff --git a/.dockerignore b/.dockerignore
index 083905c7439..eb71138c679 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -55,6 +55,8 @@
 !rust/arrow-flight/Cargo.toml
 !rust/parquet/Cargo.toml
 !rust/parquet/build.rs
+!rust/parquet_derive/Cargo.toml
+!rust/parquet_derive_test/Cargo.toml
 !rust/datafusion/Cargo.toml
 !rust/datafusion/benches
 !rust/integration-testing/Cargo.toml
diff --git a/.env b/.env
index 62f5ae52b71..5c6aaa5182a 100644
--- a/.env
+++ b/.env
@@ -24,13 +24,13 @@ ARCH=amd64
 CUDA=9.1
 DEBIAN=10
 UBUNTU=18.04
-FEDORA=32
+FEDORA=33
 PYTHON=3.6
-LLVM=10
+LLVM=11
 CLANG_TOOLS=8
 RUST=nightly-2020-04-22
 GO=1.12
-NODE=11
+NODE=14
 MAVEN=3.5.4
 JDK=8
 PANDAS=latest
diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml
index 4eda469f49f..b230bce82e1 100644
--- a/.github/workflows/archery.yml
+++ b/.github/workflows/archery.yml
@@ -42,9 +42,10 @@ jobs:
         uses: actions/checkout@v2
         with:
           fetch-depth: 0
-      - name: Fetch Submodules and Tags
+      - name: Git Fixup
+        if: ${{ github.event_name == 'pull_request' }}
         shell: bash
-        run: ci/scripts/util_checkout.sh
+        run: git branch master origin/master
       - name: Free Up Disk Space
         run: ci/scripts/util_cleanup.sh
       - name: Setup Python
diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml
index 340d6bc4719..b55da96beb2 100644
--- a/.github/workflows/comment_bot.yml
+++ b/.github/workflows/comment_bot.yml
@@ -69,7 +69,7 @@ jobs:
           git remote add upstream https://github.com/apache/arrow
           git fetch upstream
           changed() {
-            git diff --name-only HEAD^..upstream/master | grep -e "$1" >/dev/null 2>&1
+            git diff --name-only HEAD..upstream/master | grep -e "$1" >/dev/null 2>&1
           }
           if changed '^r/.*\.R$'; then
             echo "::set-env name=R_DOCS::true"
@@ -84,10 +84,11 @@ jobs:
             echo "::set-env name=CLANG_FORMAT_R::true"
           fi
       - name: Run cmake_format
-        if: false
-        # TODO: make this work https://issues.apache.org/jira/browse/ARROW-8489
-        # if: env.CMAKE_FORMAT == 'true' || endsWith(github.event.comment.body, 'everything')
+        if: env.CMAKE_FORMAT == 'true' || endsWith(github.event.comment.body, 'everything')
         run: |
+          set -ex
+          export PATH=/home/runner/.local/bin:$PATH
+          python3 -m pip install --upgrade pip setuptools wheel
           python3 -m pip install -r dev/archery/requirements-lint.txt
           python3 run-cmake-format.py
       - name: Run clang-format on cpp
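For context on the `changed()` fix above: `git diff A..B` with two dots is simply `git diff A B`, a direct comparison of the two endpoint trees, so moving from `HEAD^..upstream/master` to `HEAD..upstream/master` measures the change set from the PR's HEAD itself rather than from its first parent. A quick illustration of the two forms (a sketch for reference, not part of the patch; the remote name matches the script above):

    # Two-dot form: compare the trees at the two endpoints directly.
    git diff --name-only HEAD..upstream/master    # same as: git diff --name-only HEAD upstream/master
    # The old form compared from HEAD's first parent instead:
    git diff --name-only HEAD^..upstream/master   # same as: git diff --name-only HEAD^ upstream/master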
diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml
index 07cd4a91914..4e3e27016c1 100644
--- a/.github/workflows/cpp.yml
+++ b/.github/workflows/cpp.yml
@@ -102,9 +102,11 @@ jobs:
     # hosted machines
     name: ${{ matrix.title }}
     runs-on: ${{ matrix.runner }}
-    if: github.event_name == 'push'
+    # TODO(kszucs): re-enable once the self-hosted workers are properly
+    # registered to github
+    if: false && github.event_name == 'push'
     defaults:
-      # to use certain environment variables are set by .bashrc an interactive
+      # To use certain environment variables set by .bashrc, an interactive
       # bash shell must be used
       run:
         shell: bash -i {0}
@@ -119,11 +121,12 @@ jobs:
           debian: 10
           title: ARM32v7 Debian 10 C++
           image: |
-            -e CPP_MAKE_PARALLELISM=4 \
+            -e CPP_MAKE_PARALLELISM=2 \
             -e CXXFLAGS=-Wno-psabi \
             -e ARROW_PARQUET=OFF \
             -e ARROW_FLIGHT=OFF \
             -e ARROW_GANDIVA=OFF \
+            -e ARROW_ORC=OFF \
             -e CMAKE_ARGS=-DARROW_CPU_FLAG=armv7 \
             debian-cpp
           arch: 'arm32v7'
@@ -186,23 +189,24 @@ jobs:
     strategy:
       fail-fast: false
     env:
+      ARROW_BUILD_TESTS: ON
+      ARROW_DATASET: ON
+      ARROW_FLIGHT: ON
+      ARROW_GANDIVA: ON
+      ARROW_HDFS: ON
       ARROW_HOME: /usr/local
       ARROW_JEMALLOC: ON
       # TODO(kszucs): link error in the tests
-      ARROW_DATASET: ON
       ARROW_ORC: OFF
-      ARROW_FLIGHT: ON
-      ARROW_HDFS: ON
-      ARROW_PLASMA: ON
-      ARROW_GANDIVA: ON
       ARROW_PARQUET: ON
-      ARROW_WITH_ZLIB: ON
-      ARROW_WITH_LZ4: ON
+      ARROW_PLASMA: ON
+      ARROW_S3: ON
+      ARROW_WITH_BROTLI: ON
       ARROW_WITH_BZ2: ON
-      ARROW_WITH_ZSTD: ON
+      ARROW_WITH_LZ4: ON
       ARROW_WITH_SNAPPY: ON
-      ARROW_WITH_BROTLI: ON
-      ARROW_BUILD_TESTS: ON
+      ARROW_WITH_ZLIB: ON
+      ARROW_WITH_ZSTD: ON
     steps:
       - name: Checkout Arrow
         uses: actions/checkout@v2
@@ -319,6 +323,7 @@ jobs:
       ARROW_JEMALLOC: OFF
       ARROW_PARQUET: ON
       ARROW_PYTHON: ON
+      ARROW_S3: ON
       ARROW_USE_GLOG: OFF
       ARROW_VERBOSE_THIRDPARTY_BUILD: OFF
       ARROW_WITH_BROTLI: ON
@@ -333,7 +338,6 @@ jobs:
       CMAKE_ARGS: >-
         -DARROW_PACKAGE_PREFIX=/mingw${{ matrix.mingw-n-bits }}
         -DBoost_NO_BOOST_CMAKE=ON
-      CMAKE_GENERATOR: MSYS Makefiles
       CMAKE_UNITY_BUILD: ON
     steps:
       - name: Disable Crash Dialogs
@@ -370,6 +374,14 @@ jobs:
         run: |
           export CMAKE_BUILD_PARALLEL_LEVEL=$NUMBER_OF_PROCESSORS
           ci/scripts/cpp_build.sh "$(pwd)" "$(pwd)/build"
+      - name: Download MinIO
+        shell: msys2 {0}
+        run: |
+          mkdir -p /usr/local/bin
+          wget \
+            --output-document /usr/local/bin/minio.exe \
+            https://dl.min.io/server/minio/release/windows-amd64/minio.exe
+          chmod +x /usr/local/bin/minio.exe
       - name: Test
         shell: msys2 {0}
         run: |
diff --git a/.github/workflows/cpp_cron.yml b/.github/workflows/cpp_cron.yml
index 95c5f4b7cbb..5f404b22fe5 100644
--- a/.github/workflows/cpp_cron.yml
+++ b/.github/workflows/cpp_cron.yml
@@ -46,7 +46,7 @@ jobs:
       matrix:
         name:
          - amd64-debian-10-cpp
-          - amd64-fedora-32-cpp
+          - amd64-fedora-33-cpp
          - amd64-ubuntu-16.04-cpp
          - amd64-ubuntu-18.04-cpp
          - amd64-ubuntu-18.04-cpp-cmake32
@@ -55,10 +55,10 @@ jobs:
            image: debian-cpp
            title: AMD64 Debian 10 C++
            debian: 10
-          - name: amd64-fedora-32-cpp
+          - name: amd64-fedora-33-cpp
            image: fedora-cpp
-            title: AMD64 Fedora 32 C++
-            fedora: 32
+            title: AMD64 Fedora 33 C++
+            fedora: 33
          - name: amd64-ubuntu-16.04-cpp
            image: ubuntu-cpp
            title: AMD64 Ubuntu 16.04 C++
@@ -75,7 +75,7 @@ jobs:
       # the defaults here should correspond to the values in .env
       ARCH: 'amd64'
       DEBIAN: ${{ matrix.debian || 10 }}
-      FEDORA: ${{ matrix.fedora || 32 }}
+      FEDORA: ${{ matrix.fedora || 33 }}
       UBUNTU: ${{ matrix.ubuntu || 18.04 }}
     steps:
       - name: Checkout Arrow
diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml
index 59e82021722..c7f5821394d 100644
--- a/.github/workflows/dev.yml
+++ b/.github/workflows/dev.yml
@@ -59,43 +59,6 @@ jobs:
         continue-on-error: true
         run: archery docker push ubuntu-lint
 
-  docs:
-    name: Sphinx and API documentations
-    runs-on: ubuntu-latest
-    if: github.event_name == 'push'
-    steps:
-      - name: Checkout Arrow
-        uses: actions/checkout@v2
-        with:
-          fetch-depth: 0
-      - name: Fetch Submodules and Tags
-        shell: bash
-        run: ci/scripts/util_checkout.sh
-      - name: Free Up Disk Space
-        shell: bash
-        run: ci/scripts/util_cleanup.sh
-      - name: Cache Docker Volumes
-        uses: actions/cache@v1
-        with:
-          path: .docker
-          key: ubuntu-18.04-${{ hashFiles('cpp/**') }}
-          restore-keys: ubuntu-18.04-
-      - name: Setup Python
-        uses: actions/setup-python@v1
-        with:
-          python-version: 3.8
-      - name: Setup Archery
-        run: pip install -e dev/archery[docker]
-      - name: Execute Docker Build
-        run: |
-          sudo sysctl -w kernel.core_pattern="core.%e.%p"
-          ulimit -c unlimited
-          archery docker run ubuntu-docs
-      - name: Docker Push
-        if: success() && github.event_name == 'push' && github.repository == 'apache/arrow'
-        continue-on-error: true
-        run: archery docker push ubuntu-docs
-
   release:
     name: Source Release and Merge Script
     runs-on: ubuntu-latest
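The jobs above (including the removed docs job) drive builds through Archery's Docker integration rather than calling docker-compose directly. A minimal local sketch of the same pattern, reusing the image name, install command, and `-e` flag style that appear elsewhere in this patch (illustrative, not part of the patch):

    pip install -e dev/archery[docker]                      # from an arrow checkout
    archery docker run -e CMAKE_UNITY_BUILD=ON ubuntu-cpp   # build and test one compose image
    archery docker push ubuntu-cpp                          # publish the built image (needs registry credentials)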
diff --git a/.github/workflows/dev_labeler.yml b/.github/workflows/dev_labeler.yml
new file mode 100644
index 00000000000..f5c3a348567
--- /dev/null
+++ b/.github/workflows/dev_labeler.yml
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: PR labeler
+on:
+  pull_request_target:
+    types: [opened, reopened]
+    paths:
+      - 'rust/**'
+
+jobs:
+  assign-rust-labels:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Assign Github labels
+        uses: actions/labeler@2.2.0
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          configuration-path: .github/workflows/dev_labeler/labeler.yml
+          sync-labels: true
diff --git a/.github/workflows/dev_labeler/labeler.yml b/.github/workflows/dev_labeler/labeler.yml
new file mode 100644
index 00000000000..1753caa733f
--- /dev/null
+++ b/.github/workflows/dev_labeler/labeler.yml
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+lang-rust:
+  - rust/**/*
+
+datafusion:
+  - rust/datafusion/**/*
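For reference, `actions/labeler` reads the mapping above, matches each label's glob patterns against the files touched by the pull request, and applies the matching labels; `sync-labels: true` is intended to also remove a previously applied label once no changed file matches its patterns. Covering another crate would be a one-stanza change (a hypothetical example, not part of the patch):

    # hypothetical extra stanza for dev_labeler/labeler.yml
    parquet:
      - rust/parquet/**/*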
diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml
index a6796225e40..385d0210120 100644
--- a/.github/workflows/java.yml
+++ b/.github/workflows/java.yml
@@ -83,7 +83,7 @@ jobs:
       - name: Docker Push
         if: success() && github.event_name == 'push' && github.repository == 'apache/arrow'
         continue-on-error: true
-        run: archery docker push debian-go
+        run: archery docker push debian-java
 
   macos:
     name: AMD64 MacOS 10.15 Java JDK ${{ matrix.jdk }}
diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml
index b12c2290c61..e84a812a078 100644
--- a/.github/workflows/js.yml
+++ b/.github/workflows/js.yml
@@ -40,7 +40,7 @@ env:
 jobs:
   docker:
-    name: AMD64 Debian 10 NodeJS 11
+    name: AMD64 Debian 10 NodeJS 14
     runs-on: ubuntu-latest
     if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
     steps:
@@ -75,7 +75,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        node: [11]
+        node: [14]
     steps:
       - name: Checkout Arrow
         uses: actions/checkout@v2
@@ -103,7 +103,7 @@ jobs:
   #   strategy:
   #     fail-fast: false
   #     matrix:
-  #       node: [11]
+  #       node: [14]
   #   steps:
   #     - name: Checkout Arrow
   #       uses: actions/checkout@v1
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 84c5bb601ee..6b84ed2cdb8 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -50,7 +50,7 @@ jobs:
          - ubuntu-16.04-python-3
          - conda-python-3.8-nopandas
          - conda-python-3.6-pandas-0.23
-          - conda-python-3.6-pandas-latest
+          - conda-python-3.7-pandas-latest
          - centos-python-3.6-manylinux1
        include:
          - name: ubuntu-16.04-python-3
@@ -70,11 +70,11 @@ jobs:
            title: AMD64 Conda Python 3.6 Pandas 0.23
            python: 3.6
            pandas: 0.23
-          - name: conda-python-3.6-pandas-latest
-            cache: conda-python-3.6
+          - name: conda-python-3.7-pandas-latest
+            cache: conda-python-3.7
            image: conda-python-pandas
-            title: AMD64 Conda Python 3.6 Pandas latest
-            python: 3.6
+            title: AMD64 Conda Python 3.7 Pandas latest
+            python: 3.7
            pandas: latest
          - name: centos-python-3.6-manylinux1
            cache: manylinux1
@@ -116,7 +116,7 @@ jobs:
         run: archery docker push ${{ matrix.image }}
 
   macos:
-    name: AMD64 MacOS 10.15 Python 3.7
+    name: AMD64 MacOS 10.15 Python 3
     runs-on: macos-latest
     if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
     env:
@@ -150,8 +150,9 @@ jobs:
           brew update --preinstall
           brew bundle --file=cpp/Brewfile
           brew install coreutils python
-          pip3 install -r python/requirements-build.txt \
-            -r python/requirements-test.txt
+          python3 -mpip install \
+            -r python/requirements-build.txt \
+            -r python/requirements-test.txt
       - name: Build
         shell: bash
         run: |
diff --git a/.github/workflows/python_cron.yml b/.github/workflows/python_cron.yml
index 133a73b59bf..256856bd197 100644
--- a/.github/workflows/python_cron.yml
+++ b/.github/workflows/python_cron.yml
@@ -45,7 +45,7 @@ jobs:
       matrix:
        name:
          - debian-10-python-3
-          - fedora-32-python-3
+          - fedora-33-python-3
          - ubuntu-18.04-python-3
          - conda-python-3.7-dask-latest
          - conda-python-3.7-turbodbc-latest
@@ -59,11 +59,11 @@ jobs:
            image: debian-python
            title: AMD64 Debian 10 Python 3
            debian: 10
-          - name: fedora-32-python-3
-            cache: fedora-32-python-3
+          - name: fedora-33-python-3
+            cache: fedora-33-python-3
            image: fedora-python
-            title: AMD64 Fedora 32 Python 3
-            fedora: 32
+            title: AMD64 Fedora 33 Python 3
+            fedora: 33
          - name: ubuntu-18.04-python-3
            cache: ubuntu-18.04-python-3
            image: ubuntu-python
@@ -102,7 +102,7 @@ jobs:
     env:
       # the defaults here should correspond to the values in .env
       DEBIAN: ${{ matrix.debian || 10 }}
-      FEDORA: ${{ matrix.fedora || 32 }}
+      FEDORA: ${{ matrix.fedora || 33 }}
       UBUNTU: ${{ matrix.ubuntu || 18.04 }}
       PYTHON: ${{ matrix.python || 3.7 }}
       HDFS: ${{ matrix.hdfs || '2.9.2' }}
diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml
index 29ffd444370..37aee196883 100644
--- a/.github/workflows/r.yml
+++ b/.github/workflows/r.yml
@@ -53,7 +53,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        r: ["3.6", "4.0"]
+        r: ["3.6"]
         ubuntu: [18.04]
     env:
       R: ${{ matrix.r }}
@@ -92,21 +92,20 @@ jobs:
         continue-on-error: true
         run: archery docker push ubuntu-r
 
-  rstudio:
-    name: "rstudio/r-base:${{ matrix.r_version }}-${{ matrix.r_image }}"
+  bundled:
+    name: "${{ matrix.config.org }}/${{ matrix.config.image }}:${{ matrix.config.tag }}"
     runs-on: ubuntu-latest
     if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
     strategy:
       fail-fast: false
       matrix:
-        # See https://hub.docker.com/r/rstudio/r-base
-        r_version: ["3.6", "4.0"]
-        r_image:
-          - centos7
+        config:
+          - {org: 'rstudio', image: 'r-base', tag: '4.0-centos7'}
+          - {org: 'rhub', image: 'debian-gcc-devel', tag: 'latest'}
     env:
-      R_ORG: rstudio
-      R_IMAGE: r-base
-      R_TAG: ${{ matrix.r_version }}-${{ matrix.r_image }}
+      R_ORG: ${{ matrix.config.org }}
+      R_IMAGE: ${{ matrix.config.image }}
+      R_TAG: ${{ matrix.config.tag }}
     steps:
       - name: Checkout Arrow
         uses: actions/checkout@v2
@@ -120,8 +119,8 @@ jobs:
         uses: actions/cache@v1
         with:
           path: .docker
-          key: ${{ matrix.r_image }}-r-${{ hashFiles('cpp/**') }}
-          restore-keys: ${{ matrix.r_image }}-r-
+          key: ${{ matrix.config.image }}-r-${{ hashFiles('cpp/**') }}
+          restore-keys: ${{ matrix.config.image }}-r-
       - name: Setup Python
         uses: actions/setup-python@v1
         with:
diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml
index 783ca91cb82..0558ac8a2cf 100644
--- a/.github/workflows/ruby.yml
+++ b/.github/workflows/ruby.yml
@@ -174,6 +174,7 @@ jobs:
       ARROW_JEMALLOC: OFF
       ARROW_PARQUET: ON
       ARROW_PYTHON: OFF
+      ARROW_S3: ON
       ARROW_USE_GLOG: OFF
       ARROW_WITH_BROTLI: ON
       ARROW_WITH_BZ2: ON
@@ -187,7 +188,6 @@ jobs:
       CMAKE_ARGS: >-
         -DARROW_PACKAGE_PREFIX=/mingw${{ matrix.mingw-n-bits }}
         -DBoost_NO_BOOST_CMAKE=ON
-      CMAKE_GENERATOR: MSYS Makefiles
       CMAKE_UNITY_BUILD: ON
     steps:
       - name: Disable Crash Dialogs
diff --git a/.github/workflows/rust_cron.yml b/.github/workflows/rust_cron.yml
new file mode 100644
index 00000000000..378f2dd1081
--- /dev/null
+++ b/.github/workflows/rust_cron.yml
@@ -0,0 +1,56 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Rust Cron
+
+on:
+  push:
+    paths:
+      - '.github/workflows/rust_cron.yml'
+  pull_request:
+    paths:
+      - '.github/workflows/rust_cron.yml'
+  schedule:
+    - cron: 0 */12 * * *
+
+jobs:
+  coverage:
+    name: AMD64 Debian 10 Rust ${{ matrix.rust }} Coverage
+    runs-on: ubuntu-latest
+    if: ${{ !contains(github.event.pull_request.title, 'WIP') && github.repository == 'apache/arrow' }}
+    strategy:
+      fail-fast: false
+      matrix:
+        rust: [nightly-2020-04-22]
+    env:
+      RUST: ${{ matrix.rust }}
+    steps:
+      - name: Checkout Arrow
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Fetch Submodules and Tags
+        run: ci/scripts/util_checkout.sh
+      - name: Run coverage
+        shell: bash
+        run: |
+          echo ${RUST} > rust/rust-toolchain &&
+          ci/scripts/rust_coverage.sh `pwd` `pwd`/build $RUST
+      - name: Report coverage
+        continue-on-error: true
+        shell: bash
+        run: bash <(curl -s https://codecov.io/bash)
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d72b126ef34..e70eaceaf41 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -55,7 +55,7 @@ repos:
     hooks:
       - id: flake8
        name: Python Format
-        files: ^(python|crossbow|integration)/
+        files: ^(python|dev|integration)/
        types:
          - file
          - python
diff --git a/.travis.yml b/.travis.yml
index fa5d84c82c7..84bbbce23db 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -40,22 +40,81 @@ jobs:
         ARCH: arm64v8
         ARROW_CI_MODULES: "CPP"
         DOCKER_IMAGE_ID: ubuntu-cpp
+        # ARROW_USE_GLOG=OFF is needed to avoid a build error caused by
+        # glog and CMAKE_UNITY_BUILD=ON.
+        #
+        # Disable ARROW_S3 because it often causes "No output has
+        # been received in the last 10m0s, this potentially indicates
+        # a stalled build or something wrong with the build itself."
+        # on Travis CI.
+        DOCKER_RUN_ARGS: >-
+          "
+          -e ARROW_BUILD_STATIC=OFF
+          -e ARROW_ORC=OFF
+          -e ARROW_S3=OFF
+          -e ARROW_USE_GLOG=OFF
+          -e CMAKE_UNITY_BUILD=ON
+          "
+        # We need to use a smaller build when the cache doesn't exist,
+        # because Travis CI has a "No output has been received in the
+        # last 10m0s" limitation. If we build many modules, we reach
+        # the limitation.
+        DOCKER_RUN_ARGS_NO_CACHE: >-
+          "
+          -e ARROW_BUILD_TESTS=OFF
+          -e ARROW_GANDIVA=OFF
+          -e ARROW_PARQUET=OFF
+          "
+        # LLVM's APT repository provides only arm64 binaries.
+        # We should use the LLVM provided by Ubuntu.
+        LLVM: "10"
         UBUNTU: "20.04"
+
     - name: "C++ on s390x"
       os: linux
       arch: s390x
       env:
         ARCH: s390x
         ARROW_CI_MODULES: "CPP"
-        ARROW_FLIGHT: "ON"
-        ARROW_PARQUET: "OFF"
         DOCKER_IMAGE_ID: ubuntu-cpp
-        PARQUET_BUILD_EXAMPLES: "OFF"
-        PARQUET_BUILD_EXECUTABLES: "OFF"
-        Protobuf_SOURCE: "BUNDLED"
+        # Can't use CMAKE_UNITY_BUILD=ON because of a compiler crash.
+        # Can't enable ARROW_S3 because the compiler is killed while
+        # compiling aws-sdk-cpp.
+        DOCKER_RUN_ARGS: >-
+          "
+          -e ARROW_BUILD_STATIC=OFF
+          -e ARROW_FLIGHT=ON
+          -e ARROW_ORC=OFF
+          -e ARROW_PARQUET=OFF
+          -e ARROW_S3=OFF
+          -e PARQUET_BUILD_EXAMPLES=OFF
+          -e PARQUET_BUILD_EXECUTABLES=OFF
+          -e Protobuf_SOURCE=BUNDLED
+          -e cares_SOURCE=BUNDLED
+          -e gRPC_SOURCE=BUNDLED
+          "
+        # LLVM's APT repository doesn't provide s390x binaries.
+        # We should use the LLVM provided by Ubuntu.
+        LLVM: "10"
         UBUNTU: "20.04"
-        cares_SOURCE: "BUNDLED"
-        gRPC_SOURCE: "BUNDLED"
+
+    - name: "Go on s390x"
+      os: linux
+      arch: s390x
+      env:
+        ARCH: s390x
+        ARROW_CI_MODULES: "GO"
+        DOCKER_IMAGE_ID: debian-go
+
+    - name: "Java on s390x"
+      os: linux
+      arch: s390x
+      env:
+        ARCH: s390x
+        ARROW_CI_MODULES: "JAVA"
+        DOCKER_IMAGE_ID: debian-java
+        JDK: 11
+
   allow_failures:
     - arch: s390x
@@ -87,15 +146,13 @@ script:
   # /home/travis/.travis/functions: line 109: ulimit: core file size: cannot modify limit: Operation not permitted
   - |
     ulimit -c unlimited || :
+  - |
+    if [ $(ls $TRAVIS_BUILD_DIR/.docker | wc -l) -eq 0 ]; then
+      DOCKER_RUN_ARGS="${DOCKER_RUN_ARGS} ${DOCKER_RUN_ARGS_NO_CACHE}"
+    fi
   - |
     archery docker run \
-      -e ARROW_FLIGHT=${ARROW_FLIGHT:-OFF} \
-      -e ARROW_PARQUET=${ARROW_PARQUET:-ON} \
-      -e PARQUET_BUILD_EXAMPLES=${PARQUET_BUILD_EXAMPLES:-ON} \
-      -e PARQUET_BUILD_EXECUTABLES=${PARQUET_BUILD_EXECUTABLES:-ON} \
-      -e Protobuf_SOURCE=${Protobuf_SOURCE:-} \
-      -e cares_SOURCE=${cares_SOURCE:-} \
-      -e gRPC_SOURCE=${gRPC_SOURCE:-} \
+      ${DOCKER_RUN_ARGS} \
       --volume ${PWD}/build:/build \
       ${DOCKER_IMAGE_ID}
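Putting the pieces above together: on a cold cache the no-cache arguments are appended, so for the arm64 C++ job the effective invocation would look roughly like this (an illustrative expansion of the variables defined above, not a literal line from the config):

    archery docker run \
      -e ARROW_BUILD_STATIC=OFF -e ARROW_ORC=OFF -e ARROW_S3=OFF \
      -e ARROW_USE_GLOG=OFF -e CMAKE_UNITY_BUILD=ON \
      -e ARROW_BUILD_TESTS=OFF -e ARROW_GANDIVA=OFF -e ARROW_PARQUET=OFF \
      --volume ${PWD}/build:/build \
      ubuntu-cpp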
+ LLVM: "10" UBUNTU: "20.04" - cares_SOURCE: "BUNDLED" - gRPC_SOURCE: "BUNDLED" + + - name: "Go on s390x" + os: linux + arch: s390x + env: + ARCH: s390x + ARROW_CI_MODULES: "GO" + DOCKER_IMAGE_ID: debian-go + + - name: "Java on s390x" + os: linux + arch: s390x + env: + ARCH: s390x + ARROW_CI_MODULES: "JAVA" + DOCKER_IMAGE_ID: debian-java + JDK: 11 + allow_failures: - arch: s390x @@ -87,15 +146,13 @@ script: # /home/travis/.travis/functions: line 109: ulimit: core file size: cannot modify limit: Operation not permitted - | ulimit -c unlimited || : + - | + if [ $(ls $TRAVIS_BUILD_DIR/.docker | wc -l) -eq 0 ]; then + DOCKER_RUN_ARGS="${DOCKER_RUN_ARGS} ${DOCKER_RUN_ARGS_NO_CACHE}" + fi - | archery docker run \ - -e ARROW_FLIGHT=${ARROW_FLIGHT:-OFF} \ - -e ARROW_PARQUET=${ARROW_PARQUET:-ON} \ - -e PARQUET_BUILD_EXAMPLES=${PARQUET_BUILD_EXAMPLES:-ON} \ - -e PARQUET_BUILD_EXECUTABLES=${PARQUET_BUILD_EXECUTABLES:-ON} \ - -e Protobuf_SOURCE=${Protobuf_SOURCE:-} \ - -e cares_SOURCE=${cares_SOURCE:-} \ - -e gRPC_SOURCE=${gRPC_SOURCE:-} \ + ${DOCKER_RUN_ARGS} \ --volume ${PWD}/build:/build \ ${DOCKER_IMAGE_ID} diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e5619b9a33..686f1d4ef5e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,588 @@ +# Apache Arrow 2.0.0 (2020-10-13) + +## Bug Fixes + +* [ARROW-2367](https://issues.apache.org/jira/browse/ARROW-2367) - [Python] ListArray has trouble with sizes greater than kMaximumCapacity +* [ARROW-4189](https://issues.apache.org/jira/browse/ARROW-4189) - [CI] [Rust] Fix broken cargo coverage +* [ARROW-4917](https://issues.apache.org/jira/browse/ARROW-4917) - [C++] orc\_ep fails in cpp-alpine docker +* [ARROW-5578](https://issues.apache.org/jira/browse/ARROW-5578) - [C++][Flight] Flight does not build out of the box on Alpine Linux +* [ARROW-7226](https://issues.apache.org/jira/browse/ARROW-7226) - [JSON][Python] Json loader fails on example in documentation. 
+* [ARROW-7384](https://issues.apache.org/jira/browse/ARROW-7384) - [Website] Fix search indexing warning reported by Google +* [ARROW-7517](https://issues.apache.org/jira/browse/ARROW-7517) - [C++] Builder does not honour dictionary type provided during initialization +* [ARROW-7663](https://issues.apache.org/jira/browse/ARROW-7663) - [Python] from\_pandas gives TypeError instead of ArrowTypeError in some cases +* [ARROW-7903](https://issues.apache.org/jira/browse/ARROW-7903) - [Rust] [DataFusion] Upgrade SQLParser dependency for DataFusion +* [ARROW-7957](https://issues.apache.org/jira/browse/ARROW-7957) - [Python] ParquetDataset cannot take HadoopFileSystem as filesystem +* [ARROW-8265](https://issues.apache.org/jira/browse/ARROW-8265) - [Rust] [DataFusion] Table API collect() should not require context +* [ARROW-8394](https://issues.apache.org/jira/browse/ARROW-8394) - [JS] Typescript compiler errors for arrow d.ts files, when using es2015-esm package +* [ARROW-8735](https://issues.apache.org/jira/browse/ARROW-8735) - [Rust] [Parquet] Parquet crate fails to compile on Arm architecture +* [ARROW-8749](https://issues.apache.org/jira/browse/ARROW-8749) - [C++] IpcFormatWriter writes dictionary batches with wrong ID +* [ARROW-8773](https://issues.apache.org/jira/browse/ARROW-8773) - [Python] pyarrow schema.empty\_table() does not preserve nullability of fields +* [ARROW-9028](https://issues.apache.org/jira/browse/ARROW-9028) - [R] Should be able to convert an empty table +* [ARROW-9096](https://issues.apache.org/jira/browse/ARROW-9096) - [Python] Pandas roundtrip with object-dtype column labels with integer values: data type "integer" not understood +* [ARROW-9177](https://issues.apache.org/jira/browse/ARROW-9177) - [C++][Parquet] Tracking issue for cross-implementation LZ4 Parquet compression compatibility +* [ARROW-9414](https://issues.apache.org/jira/browse/ARROW-9414) - [C++] apt package includes headers for S3 interface, but no support +* [ARROW-9462](https://issues.apache.org/jira/browse/ARROW-9462) - [Go] The Indentation after the first Record arrjson writer is missing +* [ARROW-9463](https://issues.apache.org/jira/browse/ARROW-9463) - [Go] The writer is double closed in TestReadWrite +* [ARROW-9490](https://issues.apache.org/jira/browse/ARROW-9490) - [Python] pyarrow array creation for specific set of numpy scalars fails +* [ARROW-9495](https://issues.apache.org/jira/browse/ARROW-9495) - [C++] Equality assertions don't handle Inf / -Inf properly +* [ARROW-9520](https://issues.apache.org/jira/browse/ARROW-9520) - [Rust] [DataFusion] Can't alias an aggregate expression +* [ARROW-9528](https://issues.apache.org/jira/browse/ARROW-9528) - [Python] Honor tzinfo information when converting from datetime to pyarrow +* [ARROW-9532](https://issues.apache.org/jira/browse/ARROW-9532) - [Python] Building pyarrow for MacPorts on macOS +* [ARROW-9535](https://issues.apache.org/jira/browse/ARROW-9535) - [Python] Remove symlink fixes from conda recipe +* [ARROW-9536](https://issues.apache.org/jira/browse/ARROW-9536) - Missing parameters in PlasmaOutOfMemoryException.java +* [ARROW-9541](https://issues.apache.org/jira/browse/ARROW-9541) - [C++] CMakeLists requires UTF8PROC\_STATIC when building static library +* [ARROW-9544](https://issues.apache.org/jira/browse/ARROW-9544) - [R] version argument of write\_parquet not working +* [ARROW-9546](https://issues.apache.org/jira/browse/ARROW-9546) - [Python] Clean up Pandas Metadata Conversion test +* 
[ARROW-9548](https://issues.apache.org/jira/browse/ARROW-9548) - [Go] Test output files in tmp directory are not removed correctly +* [ARROW-9549](https://issues.apache.org/jira/browse/ARROW-9549) - [Rust] Parquet no longer builds +* [ARROW-9554](https://issues.apache.org/jira/browse/ARROW-9554) - [Java] FixedWidthInPlaceVectorSorter sometimes produces wrong result +* [ARROW-9556](https://issues.apache.org/jira/browse/ARROW-9556) - [Python][C++] Segfaults in UnionArray with null values +* [ARROW-9560](https://issues.apache.org/jira/browse/ARROW-9560) - [Packaging] conda recipes failing due to missing conda-forge.yml +* [ARROW-9569](https://issues.apache.org/jira/browse/ARROW-9569) - [CI][R] Fix rtools35 builds for msys2 key change +* [ARROW-9570](https://issues.apache.org/jira/browse/ARROW-9570) - [Doc] Clean up sphinx sidebar +* [ARROW-9573](https://issues.apache.org/jira/browse/ARROW-9573) - [Python] Parquet doesn't load when partitioned column starts with '\_' +* [ARROW-9574](https://issues.apache.org/jira/browse/ARROW-9574) - [R] Cleanups for CRAN 1.0.0 release +* [ARROW-9575](https://issues.apache.org/jira/browse/ARROW-9575) - [R] gcc-UBSAN failure on CRAN +* [ARROW-9577](https://issues.apache.org/jira/browse/ARROW-9577) - [Python][C++] posix\_madvise error on Debian in pyarrow 1.0.0 +* [ARROW-9583](https://issues.apache.org/jira/browse/ARROW-9583) - [Rust] Offset is mishandled in arithmetic and boolean compute kernels +* [ARROW-9588](https://issues.apache.org/jira/browse/ARROW-9588) - [C++] clang/win: Copy constructor of ParquetInvalidOrCorruptedFileException not correctly triggered +* [ARROW-9589](https://issues.apache.org/jira/browse/ARROW-9589) - [C++/R] arrow\_exports.h contains structs declared as class +* [ARROW-9592](https://issues.apache.org/jira/browse/ARROW-9592) - [CI] Update homebrew before calling brew bundle +* [ARROW-9596](https://issues.apache.org/jira/browse/ARROW-9596) - [CI][Crossbow] Fix homebrew-cpp again, again +* [ARROW-9597](https://issues.apache.org/jira/browse/ARROW-9597) - [C++] AddAlias in compute::FunctionRegistry should be synchronized +* [ARROW-9598](https://issues.apache.org/jira/browse/ARROW-9598) - [C++][Parquet] Spaced definition levels is not assigned correctly. 
+* [ARROW-9599](https://issues.apache.org/jira/browse/ARROW-9599) - [CI] Appveyor toolchain build fails because CMake detects different C and C++ compilers +* [ARROW-9600](https://issues.apache.org/jira/browse/ARROW-9600) - [Rust] When used as a crate dependency, arrow-flight is rebuilt on every invocation of cargo build +* [ARROW-9600](https://issues.apache.org/jira/browse/ARROW-9600) - [Rust] When used as a crate dependency, arrow-flight is rebuilt on every invocation of cargo build +* [ARROW-9602](https://issues.apache.org/jira/browse/ARROW-9602) - [R] Improve cmake detection in Linux build +* [ARROW-9603](https://issues.apache.org/jira/browse/ARROW-9603) - [C++][Parquet] Write Arrow relies on unspecified behavior for nested types +* [ARROW-9606](https://issues.apache.org/jira/browse/ARROW-9606) - [C++][Dataset] in expressions don't work with \>1 partition levels +* [ARROW-9609](https://issues.apache.org/jira/browse/ARROW-9609) - [C++] CSV datasets don't materialize virtual columns +* [ARROW-9621](https://issues.apache.org/jira/browse/ARROW-9621) - [Python] test\_move\_file() is failed with fsspec 0.8.0 +* [ARROW-9622](https://issues.apache.org/jira/browse/ARROW-9622) - [Java] ComplexCopier fails if a structvector has a child UnionVector with nulls +* [ARROW-9628](https://issues.apache.org/jira/browse/ARROW-9628) - [Rust] Clippy PR test failing intermittently on Rust / AMD64 MacOS +* [ARROW-9629](https://issues.apache.org/jira/browse/ARROW-9629) - [Python] Kartothek integration tests failing due to missing freezegun module +* [ARROW-9631](https://issues.apache.org/jira/browse/ARROW-9631) - [Rust] Arrow crate should not depend on flight +* [ARROW-9631](https://issues.apache.org/jira/browse/ARROW-9631) - [Rust] Arrow crate should not depend on flight +* [ARROW-9642](https://issues.apache.org/jira/browse/ARROW-9642) - [C++] Let MakeBuilder refer DictionaryType's index\_type for deciding the starting bit width of the indices +* [ARROW-9643](https://issues.apache.org/jira/browse/ARROW-9643) - [C++] Illegal instruction on haswell cpu +* [ARROW-9644](https://issues.apache.org/jira/browse/ARROW-9644) - [C++][Dataset] Do not check for ignore\_prefixes in the base path +* [ARROW-9652](https://issues.apache.org/jira/browse/ARROW-9652) - [Rust][DataFusion] Panic trying to select \* from a CSV (panicked at 'index out of bounds: the len is 0 but the index is 0) +* [ARROW-9653](https://issues.apache.org/jira/browse/ARROW-9653) - [Rust][DataFusion] Multi-column Group by: Invalid Argument Error +* [ARROW-9659](https://issues.apache.org/jira/browse/ARROW-9659) - [C++] RecordBatchStreamReader throws on CUDA device buffers +* [ARROW-9660](https://issues.apache.org/jira/browse/ARROW-9660) - [C++] IPC - dictionaries in maps +* [ARROW-9666](https://issues.apache.org/jira/browse/ARROW-9666) - [Python][wheel][Windows] library missing failure by ARROW-9412 +* [ARROW-9670](https://issues.apache.org/jira/browse/ARROW-9670) - [C++][FlightRPC] Close()ing a DoPut with an ongoing read locks up the client +* [ARROW-9684](https://issues.apache.org/jira/browse/ARROW-9684) - [C++] Fix undefined behaviour on invalid IPC / Parquet input (OSS-Fuzz) +* [ARROW-9692](https://issues.apache.org/jira/browse/ARROW-9692) - [Python] distutils import warning +* [ARROW-9693](https://issues.apache.org/jira/browse/ARROW-9693) - [CI][Docs] Nightly docs build fails +* [ARROW-9696](https://issues.apache.org/jira/browse/ARROW-9696) - [Rust] [Datafusion] nested binary expressions broken +* 
[ARROW-9698](https://issues.apache.org/jira/browse/ARROW-9698) - [C++] Revert "Add -NDEBUG flag to arrow.pc" +* [ARROW-9700](https://issues.apache.org/jira/browse/ARROW-9700) - [Python] create\_library\_symlinks doesn't work in macos +* [ARROW-9712](https://issues.apache.org/jira/browse/ARROW-9712) - [Rust] [DataFusion] ParquetScanExec panics on error +* [ARROW-9714](https://issues.apache.org/jira/browse/ARROW-9714) - [Rust] [DataFusion] TypeCoercionRule not implemented for Limit or Sort +* [ARROW-9716](https://issues.apache.org/jira/browse/ARROW-9716) - [Rust] [DataFusion] MergeExec should have concurrency limit +* [ARROW-9726](https://issues.apache.org/jira/browse/ARROW-9726) - [Rust] [DataFusion] ParquetScanExec launches threads too early +* [ARROW-9727](https://issues.apache.org/jira/browse/ARROW-9727) - [C++] Fix crash on invalid IPC input (OSS-Fuzz) +* [ARROW-9729](https://issues.apache.org/jira/browse/ARROW-9729) - [Java] Error Prone causes other annotation processors to not work with Eclipse +* [ARROW-9733](https://issues.apache.org/jira/browse/ARROW-9733) - [Rust][DataFusion] Aggregates COUNT/MIN/MAX don't work on VARCHAR columns +* [ARROW-9734](https://issues.apache.org/jira/browse/ARROW-9734) - [Rust] [DataFusion] TableProvider.scan executing partitions prematurely +* [ARROW-9741](https://issues.apache.org/jira/browse/ARROW-9741) - [Rust] [DataFusion] Incorrect count in TPC-H query 1 result set +* [ARROW-9743](https://issues.apache.org/jira/browse/ARROW-9743) - [R] Sanitize paths in open\_dataset +* [ARROW-9744](https://issues.apache.org/jira/browse/ARROW-9744) - [Python] Failed to install on aarch64 +* [ARROW-9764](https://issues.apache.org/jira/browse/ARROW-9764) - [CI][Java] Push wrong Docker image +* [ARROW-9768](https://issues.apache.org/jira/browse/ARROW-9768) - [Python] Pyarrow allows for unsafe conversions of datetime objects to timestamp nanoseconds +* [ARROW-9768](https://issues.apache.org/jira/browse/ARROW-9768) - [Python] Pyarrow allows for unsafe conversions of datetime objects to timestamp nanoseconds +* [ARROW-9778](https://issues.apache.org/jira/browse/ARROW-9778) - [Rust] [DataFusion] Logical and physical schemas' nullability does not match in 8 out of 20 end-to-end tests +* [ARROW-9783](https://issues.apache.org/jira/browse/ARROW-9783) - [Rust] [DataFusion] Logical aggregate expressions require explicit data type +* [ARROW-9785](https://issues.apache.org/jira/browse/ARROW-9785) - [Python] pyarrow/tests/test\_fs.py::test\_s3\_options too slow +* [ARROW-9789](https://issues.apache.org/jira/browse/ARROW-9789) - [C++] Don't install jemalloc in parallel +* [ARROW-9790](https://issues.apache.org/jira/browse/ARROW-9790) - [Rust] [Parquet] ParquetFileArrowReader fails to decode all pages if batches fall exactly on row group boundaries +* [ARROW-9790](https://issues.apache.org/jira/browse/ARROW-9790) - [Rust] [Parquet] ParquetFileArrowReader fails to decode all pages if batches fall exactly on row group boundaries +* [ARROW-9793](https://issues.apache.org/jira/browse/ARROW-9793) - [Rust] [DataFusion] Tests failing in master +* [ARROW-9797](https://issues.apache.org/jira/browse/ARROW-9797) - [Rust] AMD64 Conda Integration Tests is failing for the Master branch +* [ARROW-9799](https://issues.apache.org/jira/browse/ARROW-9799) - [Rust] [DataFusion] Implementation of physical binary expression get\_type method is incorrect +* [ARROW-9800](https://issues.apache.org/jira/browse/ARROW-9800) - [Rust] [Parquet] "min" and "max" written to standard out when writing columns +* 
[ARROW-9809](https://issues.apache.org/jira/browse/ARROW-9809) - [Rust] [DataFusion] logical schema = physical schema is not true +* [ARROW-9814](https://issues.apache.org/jira/browse/ARROW-9814) - [Python] Crash in test\_parquet.py::test\_read\_partitioned\_directory\_s3fs +* [ARROW-9815](https://issues.apache.org/jira/browse/ARROW-9815) - [Rust] [DataFusion] Deadlock in creation of physical plan with two udfs +* [ARROW-9815](https://issues.apache.org/jira/browse/ARROW-9815) - [Rust] [DataFusion] Deadlock in creation of physical plan with two udfs +* [ARROW-9815](https://issues.apache.org/jira/browse/ARROW-9815) - [Rust] [DataFusion] Deadlock in creation of physical plan with two udfs +* [ARROW-9816](https://issues.apache.org/jira/browse/ARROW-9816) - [C++] Escape quotes in config.h +* [ARROW-9827](https://issues.apache.org/jira/browse/ARROW-9827) - [Python] pandas.read\_parquet fails for wide parquet files and pyarrow 1.0.X +* [ARROW-9831](https://issues.apache.org/jira/browse/ARROW-9831) - [Rust] [DataFusion] Fix compilation error +* [ARROW-9840](https://issues.apache.org/jira/browse/ARROW-9840) - [Python] Python fs documentation out of date with code +* [ARROW-9846](https://issues.apache.org/jira/browse/ARROW-9846) - [Rust] Master branch broken build +* [ARROW-9851](https://issues.apache.org/jira/browse/ARROW-9851) - [C++] Valgrind errors due to unrecognized instructions +* [ARROW-9852](https://issues.apache.org/jira/browse/ARROW-9852) - [C++] Fix crash on invalid IPC input (OSS-Fuzz) +* [ARROW-9852](https://issues.apache.org/jira/browse/ARROW-9852) - [C++] Fix crash on invalid IPC input (OSS-Fuzz) +* [ARROW-9855](https://issues.apache.org/jira/browse/ARROW-9855) - [R] Fix bad merge/Rcpp conflict +* [ARROW-9859](https://issues.apache.org/jira/browse/ARROW-9859) - [C++] S3 FileSystemFromUri with special char in secret key fails +* [ARROW-9864](https://issues.apache.org/jira/browse/ARROW-9864) - [Python] pathlib.Path not supported in write\_to\_dataset with partition columns +* [ARROW-9874](https://issues.apache.org/jira/browse/ARROW-9874) - [C++] NewStreamWriter / NewFileWriter don't own output stream +* [ARROW-9876](https://issues.apache.org/jira/browse/ARROW-9876) - [CI][C++] Travis ARM jobs timeout +* [ARROW-9877](https://issues.apache.org/jira/browse/ARROW-9877) - [C++][CI] homebrew-cpp fails due to avx512 +* [ARROW-9879](https://issues.apache.org/jira/browse/ARROW-9879) - [Python] ChunkedArray.\_\_getitem\_\_ doesn't work with numpy scalars +* [ARROW-9882](https://issues.apache.org/jira/browse/ARROW-9882) - [C++/Python] Update conda-forge-pinning to 3 for OSX conda packages +* [ARROW-9883](https://issues.apache.org/jira/browse/ARROW-9883) - [R] Fix linuxlibs.R install script for R < 3.6 +* [ARROW-9888](https://issues.apache.org/jira/browse/ARROW-9888) - [Rust] [DataFusion] ExecutionContext can not be shared between threads +* [ARROW-9889](https://issues.apache.org/jira/browse/ARROW-9889) - [Rust][DataFusion] Datafusion CLI: CREATE EXTERNAL TABLE errors with "Unsupported logical plan variant" +* [ARROW-9897](https://issues.apache.org/jira/browse/ARROW-9897) - [C++][Gandiva] Add to\_date() function from pattern +* [ARROW-9906](https://issues.apache.org/jira/browse/ARROW-9906) - [Python] Crash in test\_parquet.py::test\_parquet\_writer\_filesystem\_s3\_uri (closing NativeFile from S3FileSystem) +* [ARROW-9913](https://issues.apache.org/jira/browse/ARROW-9913) - [C++] Outputs of Decimal128::FromString depend on presence of one another +* 
[ARROW-9920](https://issues.apache.org/jira/browse/ARROW-9920) - [Python] pyarrow.concat\_arrays segfaults when passing it a chunked array +* [ARROW-9922](https://issues.apache.org/jira/browse/ARROW-9922) - [Rust] Add \`try\_from(Vec\>)\` to StructArray +* [ARROW-9924](https://issues.apache.org/jira/browse/ARROW-9924) - [Python] Performance regression reading individual Parquet files using Dataset interface +* [ARROW-9931](https://issues.apache.org/jira/browse/ARROW-9931) - [C++] Fix undefined behaviour on invalid IPC (OSS-Fuzz) +* [ARROW-9932](https://issues.apache.org/jira/browse/ARROW-9932) - [R] Arrow 1.0.1 R package fails to install on R3.4 over linux +* [ARROW-9936](https://issues.apache.org/jira/browse/ARROW-9936) - [Python] Fix / test relative file paths in pyarrow.parquet +* [ARROW-9937](https://issues.apache.org/jira/browse/ARROW-9937) - [Rust] [DataFusion] Average is not correct +* [ARROW-9943](https://issues.apache.org/jira/browse/ARROW-9943) - [C++] Arrow metadata not applied recursively when reading Parquet file +* [ARROW-9946](https://issues.apache.org/jira/browse/ARROW-9946) - [R] ParquetFileWriter segfaults when \`sink\` is a string +* [ARROW-9953](https://issues.apache.org/jira/browse/ARROW-9953) - [R] Declare minimum version for bit64 +* [ARROW-9962](https://issues.apache.org/jira/browse/ARROW-9962) - [Python] Conversion to pandas with index column using fixed timezone fails +* [ARROW-9968](https://issues.apache.org/jira/browse/ARROW-9968) - [C++] UBSAN link failure with \_\_int8\_t +* [ARROW-9969](https://issues.apache.org/jira/browse/ARROW-9969) - [C++] RecordBatchBuilder yields invalid result with dictionary fields +* [ARROW-9970](https://issues.apache.org/jira/browse/ARROW-9970) - [Go] checkptr failures in sum methods +* [ARROW-9972](https://issues.apache.org/jira/browse/ARROW-9972) - [CI] Work around grpc-re2 clash on Homebrew +* [ARROW-9973](https://issues.apache.org/jira/browse/ARROW-9973) - [Java] JDBC DateConsumer does not allow dates before epoch +* [ARROW-9976](https://issues.apache.org/jira/browse/ARROW-9976) - [Python] ArrowCapacityError when doing Table.from\_pandas with large dataframe +* [ARROW-9990](https://issues.apache.org/jira/browse/ARROW-9990) - [Rust] [DataFusion] NOT is not plannable +* [ARROW-9993](https://issues.apache.org/jira/browse/ARROW-9993) - [Python] Tzinfo - string roundtrip fails on pytz.StaticTzInfo objects +* [ARROW-9994](https://issues.apache.org/jira/browse/ARROW-9994) - [C++][Python] Auto chunking nested array containing binary-like fields result malformed output +* [ARROW-9996](https://issues.apache.org/jira/browse/ARROW-9996) - [C++] Dictionary is unset when calling DictionaryArray.GetScalar for null values +* [ARROW-10003](https://issues.apache.org/jira/browse/ARROW-10003) - [C++] Create directories in CopyFiles when copying within the same filesystem +* [ARROW-10008](https://issues.apache.org/jira/browse/ARROW-10008) - [Python] pyarrow.parquet.read\_table fails with predicate pushdown on categorical data with use\_legacy\_dataset=False +* [ARROW-10011](https://issues.apache.org/jira/browse/ARROW-10011) - [C++] Make FindRE2.cmake re-entrant +* [ARROW-10012](https://issues.apache.org/jira/browse/ARROW-10012) - [C++] Sporadic failures in CopyFiles test +* [ARROW-10013](https://issues.apache.org/jira/browse/ARROW-10013) - [C++][CI] Flight test failure in TestFlightClient.GenericOptions +* [ARROW-10017](https://issues.apache.org/jira/browse/ARROW-10017) - [Java] LargeMemoryUtil.checkedCastToInt has buggy logic +* 
[ARROW-10022](https://issues.apache.org/jira/browse/ARROW-10022) - [C++] [Compute] core dumped on some scalar-arithmetic-benchmark +* [ARROW-10027](https://issues.apache.org/jira/browse/ARROW-10027) - [Python] Incorrect null column returned when using a dataset filter expression. +* [ARROW-10034](https://issues.apache.org/jira/browse/ARROW-10034) - [Rust] Master build broken +* [ARROW-10041](https://issues.apache.org/jira/browse/ARROW-10041) - [Rust] Possible to create LargeStringArray with DataType::Utf8 +* [ARROW-10047](https://issues.apache.org/jira/browse/ARROW-10047) - [CI] Conda integration tests failing with cmake error +* [ARROW-10048](https://issues.apache.org/jira/browse/ARROW-10048) - [Rust] Error in aggregate of min/max for strings +* [ARROW-10049](https://issues.apache.org/jira/browse/ARROW-10049) - [C++/Python] Sync conda recipe with conda-forge +* [ARROW-10060](https://issues.apache.org/jira/browse/ARROW-10060) - [Rust] [DataFusion] MergeExec currently discards partitions with errors +* [ARROW-10062](https://issues.apache.org/jira/browse/ARROW-10062) - [Rust]: Fix for null elems for DoubleEndedIter for DictArray +* [ARROW-10073](https://issues.apache.org/jira/browse/ARROW-10073) - [Python] Test test\_parquet\_nested\_storage relies on dict item ordering +* [ARROW-10081](https://issues.apache.org/jira/browse/ARROW-10081) - [C++/Python] Fix bash syntax in drone.io conda builds +* [ARROW-10085](https://issues.apache.org/jira/browse/ARROW-10085) - [C++] S3 tests fail on AppVeyor +* [ARROW-10087](https://issues.apache.org/jira/browse/ARROW-10087) - [CI] Fix nightly docs job +* [ARROW-10098](https://issues.apache.org/jira/browse/ARROW-10098) - [R][Doc] Fix copy\_files doc mismatch +* [ARROW-10104](https://issues.apache.org/jira/browse/ARROW-10104) - [Python] Separate tests into its own conda package +* [ARROW-10114](https://issues.apache.org/jira/browse/ARROW-10114) - [R] Segfault in to\_dataframe\_parallel with deeply nested structs +* [ARROW-10116](https://issues.apache.org/jira/browse/ARROW-10116) - [Python][Packaging] Fix gRPC linking error in macOS wheels builds +* [ARROW-10119](https://issues.apache.org/jira/browse/ARROW-10119) - [C++] Fix Parquet crashes on invalid input (OSS-Fuzz) +* [ARROW-10121](https://issues.apache.org/jira/browse/ARROW-10121) - [C++][Python] Variable dictionaries do not survive roundtrip to IPC stream +* [ARROW-10124](https://issues.apache.org/jira/browse/ARROW-10124) - [R] Write functions don't follow umask setting +* [ARROW-10125](https://issues.apache.org/jira/browse/ARROW-10125) - [R] Int64 downcast check doesn't consider all chunks +* [ARROW-10130](https://issues.apache.org/jira/browse/ARROW-10130) - [C++][Dataset] ParquetFileFragment::SplitByRowGroup does not preserve "complete\_metadata" status +* [ARROW-10136](https://issues.apache.org/jira/browse/ARROW-10136) - [Rust][Arrow] Nulls are transformed into "" after filtering for StringArray +* [ARROW-10137](https://issues.apache.org/jira/browse/ARROW-10137) - [R] Fix cpp helper that breaks if libarrow is not present +* [ARROW-10147](https://issues.apache.org/jira/browse/ARROW-10147) - [Python] Constructing pandas metadata fails if an Index name is not JSON-serializable by default +* [ARROW-10150](https://issues.apache.org/jira/browse/ARROW-10150) - [C++] Fix crashes on invalid Parquet file (OSS-Fuzz) +* [ARROW-10169](https://issues.apache.org/jira/browse/ARROW-10169) - [Rust] Nulls should be rendered as "" rather than default value when pretty printing arrays +* 
[ARROW-10175](https://issues.apache.org/jira/browse/ARROW-10175) - [CI] Nightly hdfs integration test job fails +* [ARROW-10176](https://issues.apache.org/jira/browse/ARROW-10176) - [CI] Nightly valgrind job fails +* [ARROW-10178](https://issues.apache.org/jira/browse/ARROW-10178) - [CI] Fix spark master integration test build setup +* [ARROW-10179](https://issues.apache.org/jira/browse/ARROW-10179) - [Rust] Labeler is not labeling +* [ARROW-10181](https://issues.apache.org/jira/browse/ARROW-10181) - [Rust] Arrow tests fail to compile on Raspberry Pi (32 bit) +* [ARROW-10188](https://issues.apache.org/jira/browse/ARROW-10188) - [Rust] [DataFusion] Some examples are broken +* [ARROW-10189](https://issues.apache.org/jira/browse/ARROW-10189) - [Doc] C data interface example for i32 uses \`l\`, not \`i\`, in the format +* [ARROW-10192](https://issues.apache.org/jira/browse/ARROW-10192) - [C++][Python] Segfault when converting nested struct array with dictionary field to pandas series +* [ARROW-10193](https://issues.apache.org/jira/browse/ARROW-10193) - [Python] Segfault when converting to fixed size binary array +* [ARROW-10200](https://issues.apache.org/jira/browse/ARROW-10200) - [Java][CI] Fix failure of Java CI on s390x +* [ARROW-10204](https://issues.apache.org/jira/browse/ARROW-10204) - [RUST] [Datafusion] Test failure in aggregate\_grouped\_empty with simd feature enabled +* [ARROW-10214](https://issues.apache.org/jira/browse/ARROW-10214) - [Python] UnicodeDecodeError when printing schema with binary metadata +* [ARROW-10226](https://issues.apache.org/jira/browse/ARROW-10226) - [Rust] [Parquet] Parquet reader reading wrong columns in some batches within a parquet file +* [ARROW-10230](https://issues.apache.org/jira/browse/ARROW-10230) - [JS][Doc] JavaScript documentation fails to build +* [ARROW-10232](https://issues.apache.org/jira/browse/ARROW-10232) - FixedSizeListArray is incorrectly written/read to/from parquet +* [ARROW-10234](https://issues.apache.org/jira/browse/ARROW-10234) - [C++][Gandiva] Fix logic of round() for floats/decimals in Gandiva +* [ARROW-10237](https://issues.apache.org/jira/browse/ARROW-10237) - [C++] Duplicate values in a dictionary result in corrupted parquet +* [ARROW-10238](https://issues.apache.org/jira/browse/ARROW-10238) - [C\#] List is broken +* [ARROW-10239](https://issues.apache.org/jira/browse/ARROW-10239) - [C++] aws-sdk-cpp apparently requires zlib too +* [ARROW-10244](https://issues.apache.org/jira/browse/ARROW-10244) - [Python][Docs] Add docs on using pyarrow.dataset.parquet\_dataset +* [ARROW-10248](https://issues.apache.org/jira/browse/ARROW-10248) - [C++][Dataset] Dataset writing does not write schema metadata +* [ARROW-10262](https://issues.apache.org/jira/browse/ARROW-10262) - [C++] Some TypeClass in Scalar classes seem incorrect +* [ARROW-10271](https://issues.apache.org/jira/browse/ARROW-10271) - [Rust] packed\_simd is broken and continued under a new project +* [ARROW-10279](https://issues.apache.org/jira/browse/ARROW-10279) - [Release][Python] Fix verification script to align with the new macos wheel platform tags +* [ARROW-10280](https://issues.apache.org/jira/browse/ARROW-10280) - [Packaging][Python] Fix macOS wheel artifact patterns +* [ARROW-10281](https://issues.apache.org/jira/browse/ARROW-10281) - [Python] Fix warnings when running tests +* [ARROW-10284](https://issues.apache.org/jira/browse/ARROW-10284) - [Python] Pyarrow is raising deprecation warning about filesystems on import +* 
[ARROW-10285](https://issues.apache.org/jira/browse/ARROW-10285) - [Python] pyarrow.orc submodule is using deprecated functionality +* [ARROW-10286](https://issues.apache.org/jira/browse/ARROW-10286) - [C++][Flight] Misleading CMake errors +* [ARROW-10288](https://issues.apache.org/jira/browse/ARROW-10288) - [C++] Compilation fails on i386 +* [ARROW-10290](https://issues.apache.org/jira/browse/ARROW-10290) - [C++] List POP\_BACK is not available in older CMake versions + + +## New Features and Improvements + +* [ARROW-983](https://issues.apache.org/jira/browse/ARROW-983) - [C++] Implement InputStream and OutputStream classes for interacting with socket connections +* [ARROW-1105](https://issues.apache.org/jira/browse/ARROW-1105) - [C++] SQLite record batch reader +* [ARROW-1509](https://issues.apache.org/jira/browse/ARROW-1509) - [Python] Write serialized object as a stream of encapsulated IPC messages +* [ARROW-1669](https://issues.apache.org/jira/browse/ARROW-1669) - [C++] Consider adding Abseil (Google C++11 standard library extensions) to toolchain +* [ARROW-1797](https://issues.apache.org/jira/browse/ARROW-1797) - [C++] Implement binary arithmetic kernels for numeric arrays +* [ARROW-2164](https://issues.apache.org/jira/browse/ARROW-2164) - [C++] Clean up unnecessary decimal module refs +* [ARROW-3080](https://issues.apache.org/jira/browse/ARROW-3080) - [Python] Unify Arrow to Python object conversion paths +* [ARROW-3757](https://issues.apache.org/jira/browse/ARROW-3757) - [R] R bindings for Flight RPC client +* [ARROW-3872](https://issues.apache.org/jira/browse/ARROW-3872) - [R] Add ad hoc test of feather compatibility +* [ARROW-4046](https://issues.apache.org/jira/browse/ARROW-4046) - [Python/CI] Exercise large memory tests +* [ARROW-4248](https://issues.apache.org/jira/browse/ARROW-4248) - [C++][Plasma] Build on Windows / Visual Studio +* [ARROW-4685](https://issues.apache.org/jira/browse/ARROW-4685) - [C++] Update Boost to 1.69 in manylinux1 docker image +* [ARROW-4927](https://issues.apache.org/jira/browse/ARROW-4927) - [Rust] Update top level README to describe current functionality +* [ARROW-4957](https://issues.apache.org/jira/browse/ARROW-4957) - [Rust] [DataFusion] Implement get\_supertype correctly +* [ARROW-4965](https://issues.apache.org/jira/browse/ARROW-4965) - [Python] Timestamp array type detection should use tzname of datetime.datetime objects +* [ARROW-5034](https://issues.apache.org/jira/browse/ARROW-5034) - [C\#] ArrowStreamWriter should expose synchronous Write methods +* [ARROW-5123](https://issues.apache.org/jira/browse/ARROW-5123) - [Rust] derive RecordWriter from struct definitions +* [ARROW-6075](https://issues.apache.org/jira/browse/ARROW-6075) - [FlightRPC] Handle uncaught exceptions in middleware +* [ARROW-6281](https://issues.apache.org/jira/browse/ARROW-6281) - [Python] Produce chunked arrays for nested types in pyarrow.array +* [ARROW-6282](https://issues.apache.org/jira/browse/ARROW-6282) - [Format] Support lossy compression +* [ARROW-6437](https://issues.apache.org/jira/browse/ARROW-6437) - [R] Add AWS SDK to system dependencies for macOS and Windows +* [ARROW-6535](https://issues.apache.org/jira/browse/ARROW-6535) - [C++] Status::WithMessage should accept variadic parameters +* [ARROW-6537](https://issues.apache.org/jira/browse/ARROW-6537) - [R] Pass column\_types to CSV reader +* [ARROW-6972](https://issues.apache.org/jira/browse/ARROW-6972) - [C\#] Should support StructField arrays +* [ARROW-6982](https://issues.apache.org/jira/browse/ARROW-6982) 
- [R] Add bindings for compare and boolean kernels +* [ARROW-7136](https://issues.apache.org/jira/browse/ARROW-7136) - [Rust][CI] Pre-install the rust dependencies in the dockerfile +* [ARROW-7218](https://issues.apache.org/jira/browse/ARROW-7218) - [Python] Conversion from boolean numpy scalars not working +* [ARROW-7302](https://issues.apache.org/jira/browse/ARROW-7302) - [C++] CSV: allow converting a column to a specific dictionary type +* [ARROW-7372](https://issues.apache.org/jira/browse/ARROW-7372) - [C++] Allow creating dictionary array from simple JSON +* [ARROW-7871](https://issues.apache.org/jira/browse/ARROW-7871) - [Python] Expose more compute kernels +* [ARROW-7960](https://issues.apache.org/jira/browse/ARROW-7960) - [C++][Parquet] Add support for schema translation from parquet nodes back to arrow for missing types +* [ARROW-8001](https://issues.apache.org/jira/browse/ARROW-8001) - [R][Dataset] Bindings for dataset writing +* [ARROW-8002](https://issues.apache.org/jira/browse/ARROW-8002) - [C++][Dataset] Dataset writing should let you (re)partition the data +* [ARROW-8048](https://issues.apache.org/jira/browse/ARROW-8048) - [Python] Run memory leak tests nightly as follow up to ARROW-4120 +* [ARROW-8172](https://issues.apache.org/jira/browse/ARROW-8172) - [C++] ArrayFromJSON for dictionary arrays +* [ARROW-8205](https://issues.apache.org/jira/browse/ARROW-8205) - [Rust] [DataFusion] DataFusion should enforce unique field names in a schema +* [ARROW-8253](https://issues.apache.org/jira/browse/ARROW-8253) - [Rust] [DataFusion] Improve ergonomics of registering UDFs +* [ARROW-8262](https://issues.apache.org/jira/browse/ARROW-8262) - [Rust] [DataFusion] Add example that uses LogicalPlanBuilder +* [ARROW-8289](https://issues.apache.org/jira/browse/ARROW-8289) - [Rust] [Parquet] Implement minimal Arrow Parquet writer as starting point for full writer +* [ARROW-8296](https://issues.apache.org/jira/browse/ARROW-8296) - [C++][Dataset] IpcFileFormat should support writing files with compressed buffers +* [ARROW-8355](https://issues.apache.org/jira/browse/ARROW-8355) - [Python] Reduce the number of pandas dependent test cases in test\_feather +* [ARROW-8359](https://issues.apache.org/jira/browse/ARROW-8359) - [C++/Python] Enable aarch64/ppc64le build in conda recipes +* [ARROW-8383](https://issues.apache.org/jira/browse/ARROW-8383) - [Rust] Easier random access to DictionaryArray keys and values +* [ARROW-8402](https://issues.apache.org/jira/browse/ARROW-8402) - [Java] Support ValidateFull methods in Java +* [ARROW-8423](https://issues.apache.org/jira/browse/ARROW-8423) - [Rust] [Parquet] Serialize arrow schema into metadata when writing parquet +* [ARROW-8426](https://issues.apache.org/jira/browse/ARROW-8426) - [Rust] [Parquet] Add support for writing dictionary types +* [ARROW-8493](https://issues.apache.org/jira/browse/ARROW-8493) - [C++] Create unified schema resolution code for Array reconstruction. 
+* [ARROW-8494](https://issues.apache.org/jira/browse/ARROW-8494) - [C++] Implement basic array-by-array reassembly logic +* [ARROW-8581](https://issues.apache.org/jira/browse/ARROW-8581) - [C\#] Date32/64Array.Builder should accept DateTime, not DateTimeOffset +* [ARROW-8601](https://issues.apache.org/jira/browse/ARROW-8601) - [Go][Flight] Implement Flight Writer interface +* [ARROW-8601](https://issues.apache.org/jira/browse/ARROW-8601) - [Go][Flight] Implement Flight Writer interface +* [ARROW-8618](https://issues.apache.org/jira/browse/ARROW-8618) - [C++] ASSIGN\_OR\_RAISE should move its argument +* [ARROW-8678](https://issues.apache.org/jira/browse/ARROW-8678) - [C++][Parquet] Remove legacy arrow to level translation. +* [ARROW-8712](https://issues.apache.org/jira/browse/ARROW-8712) - [R] Expose strptime timestamp parsing in read\_csv conversion options +* [ARROW-8774](https://issues.apache.org/jira/browse/ARROW-8774) - [Rust] [DataFusion] Improve threading model +* [ARROW-8810](https://issues.apache.org/jira/browse/ARROW-8810) - [R] Add documentation about Parquet format, appending to stream format +* [ARROW-8824](https://issues.apache.org/jira/browse/ARROW-8824) - [Rust] [DataFusion] Implement new SQL parser +* [ARROW-8828](https://issues.apache.org/jira/browse/ARROW-8828) - [Rust] Implement SQL tokenizer +* [ARROW-8829](https://issues.apache.org/jira/browse/ARROW-8829) - [Rust] Implement SQL parser +* [ARROW-9010](https://issues.apache.org/jira/browse/ARROW-9010) - [Java] Framework and interface changes for RecordBatch IPC buffer compression +* [ARROW-9065](https://issues.apache.org/jira/browse/ARROW-9065) - [C++] Support parsing date32 in dataset partition folders +* [ARROW-9068](https://issues.apache.org/jira/browse/ARROW-9068) - [C++][Dataset] Simplify Partitioning interface +* [ARROW-9078](https://issues.apache.org/jira/browse/ARROW-9078) - [C++] Parquet writing of extension type with nested storage type fails +* [ARROW-9104](https://issues.apache.org/jira/browse/ARROW-9104) - [C++] Parquet encryption tests should write files to a temporary directory instead of the testing submodule's directory +* [ARROW-9107](https://issues.apache.org/jira/browse/ARROW-9107) - [C++][Dataset] Time-based types support +* [ARROW-9147](https://issues.apache.org/jira/browse/ARROW-9147) - [C++][Dataset] Support null -\> other type promotion in Dataset scanning +* [ARROW-9205](https://issues.apache.org/jira/browse/ARROW-9205) - [Documentation] Fix typos in Columnar.rst +* [ARROW-9266](https://issues.apache.org/jira/browse/ARROW-9266) - [Python][Packaging] Enable S3 support in macOS wheels +* [ARROW-9271](https://issues.apache.org/jira/browse/ARROW-9271) - [R] Preserve data frame metadata in round trip +* [ARROW-9286](https://issues.apache.org/jira/browse/ARROW-9286) - [C++] Add function "aliases" to compute::FunctionRegistry +* [ARROW-9328](https://issues.apache.org/jira/browse/ARROW-9328) - [C++][Gandiva] Add LTRIM, RTRIM, BTRIM functions for string +* [ARROW-9338](https://issues.apache.org/jira/browse/ARROW-9338) - [Rust] Add instructions for running clippy locally +* [ARROW-9344](https://issues.apache.org/jira/browse/ARROW-9344) - [C++][Flight] measure latency quantile in flight benchmark +* [ARROW-9358](https://issues.apache.org/jira/browse/ARROW-9358) - [Integration] Reconsider generated\_large\_batch.json +* [ARROW-9371](https://issues.apache.org/jira/browse/ARROW-9371) - [Java] Run vector tests for both allocators +* [ARROW-9377](https://issues.apache.org/jira/browse/ARROW-9377) - [Java] 
+* [ARROW-9387](https://issues.apache.org/jira/browse/ARROW-9387) - [R] Use new C++ table select method
+* [ARROW-9388](https://issues.apache.org/jira/browse/ARROW-9388) - [C++] Division kernels
+* [ARROW-9394](https://issues.apache.org/jira/browse/ARROW-9394) - [Python] Support pickling of Scalars
+* [ARROW-9398](https://issues.apache.org/jira/browse/ARROW-9398) - [C++] Register the SIMD sum variants under function instance instead of a SIMD function
+* [ARROW-9402](https://issues.apache.org/jira/browse/ARROW-9402) - [C++] Add portable wrappers for \_\_builtin\_add\_overflow and friends
+* [ARROW-9405](https://issues.apache.org/jira/browse/ARROW-9405) - [R] Switch to cpp11
+* [ARROW-9412](https://issues.apache.org/jira/browse/ARROW-9412) - [C++] Add non-BUNDLED dependencies to exported INSTALL\_INTERFACE\_LIBS of arrow\_static and test that it works
+* [ARROW-9429](https://issues.apache.org/jira/browse/ARROW-9429) - [Python] ChunkedArray.to\_numpy
+* [ARROW-9454](https://issues.apache.org/jira/browse/ARROW-9454) - [GLib] Add binding of some dictionary builders
+* [ARROW-9465](https://issues.apache.org/jira/browse/ARROW-9465) - [Python] Improve ergonomics of compute functions
+* [ARROW-9469](https://issues.apache.org/jira/browse/ARROW-9469) - [Python] Make more objects weakrefable
+* [ARROW-9487](https://issues.apache.org/jira/browse/ARROW-9487) - [Developer] Cover the archery release utilities with unittests
+* [ARROW-9488](https://issues.apache.org/jira/browse/ARROW-9488) - [Release] Use the new changelog generation when updating the website
+* [ARROW-9507](https://issues.apache.org/jira/browse/ARROW-9507) - [Rust] [DataFusion] PhysicalExpr should implement Display trait
+* [ARROW-9508](https://issues.apache.org/jira/browse/ARROW-9508) - [Release][APT][Yum] Enable verification for arm64 binaries
+* [ARROW-9516](https://issues.apache.org/jira/browse/ARROW-9516) - [Rust][DataFusion] Refactor physical expressions to not care about their names nor indexes
+* [ARROW-9517](https://issues.apache.org/jira/browse/ARROW-9517) - [C++][Python] Allow session\_token argument when initializing S3FileSystem
+* [ARROW-9518](https://issues.apache.org/jira/browse/ARROW-9518) - [Python] Deprecate pyarrow serialization
+* [ARROW-9521](https://issues.apache.org/jira/browse/ARROW-9521) - [Rust] CsvReadOptions should allow file extension to be specified
+* [ARROW-9523](https://issues.apache.org/jira/browse/ARROW-9523) - [Rust] improve performance of filter kernel
+* [ARROW-9534](https://issues.apache.org/jira/browse/ARROW-9534) - [Rust] [DataFusion] Implement functions for creating literal expressions for all types
+* [ARROW-9550](https://issues.apache.org/jira/browse/ARROW-9550) - [Rust] [DataFusion] Remove Rc\> from hash aggregate operator
+* [ARROW-9553](https://issues.apache.org/jira/browse/ARROW-9553) - [Rust] Release script doesn't bump parquet crate's arrow dependency version
+* [ARROW-9557](https://issues.apache.org/jira/browse/ARROW-9557) - [R] Iterating over parquet columns is slow in R
+* [ARROW-9559](https://issues.apache.org/jira/browse/ARROW-9559) - [Rust] [DataFusion] Revert privatization of exprlist\_to\_fields
+* [ARROW-9563](https://issues.apache.org/jira/browse/ARROW-9563) - [Dev][Release] Use archery's changelog generator when creating release notes for the website
+* [ARROW-9568](https://issues.apache.org/jira/browse/ARROW-9568) - [CI] Use official msys action on GHA
+* [ARROW-9576](https://issues.apache.org/jira/browse/ARROW-9576) - [Python][Doc] Fix error
in code example for extension types +* [ARROW-9580](https://issues.apache.org/jira/browse/ARROW-9580) - [JS] Docs have superfluous () +* [ARROW-9581](https://issues.apache.org/jira/browse/ARROW-9581) - [Dev][Release] Bump next snapshot versions to 2.0.0 +* [ARROW-9582](https://issues.apache.org/jira/browse/ARROW-9582) - [Rust] Implement Array::memory\_size() +* [ARROW-9585](https://issues.apache.org/jira/browse/ARROW-9585) - [Rust] Remove duplicated to-do line in DataFusion readme +* [ARROW-9587](https://issues.apache.org/jira/browse/ARROW-9587) - [FlightRPC][Java] Clean up DoPut/FlightStream memory handling +* [ARROW-9593](https://issues.apache.org/jira/browse/ARROW-9593) - [Python] Add custom pickle reducers for DictionaryScalar +* [ARROW-9604](https://issues.apache.org/jira/browse/ARROW-9604) - [C++] Add benchmark for aggregate min/max compute kernels +* [ARROW-9605](https://issues.apache.org/jira/browse/ARROW-9605) - [C++] Optimize performance for aggregate min/max compute kernels +* [ARROW-9607](https://issues.apache.org/jira/browse/ARROW-9607) - [C++][Gandiva] Add bitwise\_and(), bitwise\_or() and bitwise\_not() functions for integers +* [ARROW-9608](https://issues.apache.org/jira/browse/ARROW-9608) - [Rust] Remove arrow flight from parquet's feature gating +* [ARROW-9615](https://issues.apache.org/jira/browse/ARROW-9615) - [Rust] Add kernel to compute length of string array +* [ARROW-9617](https://issues.apache.org/jira/browse/ARROW-9617) - [Rust] [DataFusion] Add length of string array +* [ARROW-9618](https://issues.apache.org/jira/browse/ARROW-9618) - [Rust] [DataFusion] Make it easier to write optimizers +* [ARROW-9619](https://issues.apache.org/jira/browse/ARROW-9619) - [Rust] [DataFusion] Add predicate push-down +* [ARROW-9632](https://issues.apache.org/jira/browse/ARROW-9632) - [Rust] Add a "new" method for ExecutionContextSchemaProvider +* [ARROW-9638](https://issues.apache.org/jira/browse/ARROW-9638) - [C++][Compute] Implement mode(most frequent number) kernel +* [ARROW-9639](https://issues.apache.org/jira/browse/ARROW-9639) - [Ruby] Add dependency version check +* [ARROW-9640](https://issues.apache.org/jira/browse/ARROW-9640) - [C++][Gandiva] Implement round() for integers and long integers +* [ARROW-9641](https://issues.apache.org/jira/browse/ARROW-9641) - [C++][Gandiva] Implement round() for floating point and double floating point numbers +* [ARROW-9645](https://issues.apache.org/jira/browse/ARROW-9645) - [Python] Deprecate the legacy pyarrow.filesystem interface +* [ARROW-9646](https://issues.apache.org/jira/browse/ARROW-9646) - [C++][Dataset] Add support for writing parquet datasets +* [ARROW-9650](https://issues.apache.org/jira/browse/ARROW-9650) - [Packaging][APT] Drop support for Ubuntu 19.10 +* [ARROW-9654](https://issues.apache.org/jira/browse/ARROW-9654) - [Rust][DataFusion] Add an EXPLAIN command to the datafusion CLI +* [ARROW-9656](https://issues.apache.org/jira/browse/ARROW-9656) - [Rust][DataFusion] Slightly confusing error message when unsupported type is provided to CREATE EXTERNAL TABLE +* [ARROW-9658](https://issues.apache.org/jira/browse/ARROW-9658) - [Python][Dataset] Bindings for dataset writing +* [ARROW-9665](https://issues.apache.org/jira/browse/ARROW-9665) - [R] head/tail/take for Datasets +* [ARROW-9667](https://issues.apache.org/jira/browse/ARROW-9667) - [CI][Crossbow] Segfault in 2 nightly R builds +* [ARROW-9671](https://issues.apache.org/jira/browse/ARROW-9671) - [C++] BasicDecimal128 constructor interprets uint64\_t integers with highest bit 
set as negative +* [ARROW-9673](https://issues.apache.org/jira/browse/ARROW-9673) - [Rust] Add a param "dialect" for DFParser::parse\_sql +* [ARROW-9678](https://issues.apache.org/jira/browse/ARROW-9678) - [Rust] [DataFusion] Improve projection push down to remove unused columns +* [ARROW-9679](https://issues.apache.org/jira/browse/ARROW-9679) - [Rust] [DataFusion] HashAggregate walks map many times building final batch +* [ARROW-9681](https://issues.apache.org/jira/browse/ARROW-9681) - [Java] Failed Arrow Memory - Core on big-endian platform +* [ARROW-9683](https://issues.apache.org/jira/browse/ARROW-9683) - [Rust][DataFusion] Implement Debug for ExecutionPlan trait +* [ARROW-9691](https://issues.apache.org/jira/browse/ARROW-9691) - [Rust] [DataFusion] Make sql\_statement\_to\_plan public +* [ARROW-9695](https://issues.apache.org/jira/browse/ARROW-9695) - [Rust][DataFusion] Improve documentation on LogicalPlan variants +* [ARROW-9699](https://issues.apache.org/jira/browse/ARROW-9699) - [C++][Compute] Improve mode kernel performance for small integer types +* [ARROW-9701](https://issues.apache.org/jira/browse/ARROW-9701) - [Java][CI] Add a test job on s390x +* [ARROW-9702](https://issues.apache.org/jira/browse/ARROW-9702) - [C++] Move bpacking simd to runtime path +* [ARROW-9703](https://issues.apache.org/jira/browse/ARROW-9703) - [Developer][Archery] Restartable cherry-picking process for creating maintenance branches +* [ARROW-9706](https://issues.apache.org/jira/browse/ARROW-9706) - [Java] Tests in TestLargeListVector fails on big endian platform +* [ARROW-9710](https://issues.apache.org/jira/browse/ARROW-9710) - [C++] Generalize Decimal ToString in preparation for Decimal256 +* [ARROW-9711](https://issues.apache.org/jira/browse/ARROW-9711) - [Rust] Add benchmark based on TPC-H +* [ARROW-9713](https://issues.apache.org/jira/browse/ARROW-9713) - [Rust][DataFusion] Remove explicit panics +* [ARROW-9715](https://issues.apache.org/jira/browse/ARROW-9715) - [R] changelog/doc updates for 1.0.1 +* [ARROW-9718](https://issues.apache.org/jira/browse/ARROW-9718) - [Python] Make pyarrow.parquet work with the new filesystem interfaces +* [ARROW-9721](https://issues.apache.org/jira/browse/ARROW-9721) - [Packaging][Python] Update wheel dependency files +* [ARROW-9722](https://issues.apache.org/jira/browse/ARROW-9722) - [Rust]: Shorten key lifetime for reverse lookup for dictionary arrays +* [ARROW-9723](https://issues.apache.org/jira/browse/ARROW-9723) - [C++] Expected behaviour of "mode" kernel with NaNs ? 
+* [ARROW-9725](https://issues.apache.org/jira/browse/ARROW-9725) - [Rust] [DataFusion] LimitExec and SortExec should use MergeExec +* [ARROW-9737](https://issues.apache.org/jira/browse/ARROW-9737) - [C++][Gandiva] Add bitwise\_xor() for integers +* [ARROW-9739](https://issues.apache.org/jira/browse/ARROW-9739) - [CI][Ruby] Don't install gem documents +* [ARROW-9742](https://issues.apache.org/jira/browse/ARROW-9742) - [Rust] Create one standard DataFrame API +* [ARROW-9751](https://issues.apache.org/jira/browse/ARROW-9751) - [Rust] [DataFusion] Extend UDFs to accept more than one type per argument +* [ARROW-9752](https://issues.apache.org/jira/browse/ARROW-9752) - [Rust] [DataFusion] Add support for Aggregate UDFs +* [ARROW-9753](https://issues.apache.org/jira/browse/ARROW-9753) - [Rust] [DataFusion] Remove the use of Mutex in ExecutionPlan trait +* [ARROW-9754](https://issues.apache.org/jira/browse/ARROW-9754) - [Rust] [DataFusion] Implement async in DataFusion traits +* [ARROW-9757](https://issues.apache.org/jira/browse/ARROW-9757) - [Rust] [DataFusion] Use "pub use" to expose a clean public API +* [ARROW-9758](https://issues.apache.org/jira/browse/ARROW-9758) - [Rust] [DataFusion] Implement extension API for DataFusion +* [ARROW-9759](https://issues.apache.org/jira/browse/ARROW-9759) - [Rust] [DataFusion] Implement DataFrame::sort +* [ARROW-9760](https://issues.apache.org/jira/browse/ARROW-9760) - [Rust] [DataFusion] Implement DataFrame::explain +* [ARROW-9761](https://issues.apache.org/jira/browse/ARROW-9761) - [C++] Add experimental pull-based iterator structures to C interface implementation +* [ARROW-9762](https://issues.apache.org/jira/browse/ARROW-9762) - [Rust] [DataFusion] ExecutionContext::sql should return DataFrame +* [ARROW-9769](https://issues.apache.org/jira/browse/ARROW-9769) - [Python] Remove skip for in-memory fsspec in test\_move\_file +* [ARROW-9775](https://issues.apache.org/jira/browse/ARROW-9775) - [C++] Automatic S3 region selection +* [ARROW-9781](https://issues.apache.org/jira/browse/ARROW-9781) - [C++] Fix uninitialized value warnings +* [ARROW-9782](https://issues.apache.org/jira/browse/ARROW-9782) - [C++][Dataset] Ability to write ".feather" files with IpcFileFormat +* [ARROW-9784](https://issues.apache.org/jira/browse/ARROW-9784) - [Rust] [DataFusion] Improve instructions for running tpch benchmark +* [ARROW-9786](https://issues.apache.org/jira/browse/ARROW-9786) - [R] Unvendor cpp11 before release +* [ARROW-9788](https://issues.apache.org/jira/browse/ARROW-9788) - Handle naming inconsistencies between SQL, DataFrame API and struct names +* [ARROW-9792](https://issues.apache.org/jira/browse/ARROW-9792) - [Rust] [DataFusion] Logical aggregate functions should not return Result +* [ARROW-9794](https://issues.apache.org/jira/browse/ARROW-9794) - [C++] Add functionality to cpu\_info to discriminate between Intel vs AMD x86 +* [ARROW-9795](https://issues.apache.org/jira/browse/ARROW-9795) - [C++][Gandiva] Implement castTIMESTAMP(int64) in Gandiva +* [ARROW-9806](https://issues.apache.org/jira/browse/ARROW-9806) - [R] More compute kernel bindings +* [ARROW-9807](https://issues.apache.org/jira/browse/ARROW-9807) - [R] News update/version bump post-1.0.1 +* [ARROW-9808](https://issues.apache.org/jira/browse/ARROW-9808) - [Python] parquet.read\_table docstring wrong use\_legacy\_dataset explanation +* [ARROW-9811](https://issues.apache.org/jira/browse/ARROW-9811) - [C++] Unchecked floating point division by 0 should succeed +* 
[ARROW-9813](https://issues.apache.org/jira/browse/ARROW-9813) - [C++] Disable semantic interposition
+* [ARROW-9819](https://issues.apache.org/jira/browse/ARROW-9819) - [C++] Bump mimalloc to 1.6.4
+* [ARROW-9821](https://issues.apache.org/jira/browse/ARROW-9821) - [Rust][DataFusion] User Defined PlanNode / Operator API
+* [ARROW-9823](https://issues.apache.org/jira/browse/ARROW-9823) - [CI][C++][MinGW] Enable S3
+* [ARROW-9832](https://issues.apache.org/jira/browse/ARROW-9832) - [Rust] [DataFusion] Refactor PhysicalPlan to remove Partition
+* [ARROW-9833](https://issues.apache.org/jira/browse/ARROW-9833) - [Rust] [DataFusion] Refactor TableProvider.scan to return ExecutionPlan
+* [ARROW-9834](https://issues.apache.org/jira/browse/ARROW-9834) - [Rust] [DataFusion] Remove Partition trait
+* [ARROW-9835](https://issues.apache.org/jira/browse/ARROW-9835) - [Rust] [DataFusion] Remove FunctionMeta
+* [ARROW-9836](https://issues.apache.org/jira/browse/ARROW-9836) - [Rust] [DataFusion] Improve API for usage of UDFs
+* [ARROW-9837](https://issues.apache.org/jira/browse/ARROW-9837) - [Rust] Add provider for variable
+* [ARROW-9838](https://issues.apache.org/jira/browse/ARROW-9838) - [Rust] [DataFusion] DefaultPhysicalPlanner should insert explicit MergeExec nodes
+* [ARROW-9839](https://issues.apache.org/jira/browse/ARROW-9839) - [Rust] [DataFusion] Add ability to downcast ExecutionPlan to specific operator
+* [ARROW-9841](https://issues.apache.org/jira/browse/ARROW-9841) - [Rust] Update checked-in flatbuffer files
+* [ARROW-9844](https://issues.apache.org/jira/browse/ARROW-9844) - [Go][CI] Add Travis CI job for Go on s390x
+* [ARROW-9845](https://issues.apache.org/jira/browse/ARROW-9845) - [Rust] [Parquet] serde\_json is only used in tests but isn't in dev-dependencies
+* [ARROW-9848](https://issues.apache.org/jira/browse/ARROW-9848) - [Rust] Implement changes to ensure flatbuffer alignment
+* [ARROW-9849](https://issues.apache.org/jira/browse/ARROW-9849) - [Rust] [DataFusion] Make UDFs not need a Field
+* [ARROW-9850](https://issues.apache.org/jira/browse/ARROW-9850) - [Go] Defer should not be used in the loop
+* [ARROW-9853](https://issues.apache.org/jira/browse/ARROW-9853) - [RUST] Implement "take" kernel for dictionary arrays
+* [ARROW-9854](https://issues.apache.org/jira/browse/ARROW-9854) - [R] Support reading/writing data to/from S3
+* [ARROW-9858](https://issues.apache.org/jira/browse/ARROW-9858) - [C++][Python][Docs] Expand user guide for FileSystem
+* [ARROW-9863](https://issues.apache.org/jira/browse/ARROW-9863) - [C++] [PARQUET] Optimize meta data recovery of ApplicationVersion
+* [ARROW-9867](https://issues.apache.org/jira/browse/ARROW-9867) - [C++][Dataset] FileSystemDataset should expose its filesystem
+* [ARROW-9868](https://issues.apache.org/jira/browse/ARROW-9868) - [C++] Provide utility for copying files between filesystems
+* [ARROW-9869](https://issues.apache.org/jira/browse/ARROW-9869) - [R] Implement full S3FileSystem/S3Options constructor
+* [ARROW-9870](https://issues.apache.org/jira/browse/ARROW-9870) - [R] Friendly interface for filesystems (S3)
+* [ARROW-9871](https://issues.apache.org/jira/browse/ARROW-9871) - [C++] Add uppercase support to ARROW\_USER\_SIMD\_LEVEL.
+* [ARROW-9873](https://issues.apache.org/jira/browse/ARROW-9873) - [C++][Compute] Improve mode kernel for integers within limited value range
+* [ARROW-9875](https://issues.apache.org/jira/browse/ARROW-9875) - [Python] Let FileSystem.get\_file\_info accept a single path
+* [ARROW-9884](https://issues.apache.org/jira/browse/ARROW-9884) - [R] Bindings for writing datasets to Parquet
+* [ARROW-9885](https://issues.apache.org/jira/browse/ARROW-9885) - [Rust] [DataFusion] Simplify code of type coercion for binary types
+* [ARROW-9886](https://issues.apache.org/jira/browse/ARROW-9886) - [Rust] [DataFusion] Simplify code to test cast
+* [ARROW-9887](https://issues.apache.org/jira/browse/ARROW-9887) - [Rust] [DataFusion] Add support for complex return types of built-in functions
+* [ARROW-9890](https://issues.apache.org/jira/browse/ARROW-9890) - [R] Add zstandard compression codec in macOS build
+* [ARROW-9891](https://issues.apache.org/jira/browse/ARROW-9891) - [Rust] [DataFusion] Make math functions support f32
+* [ARROW-9892](https://issues.apache.org/jira/browse/ARROW-9892) - [Rust] [DataFusion] Add support for concat
+* [ARROW-9893](https://issues.apache.org/jira/browse/ARROW-9893) - [Python] Bindings for writing datasets to Parquet
+* [ARROW-9895](https://issues.apache.org/jira/browse/ARROW-9895) - [RUST] Improve sort kernels
+* [ARROW-9899](https://issues.apache.org/jira/browse/ARROW-9899) - [Rust] [DataFusion] Switch from Box --\> SchemaRef (Arc) to be consistent with the rest of Arrow
+* [ARROW-9900](https://issues.apache.org/jira/browse/ARROW-9900) - [Rust][DataFusion] Use Arc<\> instead of Box<\> in LogicalPlan
+* [ARROW-9901](https://issues.apache.org/jira/browse/ARROW-9901) - [C++] Add hand-crafted Parquet to Arrow reconstruction test for nested reading
+* [ARROW-9902](https://issues.apache.org/jira/browse/ARROW-9902) - [Rust] [DataFusion] Add support for array()
+* [ARROW-9904](https://issues.apache.org/jira/browse/ARROW-9904) - [C++] Unroll the loop manually for CountSetBits
+* [ARROW-9908](https://issues.apache.org/jira/browse/ARROW-9908) - [Rust] Support temporal data types in JSON reader
+* [ARROW-9910](https://issues.apache.org/jira/browse/ARROW-9910) - [Rust] [DataFusion] Type coercion of Variadic is wrong
+* [ARROW-9914](https://issues.apache.org/jira/browse/ARROW-9914) - [Rust][DataFusion] Document the SQL -\> Arrow type mapping
+* [ARROW-9916](https://issues.apache.org/jira/browse/ARROW-9916) - [RUST] Avoid cloning ArrayData in several places
+* [ARROW-9917](https://issues.apache.org/jira/browse/ARROW-9917) - [Python][Compute] Add bindings for mode kernel
+* [ARROW-9919](https://issues.apache.org/jira/browse/ARROW-9919) - [Rust] [DataFusion] Math functions
+* [ARROW-9921](https://issues.apache.org/jira/browse/ARROW-9921) - [Rust] Add \`from(Vec\>)\` to [Large]StringArray
+* [ARROW-9925](https://issues.apache.org/jira/browse/ARROW-9925) - [GLib] Add low level value readers for GArrowListArray family
+* [ARROW-9926](https://issues.apache.org/jira/browse/ARROW-9926) - [GLib] Use placement new for GArrowRecordBatchFileReader
+* [ARROW-9928](https://issues.apache.org/jira/browse/ARROW-9928) - [C++] Speed up integer parsing slightly
+* [ARROW-9929](https://issues.apache.org/jira/browse/ARROW-9929) - [Developer] Autotune cmake-format
+* [ARROW-9933](https://issues.apache.org/jira/browse/ARROW-9933) - [Developer] Add drone as a CI provider for crossbow
+* [ARROW-9934](https://issues.apache.org/jira/browse/ARROW-9934) - [Rust] Shape and stride check in tensor
+* 
[ARROW-9941](https://issues.apache.org/jira/browse/ARROW-9941) - [Python] Better string representation for extension types +* [ARROW-9944](https://issues.apache.org/jira/browse/ARROW-9944) - [Rust] Implement TO\_TIMESTAMP function +* [ARROW-9949](https://issues.apache.org/jira/browse/ARROW-9949) - [C++] Generalize Decimal128::FromString for reuse in Decimal256 +* [ARROW-9950](https://issues.apache.org/jira/browse/ARROW-9950) - [Rust] [DataFusion] Allow UDF usage without registry +* [ARROW-9952](https://issues.apache.org/jira/browse/ARROW-9952) - [Python] Use pyarrow.dataset writing for pq.write\_to\_dataset +* [ARROW-9954](https://issues.apache.org/jira/browse/ARROW-9954) - [Rust] [DataFusion] Simplify code of aggregate planning +* [ARROW-9956](https://issues.apache.org/jira/browse/ARROW-9956) - [C++][Gandiva] Implement Binary string function in Gandiva +* [ARROW-9957](https://issues.apache.org/jira/browse/ARROW-9957) - [Rust] Remove unmaintained tempdir dependency +* [ARROW-9961](https://issues.apache.org/jira/browse/ARROW-9961) - [Rust][DataFusion] to\_timestamp function parses timestamp without timezone offset as UTC rather than local +* [ARROW-9964](https://issues.apache.org/jira/browse/ARROW-9964) - [C++] CSV date support +* [ARROW-9965](https://issues.apache.org/jira/browse/ARROW-9965) - [Java] Buffer capacity calculations are slow for fixed-width vectors +* [ARROW-9966](https://issues.apache.org/jira/browse/ARROW-9966) - [Rust] Speedup aggregate kernels +* [ARROW-9967](https://issues.apache.org/jira/browse/ARROW-9967) - [Python] Add compute module docs +* [ARROW-9971](https://issues.apache.org/jira/browse/ARROW-9971) - [Rust] Speedup take +* [ARROW-9977](https://issues.apache.org/jira/browse/ARROW-9977) - [Rust] Add min/max for [Large]String +* [ARROW-9979](https://issues.apache.org/jira/browse/ARROW-9979) - [Rust] Fix arrow crate clippy lints +* [ARROW-9980](https://issues.apache.org/jira/browse/ARROW-9980) - [Rust] Fix parquet crate clippy lints +* [ARROW-9981](https://issues.apache.org/jira/browse/ARROW-9981) - [Rust] Allow configuring flight IPC with IpcWriteOptions +* [ARROW-9983](https://issues.apache.org/jira/browse/ARROW-9983) - [C++][Dataset][Python] Use larger default batch size than 32K for Datasets API +* [ARROW-9984](https://issues.apache.org/jira/browse/ARROW-9984) - [Rust] [DataFusion] DRY of function to string +* [ARROW-9986](https://issues.apache.org/jira/browse/ARROW-9986) - [Rust][DataFusion] TO\_TIMESTAMP function erroneously requires fractional seconds when no timezone is present +* [ARROW-9987](https://issues.apache.org/jira/browse/ARROW-9987) - [Rust] [DataFusion] Improve docs of \`Expr\`. 
+* [ARROW-9988](https://issues.apache.org/jira/browse/ARROW-9988) - [Rust] [DataFusion] Added std::ops to logical expressions
+* [ARROW-9992](https://issues.apache.org/jira/browse/ARROW-9992) - [C++][Python] Refactor python to arrow conversions based on a reusable conversion API
+* [ARROW-9998](https://issues.apache.org/jira/browse/ARROW-9998) - [Python] Support pickling DictionaryScalar
+* [ARROW-9999](https://issues.apache.org/jira/browse/ARROW-9999) - [Python] Support constructing dictionary array directly through pa.array()
+* [ARROW-10000](https://issues.apache.org/jira/browse/ARROW-10000) - [C++][Python] Support constructing StructArray from list of key-value pairs
+* [ARROW-10001](https://issues.apache.org/jira/browse/ARROW-10001) - [Rust] [DataFusion] Add developer guide to README
+* [ARROW-10010](https://issues.apache.org/jira/browse/ARROW-10010) - [Rust] Speedup arithmetic
+* [ARROW-10015](https://issues.apache.org/jira/browse/ARROW-10015) - [Rust] Implement SIMD for aggregate kernel sum
+* [ARROW-10016](https://issues.apache.org/jira/browse/ARROW-10016) - [Rust] [DataFusion] Implement IsNull and IsNotNull
+* [ARROW-10018](https://issues.apache.org/jira/browse/ARROW-10018) - [CI] Disable Sphinx and API documentation build since it takes 6 hours on master
+* [ARROW-10019](https://issues.apache.org/jira/browse/ARROW-10019) - [Rust] Add substring kernel
+* [ARROW-10023](https://issues.apache.org/jira/browse/ARROW-10023) - [Gandiva][C++] Implementing Split part function in gandiva
+* [ARROW-10024](https://issues.apache.org/jira/browse/ARROW-10024) - [C++][Parquet] Create nested reading benchmarks
+* [ARROW-10028](https://issues.apache.org/jira/browse/ARROW-10028) - [Rust] Simplify macro def\_numeric\_from\_vec
+* [ARROW-10030](https://issues.apache.org/jira/browse/ARROW-10030) - [Rust] Support fromIter and toIter
+* [ARROW-10035](https://issues.apache.org/jira/browse/ARROW-10035) - [C++] Bump versions of vendored code
+* [ARROW-10037](https://issues.apache.org/jira/browse/ARROW-10037) - [C++] Workaround to force find AWS SDK to look for shared libraries
+* [ARROW-10040](https://issues.apache.org/jira/browse/ARROW-10040) - [Rust] Create a way to slice unaligned offset buffers
+* [ARROW-10043](https://issues.apache.org/jira/browse/ARROW-10043) - [Rust] [DataFusion] Introduce support for DISTINCT by partially implementing COUNT(DISTINCT)
+* [ARROW-10044](https://issues.apache.org/jira/browse/ARROW-10044) - [Rust] Improve README
+* [ARROW-10046](https://issues.apache.org/jira/browse/ARROW-10046) - [Rust] [DataFusion] Made \`\*Iterator\` implement Iterator
+* [ARROW-10050](https://issues.apache.org/jira/browse/ARROW-10050) - [C++][Gandiva] Implement concat() in Gandiva for up to 10 arguments
+* [ARROW-10051](https://issues.apache.org/jira/browse/ARROW-10051) - [C++][Compute] Make aggregate kernel merge state mutable
+* [ARROW-10054](https://issues.apache.org/jira/browse/ARROW-10054) - [Python] Slice methods should return empty arrays instead of crashing
+* [ARROW-10055](https://issues.apache.org/jira/browse/ARROW-10055) - [Rust] Implement DoubleEndedIterator for NullableIter
+* [ARROW-10057](https://issues.apache.org/jira/browse/ARROW-10057) - [C++] Add Parquet-Arrow roundtrip tests for nested data
+* [ARROW-10058](https://issues.apache.org/jira/browse/ARROW-10058) - [C++] Investigate performance of LevelsToBitmap without BMI2
+* [ARROW-10059](https://issues.apache.org/jira/browse/ARROW-10059) - [R][Doc] Give more advice on how to set up C++ build
+* [ARROW-10063](https://issues.apache.org/jira/browse/ARROW-10063) - [Archery][CI] Fetch main branch in archery build only when it is a pull request
+* [ARROW-10064](https://issues.apache.org/jira/browse/ARROW-10064) - [C++] Resolve compile warnings on Apple Clang 12
+* [ARROW-10065](https://issues.apache.org/jira/browse/ARROW-10065) - [Rust] DRY downcasted Arrays
+* [ARROW-10066](https://issues.apache.org/jira/browse/ARROW-10066) - [C++] Make sure that default AWS region is respected
+* [ARROW-10068](https://issues.apache.org/jira/browse/ARROW-10068) - [C++] Add bundled external project for aws-sdk-cpp
+* [ARROW-10069](https://issues.apache.org/jira/browse/ARROW-10069) - [Java] Support running Java benchmarks from command line
+* [ARROW-10070](https://issues.apache.org/jira/browse/ARROW-10070) - [C++][Compute] Implement stdev aggregate kernel
+* [ARROW-10071](https://issues.apache.org/jira/browse/ARROW-10071) - [R] segfault with ArrowObject from previous session, or saved
+* [ARROW-10074](https://issues.apache.org/jira/browse/ARROW-10074) - [C++] Don't use string\_view.to\_string()
+* [ARROW-10075](https://issues.apache.org/jira/browse/ARROW-10075) - [C++] Don't use nonstd::nullopt, this breaks our vendoring abstraction.
+* [ARROW-10076](https://issues.apache.org/jira/browse/ARROW-10076) - [C++] Use TemporaryDir for all tests that don't already use it.
+* [ARROW-10077](https://issues.apache.org/jira/browse/ARROW-10077) - [C++] Potential overflow in bit\_stream\_utils.h multiplication.
+* [ARROW-10083](https://issues.apache.org/jira/browse/ARROW-10083) - [C++] Improve Parquet fuzz seed corpus
+* [ARROW-10084](https://issues.apache.org/jira/browse/ARROW-10084) - [Rust] [DataFusion] Add length of large string array
+* [ARROW-10086](https://issues.apache.org/jira/browse/ARROW-10086) - [Rust] Migrate min\_large\_string -\> min\_string kernels
+* [ARROW-10090](https://issues.apache.org/jira/browse/ARROW-10090) - [C++][Compute] Improve mode kernel
+* [ARROW-10092](https://issues.apache.org/jira/browse/ARROW-10092) - [Dev][Go] Add grpc generated go files to rat exclusion list
+* [ARROW-10093](https://issues.apache.org/jira/browse/ARROW-10093) - [R] Add ability to opt-out of int64 -\> int demotion
+* [ARROW-10095](https://issues.apache.org/jira/browse/ARROW-10095) - [Rust] [Parquet] Update for IPC changes
+* [ARROW-10096](https://issues.apache.org/jira/browse/ARROW-10096) - [Rust] [DataFusion] Remove unused code
+* [ARROW-10099](https://issues.apache.org/jira/browse/ARROW-10099) - [C++][Dataset] Also allow integer partition fields to be dictionary encoded
+* [ARROW-10100](https://issues.apache.org/jira/browse/ARROW-10100) - [C++][Dataset] Ability to read/subset a ParquetFileFragment with given set of row group ids
+* [ARROW-10102](https://issues.apache.org/jira/browse/ARROW-10102) - [C++] Generalize BasicDecimal128::operator\*= for reuse in Decimal256
+* [ARROW-10103](https://issues.apache.org/jira/browse/ARROW-10103) - [Rust] Add a Contains kernel
+* [ARROW-10105](https://issues.apache.org/jira/browse/ARROW-10105) - [FlightRPC] Add client option to disable certificate validation with TLS
+* [ARROW-10120](https://issues.apache.org/jira/browse/ARROW-10120) - [C++][Parquet] Create reading benchmarks for 2-level nested data
+* [ARROW-10127](https://issues.apache.org/jira/browse/ARROW-10127) - [Format] Update specification to support 256-bit Decimal types
+* [ARROW-10129](https://issues.apache.org/jira/browse/ARROW-10129) - [Rust] Cargo build is rebuilding dependencies on arrow changes
+* 
[ARROW-10134](https://issues.apache.org/jira/browse/ARROW-10134) - [C++][Dataset] Add ParquetFileFragment::num\_row\_groups property +* [ARROW-10139](https://issues.apache.org/jira/browse/ARROW-10139) - [C++] Add support for building arrow\_testing without building tests +* [ARROW-10148](https://issues.apache.org/jira/browse/ARROW-10148) - [Rust] Add documentation to lib.rs +* [ARROW-10151](https://issues.apache.org/jira/browse/ARROW-10151) - [Python] Add support MapArray to\_pandas conversion +* [ARROW-10155](https://issues.apache.org/jira/browse/ARROW-10155) - [Rust] [DataFusion] Add documentation to lib.rs +* [ARROW-10156](https://issues.apache.org/jira/browse/ARROW-10156) - [Rust] Auto-label PRs +* [ARROW-10157](https://issues.apache.org/jira/browse/ARROW-10157) - [Rust] Add more documentation about take +* [ARROW-10160](https://issues.apache.org/jira/browse/ARROW-10160) - [Rust] Improve documentation of DictionaryType +* [ARROW-10161](https://issues.apache.org/jira/browse/ARROW-10161) - [Rust] [DataFusion] Simplify expression tests +* [ARROW-10162](https://issues.apache.org/jira/browse/ARROW-10162) - [Rust] Support display of DictionaryArrays in pretty printing +* [ARROW-10164](https://issues.apache.org/jira/browse/ARROW-10164) - [Rust] Add support for DictionaryArray types to cast kernels +* [ARROW-10167](https://issues.apache.org/jira/browse/ARROW-10167) - [Rust] Support display of DictionaryArrays in sql.rs +* [ARROW-10168](https://issues.apache.org/jira/browse/ARROW-10168) - [Rust] [Parquet] Extend arrow schema conversion to projected fields +* [ARROW-10171](https://issues.apache.org/jira/browse/ARROW-10171) - [Rust] [DataFusion] Add \`ExecutionContext::from\` +* [ARROW-10190](https://issues.apache.org/jira/browse/ARROW-10190) - [Website] Add Jorge to list of committers +* [ARROW-10191](https://issues.apache.org/jira/browse/ARROW-10191) - [Rust] [Parquet] Add roundtrip tests for single column batches +* [ARROW-10196](https://issues.apache.org/jira/browse/ARROW-10196) - [C++] Add Future::DeferNotOk() +* [ARROW-10199](https://issues.apache.org/jira/browse/ARROW-10199) - [Rust][Parquet] Release Parquet at crates.io to remove debug prints +* [ARROW-10201](https://issues.apache.org/jira/browse/ARROW-10201) - [C++][CI] Disable S3 in arm64 job on Travis CI +* [ARROW-10202](https://issues.apache.org/jira/browse/ARROW-10202) - [CI][Windows] Use sf.net mirror for MSYS2 +* [ARROW-10205](https://issues.apache.org/jira/browse/ARROW-10205) - [Java][FlightRPC] Add client option to disable server verification +* [ARROW-10206](https://issues.apache.org/jira/browse/ARROW-10206) - [Python][C++][FlightRPC] Add client option to disable server validation +* [ARROW-10215](https://issues.apache.org/jira/browse/ARROW-10215) - [Rust] [DataFusion] Rename "Source" typedef +* [ARROW-10217](https://issues.apache.org/jira/browse/ARROW-10217) - [CI] Run fewer GitHub Actions jobs +* [ARROW-10225](https://issues.apache.org/jira/browse/ARROW-10225) - [Rust] [Parquet] Fix null bitmap comparisons in roundtrip tests +* [ARROW-10227](https://issues.apache.org/jira/browse/ARROW-10227) - [Ruby] Use a table size as the default for parquet chunk\_size +* [ARROW-10229](https://issues.apache.org/jira/browse/ARROW-10229) - [C++][Parquet] Remove left over ARROW\_LOG statement. 
+* [ARROW-10231](https://issues.apache.org/jira/browse/ARROW-10231) - [CI] Unable to download minio in arm32v7 docker image
+* [ARROW-10233](https://issues.apache.org/jira/browse/ARROW-10233) - [Rust] Make array\_value\_to\_string available in all Arrow builds
+* [ARROW-10235](https://issues.apache.org/jira/browse/ARROW-10235) - [Rust][DataFusion] Improve documentation for type coercion
+* [ARROW-10240](https://issues.apache.org/jira/browse/ARROW-10240) - [Rust] [Datafusion] Optionally load tpch data into memory before running benchmark query
+* [ARROW-10251](https://issues.apache.org/jira/browse/ARROW-10251) - [Rust] [DataFusion] MemTable::load() should load partitions in parallel
+* [ARROW-10252](https://issues.apache.org/jira/browse/ARROW-10252) - [Python] Add option to skip inclusion of Arrow headers in Python installation
+* [ARROW-10256](https://issues.apache.org/jira/browse/ARROW-10256) - [C++][Flight] Disable -Werror carefully
+* [ARROW-10257](https://issues.apache.org/jira/browse/ARROW-10257) - [R] Prepare news/docs for 2.0 release
+* [ARROW-10260](https://issues.apache.org/jira/browse/ARROW-10260) - [Python] Missing MapType to Pandas dtype
+* [ARROW-10265](https://issues.apache.org/jira/browse/ARROW-10265) - [CI] Use smaller build when cache doesn't exist on Travis CI
+* [ARROW-10266](https://issues.apache.org/jira/browse/ARROW-10266) - [CI][macOS] Ensure using Python 3.8 with Homebrew
+* [ARROW-10267](https://issues.apache.org/jira/browse/ARROW-10267) - [Python] Skip flight test if disable\_server\_verification feature is not available
+* [ARROW-10272](https://issues.apache.org/jira/browse/ARROW-10272) - [Packaging][Python] Pin newer multibuild version to avoid updating homebrew
+* [ARROW-10273](https://issues.apache.org/jira/browse/ARROW-10273) - [CI][Homebrew] Fix "brew audit" usage
+* [ARROW-10287](https://issues.apache.org/jira/browse/ARROW-10287) - [C++] Avoid std::random\_device whenever possible
+* [PARQUET-1845](https://issues.apache.org/jira/browse/PARQUET-1845) - [C++] Int96 memory images in test cases assume only little-endian
+* [PARQUET-1878](https://issues.apache.org/jira/browse/PARQUET-1878) - [C++] lz4 codec is not compatible with Hadoop Lz4Codec
+* [PARQUET-1904](https://issues.apache.org/jira/browse/PARQUET-1904) - [C++] Export file\_offset in RowGroupMetaData
+
+
+
 # Apache Arrow 1.0.0 (2020-07-20)

 ## Bug Fixes
diff --git a/LICENSE.txt b/LICENSE.txt
index e79841d212d..a37ca36c3f0 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -849,9 +849,9 @@ THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 --------------------------------------------------------------------------------

-The files in cpp/src/arrow/vendored/utf8cpp/ have the following license
+The files in cpp/src/arrow/vendored/utfcpp/ have the following license

-Copyright 2006 Nemanja Trifunovic
+Copyright 2006-2018 Nemanja Trifunovic

 Permission is hereby granted, free of charge, to any person or organization
 obtaining a copy of the software and accompanying documentation covered by
@@ -2223,3 +2223,11 @@ exception of some code pulled in from other repositories (such as
 public domain, released using the CC0 1.0 Universal dedication (*).

 (*) https://creativecommons.org/publicdomain/zero/1.0/legalcode
+
+--------------------------------------------------------------------------------
+
+The files in cpp/src/arrow/vendored/fast_float/ contain code from
+
+https://github.com/lemire/fast_float
+
+which is made available under the Apache License 2.0.
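For context on the newly vendored library above, here is a minimal standalone
sketch of the upstream fast_float from_chars API as documented by the upstream
project (this reflects the upstream interface, not Arrow's vendored copy, whose
namespace and header layout may differ):

    #include <iostream>
    #include <string>
    #include <system_error>
    #include "fast_float/fast_float.h"

    int main() {
      const std::string input = "3.1416";
      double value = 0.0;
      // Parses the leading number; on success result.ec is std::errc() and
      // result.ptr points just past the last consumed character.
      auto result =
          fast_float::from_chars(input.data(), input.data() + input.size(), value);
      if (result.ec != std::errc()) {
        std::cerr << "parse failed" << std::endl;
        return 1;
      }
      std::cout << "parsed: " << value << std::endl;
      return 0;
    }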
diff --git a/README.md b/README.md
index b1e96dccc6b..133018c72df 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ Major components of the project include:
   a standard and efficient in-memory representation of various datatypes, plain or nested
 - [The Arrow IPC Format](https://github.com/apache/arrow/blob/master/docs/source/format/Columnar.rst#serialization-and-interprocess-communication-ipc):
   an efficient serialization of the Arrow format and associated metadata,
-  for communication between processes and heterogenous environments
+  for communication between processes and heterogeneous environments
 - [The Arrow Flight RPC protocol](https://github.com/apache/arrow/tree/master/format/Flight.proto):
   based on the Arrow IPC format, a building block for remote services exchanging
   Arrow data with application-defined semantics (for example a storage server or a database)
@@ -60,7 +60,7 @@ Arrow is an [Apache Software Foundation](https://www.apache.org) project. Learn

 ## What's in the Arrow libraries?

-The reference Arrow libraries contain a number of distinct software components:
+The reference Arrow libraries contain many distinct software components:

 - Columnar vector and table-like containers (similar to data frames) supporting
   flat or nested types
diff --git a/c_glib/arrow-glib/codec.cpp b/c_glib/arrow-glib/codec.cpp
index fdd61e70a17..33b3d1c9149 100644
--- a/c_glib/arrow-glib/codec.cpp
+++ b/c_glib/arrow-glib/codec.cpp
@@ -38,7 +38,7 @@ G_BEGIN_DECLS
  */
 typedef struct GArrowCodecPrivate_ {
-  arrow::util::Codec *codec;
+  std::shared_ptr<arrow::util::Codec> codec;
 } GArrowCodecPrivate;

 enum {
@@ -57,7 +57,7 @@ garrow_codec_finalize(GObject *object)
 {
   auto priv = GARROW_CODEC_GET_PRIVATE(object);

-  delete priv->codec;
+  priv->codec.~shared_ptr();

   G_OBJECT_CLASS(garrow_codec_parent_class)->finalize(object);
 }
@@ -72,7 +72,8 @@ garrow_codec_set_property(GObject *object,

   switch (prop_id) {
   case PROP_CODEC:
-    priv->codec = static_cast<arrow::util::Codec *>(g_value_get_pointer(value));
+    priv->codec =
+      *static_cast<std::shared_ptr<arrow::util::Codec> *>(g_value_get_pointer(value));
     break;
   default:
     G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
@@ -96,6 +97,8 @@ garrow_codec_get_property(GObject *object,
 static void
 garrow_codec_init(GArrowCodec *object)
 {
+  auto priv = GARROW_CODEC_GET_PRIVATE(object);
+  new(&priv->codec) std::shared_ptr<arrow::util::Codec>;
 }

 static void
@@ -111,7 +114,7 @@ garrow_codec_class_init(GArrowCodecClass *klass)

   spec = g_param_spec_pointer("codec",
                               "Codec",
-                              "The raw arrow::util::Codec *",
+                              "The raw std::shared_ptr<arrow::util::Codec> *",
                               static_cast<GParamFlags>(G_PARAM_WRITABLE |
                                                        G_PARAM_CONSTRUCT_ONLY));
   g_object_class_install_property(gobject_class, PROP_CODEC, spec);
@@ -133,7 +136,9 @@ garrow_codec_new(GArrowCompressionType type,
   auto arrow_type = garrow_compression_type_to_raw(type);
   auto arrow_codec = arrow::util::Codec::Create(arrow_type);
   if (garrow::check(error, arrow_codec, "[codec][new]")) {
-    return garrow_codec_new_raw(arrow_codec.ValueOrDie().release());
+    std::shared_ptr<arrow::util::Codec> arrow_codec_shared =
+      std::move(*arrow_codec);
+    return garrow_codec_new_raw(&arrow_codec_shared);
   } else {
     return NULL;
   }
@@ -151,7 +156,46 @@ const gchar *
 garrow_codec_get_name(GArrowCodec *codec)
 {
   auto arrow_codec = garrow_codec_get_raw(codec);
-  return arrow_codec->name();
+  if (!arrow_codec) {
+    return NULL;
+  }
+  return arrow_codec->name().c_str();
+}
+
+/**
+ * garrow_codec_get_compression_type:
+ * @codec: A #GArrowCodec.
+ *
+ * Returns: The compression type of the codec.
+ *
+ * Since: 2.0.0
+ */
+GArrowCompressionType
+garrow_codec_get_compression_type(GArrowCodec *codec)
+{
+  auto arrow_codec = garrow_codec_get_raw(codec);
+  if (!arrow_codec) {
+    return GARROW_COMPRESSION_TYPE_UNCOMPRESSED;
+  }
+  return garrow_compression_type_from_raw(arrow_codec->compression_type());
+}
+
+/**
+ * garrow_codec_get_compression_level:
+ * @codec: A #GArrowCodec.
+ *
+ * Returns: The compression level of the codec.
+ *
+ * Since: 2.0.0
+ */
+gint
+garrow_codec_get_compression_level(GArrowCodec *codec)
+{
+  auto arrow_codec = garrow_codec_get_raw(codec);
+  if (!arrow_codec) {
+    return arrow::util::Codec::UseDefaultCompressionLevel();
+  }
+  return arrow_codec->compression_level();
 }

 G_END_DECLS
@@ -207,7 +251,7 @@ garrow_compression_type_to_raw(GArrowCompressionType type)
 }

 GArrowCodec *
-garrow_codec_new_raw(arrow::util::Codec *arrow_codec)
+garrow_codec_new_raw(std::shared_ptr<arrow::util::Codec> *arrow_codec)
 {
   auto codec = GARROW_CODEC(g_object_new(GARROW_TYPE_CODEC,
                                          "codec", arrow_codec,
@@ -215,7 +259,7 @@ garrow_codec_new_raw(arrow::util::Codec *arrow_codec)
   return codec;
 }

-arrow::util::Codec *
+std::shared_ptr<arrow::util::Codec>
 garrow_codec_get_raw(GArrowCodec *codec)
 {
   auto priv = GARROW_CODEC_GET_PRIVATE(codec);
diff --git a/c_glib/arrow-glib/codec.h b/c_glib/arrow-glib/codec.h
index 5feab2b7d4d..6e177af9eed 100644
--- a/c_glib/arrow-glib/codec.h
+++ b/c_glib/arrow-glib/codec.h
@@ -20,6 +20,7 @@
 #pragma once

 #include
+#include

 G_BEGIN_DECLS
@@ -63,5 +64,11 @@ GArrowCodec *garrow_codec_new(GArrowCompressionType type,
                               GError **error);
 const gchar *garrow_codec_get_name(GArrowCodec *codec);
+GARROW_AVAILABLE_IN_2_0
+GArrowCompressionType
+garrow_codec_get_compression_type(GArrowCodec *codec);
+GARROW_AVAILABLE_IN_2_0
+gint
+garrow_codec_get_compression_level(GArrowCodec *codec);

 G_END_DECLS
diff --git a/c_glib/arrow-glib/codec.hpp b/c_glib/arrow-glib/codec.hpp
index 14c3ad77ccf..f4cfaba18a0 100644
--- a/c_glib/arrow-glib/codec.hpp
+++ b/c_glib/arrow-glib/codec.hpp
@@ -29,6 +29,6 @@ arrow::Compression::type
 garrow_compression_type_to_raw(GArrowCompressionType type);

 GArrowCodec *
-garrow_codec_new_raw(arrow::util::Codec *arrow_codec);
-arrow::util::Codec *
+garrow_codec_new_raw(std::shared_ptr<arrow::util::Codec> *arrow_codec);
+std::shared_ptr<arrow::util::Codec>
 garrow_codec_get_raw(GArrowCodec *codec);
diff --git a/c_glib/arrow-glib/composite-array.cpp b/c_glib/arrow-glib/composite-array.cpp
index 14dda373575..688c548bf2f 100644
--- a/c_glib/arrow-glib/composite-array.cpp
+++ b/c_glib/arrow-glib/composite-array.cpp
@@ -140,6 +140,53 @@ garrow_base_list_array_get_value(GArrowArray *array,
                               "parent", array,
                               NULL);
 };
+
+template <typename LIST_ARRAY_CLASS>
+GArrowArray *
+garrow_base_list_array_get_values(GArrowArray *array)
+{
+  auto arrow_array = garrow_array_get_raw(array);
+  auto arrow_list_array =
+    std::static_pointer_cast<LIST_ARRAY_CLASS>(arrow_array);
+  auto arrow_values = arrow_list_array->values();
+  return garrow_array_new_raw(&arrow_values,
+                              "array", &arrow_values,
+                              "parent", array,
+                              NULL);
+};
+
+template <typename LIST_ARRAY_CLASS>
+typename LIST_ARRAY_CLASS::offset_type
+garrow_base_list_array_get_value_offset(GArrowArray *array, gint64 i)
+{
+  auto arrow_array = garrow_array_get_raw(array);
+  auto arrow_list_array =
+    std::static_pointer_cast<LIST_ARRAY_CLASS>(arrow_array);
+  return arrow_list_array->value_offset(i);
+};
+
+template <typename LIST_ARRAY_CLASS>
+typename LIST_ARRAY_CLASS::offset_type
+garrow_base_list_array_get_value_length(GArrowArray *array, gint64 i)
+{
+  auto arrow_array = garrow_array_get_raw(array);
+  auto arrow_list_array =
+    std::static_pointer_cast<LIST_ARRAY_CLASS>(arrow_array);
+  return arrow_list_array->value_length(i);
+};
+
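+// A hypothetical usage sketch (illustration only, not part of the original
+// change): the helpers above are templates so that list arrays with 32-bit
+// offsets (arrow::ListArray) and 64-bit offsets (arrow::LargeListArray)
+// share one implementation, e.g.:
+//
+//   auto values = garrow_base_list_array_get_values<arrow::ListArray>(array);
+//   auto offset =
+//     garrow_base_list_array_get_value_offset<arrow::LargeListArray>(array, i);
+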
+template <typename LIST_ARRAY_CLASS>
+const typename LIST_ARRAY_CLASS::offset_type *
+garrow_base_list_array_get_value_offsets(GArrowArray *array, gint64 *n_offsets)
+{
+  auto arrow_array = garrow_array_get_raw(array);
+  *n_offsets = arrow_array->length() + 1;
+  auto arrow_list_array =
+    std::static_pointer_cast<LIST_ARRAY_CLASS>(arrow_array);
+  return arrow_list_array->raw_value_offsets();
+};
+
+
 G_BEGIN_DECLS

 static void
@@ -279,6 +326,70 @@ garrow_list_array_get_value(GArrowListArray *array,
                                              GARROW_ARRAY(array),
                                              i);
 }

+/**
+ * garrow_list_array_get_values:
+ * @array: A #GArrowListArray.
+ *
+ * Returns: (transfer full): The array containing the list's values.
+ *
+ * Since: 2.0.0
+ */
+GArrowArray *
+garrow_list_array_get_values(GArrowListArray *array)
+{
+  return garrow_base_list_array_get_values<arrow::ListArray>(
+    GARROW_ARRAY(array));
+}
+
+/**
+ * garrow_list_array_get_value_offset:
+ * @array: A #GArrowListArray.
+ * @i: The index of the offset of the target value.
+ *
+ * Returns: The target offset in the array containing the list's values.
+ *
+ * Since: 2.0.0
+ */
+gint32
+garrow_list_array_get_value_offset(GArrowListArray *array, gint64 i)
+{
+  return garrow_base_list_array_get_value_offset<arrow::ListArray>(
+    GARROW_ARRAY(array), i);
+}
+
+/**
+ * garrow_list_array_get_value_length:
+ * @array: A #GArrowListArray.
+ * @i: The index of the length of the target value.
+ *
+ * Returns: The target length in the array containing the list's values.
+ *
+ * Since: 2.0.0
+ */
+gint32
+garrow_list_array_get_value_length(GArrowListArray *array, gint64 i)
+{
+  return garrow_base_list_array_get_value_length<arrow::ListArray>(
+    GARROW_ARRAY(array), i);
+}
+
+/**
+ * garrow_list_array_get_value_offsets:
+ * @array: A #GArrowListArray.
+ * @n_offsets: The number of offsets to be returned.
+ *
+ * Returns: (array length=n_offsets): The target offsets in the array
+ * containing the list's values.
+ *
+ * Since: 2.0.0
+ */
+const gint32 *
+garrow_list_array_get_value_offsets(GArrowListArray *array, gint64 *n_offsets)
+{
+  return garrow_base_list_array_get_value_offsets<arrow::ListArray>(
+    GARROW_ARRAY(array), n_offsets);
+}
+
 typedef struct GArrowLargeListArrayPrivate_ {
   GPtrArray *fields;
@@ -434,6 +545,71 @@ garrow_large_list_array_get_value(GArrowLargeListArray *array,
                                                   i);
 }

+/**
+ * garrow_large_list_array_get_values:
+ * @array: A #GArrowLargeListArray.
+ *
+ * Returns: (transfer full): The array containing the list's values.
+ *
+ * Since: 2.0.0
+ */
+GArrowArray *
+garrow_large_list_array_get_values(GArrowLargeListArray *array)
+{
+  return garrow_base_list_array_get_values<arrow::LargeListArray>(
+    GARROW_ARRAY(array));
+}
+
+/**
+ * garrow_large_list_array_get_value_offset:
+ * @array: A #GArrowLargeListArray.
+ * @i: The index of the offset of the target value.
+ *
+ * Returns: The target offset in the array containing the list's values.
+ *
+ * Since: 2.0.0
+ */
+gint64
+garrow_large_list_array_get_value_offset(GArrowLargeListArray *array, gint64 i)
+{
+  return garrow_base_list_array_get_value_offset<arrow::LargeListArray>(
+    GARROW_ARRAY(array), i);
+}
+
+/**
+ * garrow_large_list_array_get_value_length:
+ * @array: A #GArrowLargeListArray.
+ * @i: The index of the length of the target value.
+ *
+ * Returns: The target length in the array containing the list's values.
+ *
+ * Since: 2.0.0
+ */
+gint64
+garrow_large_list_array_get_value_length(GArrowLargeListArray *array, gint64 i)
+{
+  return garrow_base_list_array_get_value_length<arrow::LargeListArray>(
+    GARROW_ARRAY(array), i);
+}
+
+/**
+ * garrow_large_list_array_get_value_offsets:
+ * @array: A #GArrowLargeListArray.
+ * @n_offsets: The number of offsets to be returned.
+ *
+ * Returns: (array length=n_offsets): The target offsets in the array
+ * containing the list's values.
+ *
+ * Since: 2.0.0
+ */
+const gint64 *
+garrow_large_list_array_get_value_offsets(GArrowLargeListArray *array,
+                                          gint64 *n_offsets)
+{
+  return garrow_base_list_array_get_value_offsets<arrow::LargeListArray>(
+    GARROW_ARRAY(array), n_offsets);
+}
+
 typedef struct GArrowStructArrayPrivate_ {
   GPtrArray *fields;
diff --git a/c_glib/arrow-glib/composite-array.h b/c_glib/arrow-glib/composite-array.h
index dd0c72668f9..cfaeb4c768c 100644
--- a/c_glib/arrow-glib/composite-array.h
+++ b/c_glib/arrow-glib/composite-array.h
@@ -47,6 +47,18 @@ GArrowListArray *garrow_list_array_new(GArrowDataType *data_type,
 GArrowDataType *garrow_list_array_get_value_type(GArrowListArray *array);
 GArrowArray *garrow_list_array_get_value(GArrowListArray *array,
                                          gint64 i);
+GARROW_AVAILABLE_IN_2_0
+GArrowArray *garrow_list_array_get_values(GArrowListArray *array);
+GARROW_AVAILABLE_IN_2_0
+gint32 garrow_list_array_get_value_offset(GArrowListArray *array,
+                                          gint64 i);
+GARROW_AVAILABLE_IN_2_0
+gint32 garrow_list_array_get_value_length(GArrowListArray *array,
+                                          gint64 i);
+GARROW_AVAILABLE_IN_2_0
+const gint32 *
+garrow_list_array_get_value_offsets(GArrowListArray *array,
+                                    gint64 *n_offsets);

 #define GARROW_TYPE_LARGE_LIST_ARRAY (garrow_large_list_array_get_type())
@@ -73,6 +85,18 @@ GArrowDataType *garrow_large_list_array_get_value_type(GArrowLargeListArray *arr
 GARROW_AVAILABLE_IN_0_16
 GArrowArray *garrow_large_list_array_get_value(GArrowLargeListArray *array,
                                                gint64 i);
+GARROW_AVAILABLE_IN_2_0
+GArrowArray *garrow_large_list_array_get_values(GArrowLargeListArray *array);
+GARROW_AVAILABLE_IN_2_0
+gint64 garrow_large_list_array_get_value_offset(GArrowLargeListArray *array,
+                                                gint64 i);
+GARROW_AVAILABLE_IN_2_0
+gint64 garrow_large_list_array_get_value_length(GArrowLargeListArray *array,
+                                                gint64 i);
+GARROW_AVAILABLE_IN_2_0
+const gint64 *
+garrow_large_list_array_get_value_offsets(GArrowLargeListArray *array,
+                                          gint64 *n_offsets);

 #define GARROW_TYPE_STRUCT_ARRAY (garrow_struct_array_get_type())
diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp
index 20d910e3250..777adee41a5 100644
--- a/c_glib/arrow-glib/compute.cpp
+++ b/c_glib/arrow-glib/compute.cpp
@@ -300,7 +300,7 @@ garrow_function_find(const gchar *name)
  * @error: (nullable): Return location for a #GError or %NULL.
  *
  * Returns: (nullable) (transfer full):
- *   A return value of the execution as #GArrowData on success, %NULL on error.
+ *   A return value of the execution as #GArrowDatum on success, %NULL on error.
* * Since: 1.0.0 */ diff --git a/c_glib/arrow-glib/input-stream.cpp b/c_glib/arrow-glib/input-stream.cpp index 3751d41ad3a..84904b74265 100644 --- a/c_glib/arrow-glib/input-stream.cpp +++ b/c_glib/arrow-glib/input-stream.cpp @@ -1132,7 +1132,7 @@ garrow_compressed_input_stream_new(GArrowCodec *codec, GArrowInputStream *raw, GError **error) { - auto arrow_codec = garrow_codec_get_raw(codec); + auto arrow_codec = garrow_codec_get_raw(codec).get(); auto arrow_raw = garrow_input_stream_get_raw(raw); auto arrow_stream = arrow::io::CompressedInputStream::Make(arrow_codec, arrow_raw); diff --git a/c_glib/arrow-glib/ipc-options.cpp b/c_glib/arrow-glib/ipc-options.cpp index 1cddd25bb6d..b9b2c414348 100644 --- a/c_glib/arrow-glib/ipc-options.cpp +++ b/c_glib/arrow-glib/ipc-options.cpp @@ -21,6 +21,7 @@ # include #endif +#include #include #include @@ -242,6 +243,7 @@ garrow_read_options_set_included_fields(GArrowReadOptions *options, typedef struct GArrowWriteOptionsPrivate_ { arrow::ipc::IpcWriteOptions options; + GArrowCodec *codec; } GArrowWriteOptionsPrivate; enum { @@ -249,8 +251,7 @@ enum { PROP_WRITE_OPTIONS_MAX_RECURSION_DEPTH, PROP_WRITE_OPTIONS_ALIGNMENT, PROP_WRITE_OPTIONS_WRITE_LEGACY_IPC_FORMAT, - PROP_WRITE_OPTIONS_COMPRESSION, - PROP_WRITE_OPTIONS_COMPRESSION_LEVEL, + PROP_WRITE_OPTIONS_CODEC, PROP_WRITE_OPTIONS_USE_THREADS, }; @@ -263,6 +264,19 @@ G_DEFINE_TYPE_WITH_PRIVATE(GArrowWriteOptions, garrow_write_options_get_instance_private( \ GARROW_WRITE_OPTIONS(obj))) +static void +garrow_write_options_dispose(GObject *object) +{ + auto priv = GARROW_WRITE_OPTIONS_GET_PRIVATE(object); + + if (priv->codec) { + g_object_unref(priv->codec); + priv->codec = NULL; + } + + G_OBJECT_CLASS(garrow_write_options_parent_class)->dispose(object); +} + static void garrow_write_options_finalize(GObject *object) { @@ -294,12 +308,12 @@ garrow_write_options_set_property(GObject *object, case PROP_WRITE_OPTIONS_WRITE_LEGACY_IPC_FORMAT: priv->options.write_legacy_ipc_format = g_value_get_boolean(value); break; - case PROP_WRITE_OPTIONS_COMPRESSION: - priv->options.compression = - static_cast(g_value_get_enum(value)); - break; - case PROP_WRITE_OPTIONS_COMPRESSION_LEVEL: - priv->options.compression_level = g_value_get_int(value); + case PROP_WRITE_OPTIONS_CODEC: + if (priv->codec) { + g_object_unref(priv->codec); + } + priv->codec = GARROW_CODEC(g_value_dup_object(value)); + priv->options.codec = garrow_codec_get_raw(priv->codec); break; case PROP_WRITE_OPTIONS_USE_THREADS: priv->options.use_threads = g_value_get_boolean(value); @@ -331,11 +345,8 @@ garrow_write_options_get_property(GObject *object, case PROP_WRITE_OPTIONS_WRITE_LEGACY_IPC_FORMAT: g_value_set_boolean(value, priv->options.write_legacy_ipc_format); break; - case PROP_WRITE_OPTIONS_COMPRESSION: - g_value_set_enum(value, priv->options.compression); - break; - case PROP_WRITE_OPTIONS_COMPRESSION_LEVEL: - g_value_set_int(value, priv->options.compression_level); + case PROP_WRITE_OPTIONS_CODEC: + g_value_set_object(value, priv->codec); break; case PROP_WRITE_OPTIONS_USE_THREADS: g_value_set_boolean(value, priv->options.use_threads); @@ -352,6 +363,11 @@ garrow_write_options_init(GArrowWriteOptions *object) auto priv = GARROW_WRITE_OPTIONS_GET_PRIVATE(object); new(&priv->options) arrow::ipc::IpcWriteOptions; priv->options = arrow::ipc::IpcWriteOptions::Defaults(); + if (priv->options.codec) { + priv->codec = garrow_codec_new_raw(&(priv->options.codec)); + } else { + priv->codec = NULL; + } } static void @@ -359,6 +375,7 @@ 
garrow_write_options_class_init(GArrowWriteOptionsClass *klass) { auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = garrow_write_options_dispose; gobject_class->finalize = garrow_write_options_finalize; gobject_class->set_property = garrow_write_options_set_property; gobject_class->get_property = garrow_write_options_get_property; @@ -441,42 +458,24 @@ garrow_write_options_class_init(GArrowWriteOptionsClass *klass) spec); /** - * GArrowWriteOptions:compression: + * GArrowWriteOptions:codec: * * Codec to use for compressing and decompressing record batch body * buffers. This is not part of the Arrow IPC protocol and only for - * internal use (e.g. Feather files). May only be LZ4_FRAME and - * ZSTD. + * internal use (e.g. Feather files). * - * Since: 1.0.0 - */ - spec = g_param_spec_enum("compression", - "Compression", - "Codec to use for " - "compressing record batch body buffers.", - GARROW_TYPE_COMPRESSION_TYPE, - options.compression, - static_cast(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, - PROP_WRITE_OPTIONS_COMPRESSION, - spec); - - /** - * GArrowWriteOptions:compression-level: - * - * The level for compression. + * May only be UNCOMPRESSED, LZ4_FRAME and ZSTD. * - * Since: 1.0.0 + * Since: 2.0.0 */ - spec = g_param_spec_int("compression-level", - "Compression level", - "The level for compression", - G_MININT, - G_MAXINT, - options.compression_level, - static_cast(G_PARAM_READWRITE)); + spec = g_param_spec_object("codec", + "Codec", + "Codec to use for " + "compressing record batch body buffers.", + GARROW_TYPE_CODEC, + static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, - PROP_WRITE_OPTIONS_COMPRESSION_LEVEL, + PROP_WRITE_OPTIONS_CODEC, spec); /** diff --git a/c_glib/arrow-glib/output-stream.cpp b/c_glib/arrow-glib/output-stream.cpp index 2c3ccafdb13..1619bac45d4 100644 --- a/c_glib/arrow-glib/output-stream.cpp +++ b/c_glib/arrow-glib/output-stream.cpp @@ -688,7 +688,7 @@ garrow_compressed_output_stream_new(GArrowCodec *codec, GArrowOutputStream *raw, GError **error) { - auto arrow_codec = garrow_codec_get_raw(codec); + auto arrow_codec = garrow_codec_get_raw(codec).get(); auto arrow_raw = garrow_output_stream_get_raw(raw); auto arrow_stream = arrow::io::CompressedOutputStream::Make(arrow_codec, arrow_raw); diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp index 51ad3ac444d..c3082271ca5 100644 --- a/c_glib/arrow-glib/reader.cpp +++ b/c_glib/arrow-glib/reader.cpp @@ -332,7 +332,7 @@ garrow_record_batch_file_reader_finalize(GObject *object) { auto priv = GARROW_RECORD_BATCH_FILE_READER_GET_PRIVATE(object); - priv->record_batch_file_reader = nullptr; + priv->record_batch_file_reader.~shared_ptr(); G_OBJECT_CLASS(garrow_record_batch_file_reader_parent_class)->finalize(object); } @@ -372,6 +372,9 @@ garrow_record_batch_file_reader_get_property(GObject *object, static void garrow_record_batch_file_reader_init(GArrowRecordBatchFileReader *object) { + auto priv = GARROW_RECORD_BATCH_FILE_READER_GET_PRIVATE(object); + new(&priv->record_batch_file_reader) + std::shared_ptr; } static void @@ -1181,7 +1184,7 @@ garrow_csv_read_options_add_schema(GArrowCSVReadOptions *options, { auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options); auto arrow_schema = garrow_schema_get_raw(schema); - for (const auto field : arrow_schema->fields()) { + for (const auto &field : arrow_schema->fields()) { priv->convert_options.column_types[field->name()] = field->type(); } } @@ -1203,7 +1206,7 @@ 
garrow_csv_read_options_get_column_types(GArrowCSVReadOptions *options) g_str_equal, g_free, g_object_unref); - for (const auto iter : priv->convert_options.column_types) { + for (const auto &iter : priv->convert_options.column_types) { auto arrow_name = iter.first; auto arrow_data_type = iter.second; g_hash_table_insert(types, diff --git a/c_glib/arrow-glib/writer.cpp b/c_glib/arrow-glib/writer.cpp index 074c83af120..82d18e58dbd 100644 --- a/c_glib/arrow-glib/writer.cpp +++ b/c_glib/arrow-glib/writer.cpp @@ -235,10 +235,10 @@ garrow_record_batch_stream_writer_new(GArrowOutputStream *sink, GArrowSchema *schema, GError **error) { - auto arrow_sink = garrow_output_stream_get_raw(sink).get(); + auto arrow_sink = garrow_output_stream_get_raw(sink); auto arrow_schema = garrow_schema_get_raw(schema); auto arrow_writer_result = - arrow::ipc::NewStreamWriter(arrow_sink, arrow_schema); + arrow::ipc::MakeStreamWriter(arrow_sink, arrow_schema); if (garrow::check(error, arrow_writer_result, "[record-batch-stream-writer][open]")) { @@ -280,11 +280,11 @@ garrow_record_batch_file_writer_new(GArrowOutputStream *sink, GArrowSchema *schema, GError **error) { - auto arrow_sink = garrow_output_stream_get_raw(sink).get(); + auto arrow_sink = garrow_output_stream_get_raw(sink); auto arrow_schema = garrow_schema_get_raw(schema); std::shared_ptr arrow_writer; auto arrow_writer_result = - arrow::ipc::NewFileWriter(arrow_sink, arrow_schema); + arrow::ipc::MakeFileWriter(arrow_sink, arrow_schema); if (garrow::check(error, arrow_writer_result, "[record-batch-file-writer][open]")) { diff --git a/c_glib/configure.ac b/c_glib/configure.ac index 5919ec95d7c..dc0624e1630 100644 --- a/c_glib/configure.ac +++ b/c_glib/configure.ac @@ -17,7 +17,7 @@ AC_PREREQ(2.65) -m4_define([arrow_glib_version], 2.0.0-SNAPSHOT) +m4_define([arrow_glib_version], 3.0.0-SNAPSHOT) AC_INIT([arrow-glib], arrow_glib_version, [https://issues.apache.org/jira/browse/ARROW], diff --git a/c_glib/doc/arrow-glib/arrow-glib-docs.xml b/c_glib/doc/arrow-glib/arrow-glib-docs.xml index 57a53c2eec7..72a01f50e4f 100644 --- a/c_glib/doc/arrow-glib/arrow-glib-docs.xml +++ b/c_glib/doc/arrow-glib/arrow-glib-docs.xml @@ -179,6 +179,10 @@ Index of deprecated API + + Index of new symbols in 2.0.0 + + Index of new symbols in 1.0.0 diff --git a/c_glib/meson.build b/c_glib/meson.build index 7b8b21c7b20..e6a4a9b6671 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -23,7 +23,7 @@ project('arrow-glib', 'c', 'cpp', 'cpp_std=c++11', ]) -version = '2.0.0-SNAPSHOT' +version = '3.0.0-SNAPSHOT' if version.endswith('-SNAPSHOT') version_numbers = version.split('-')[0].split('.') version_tag = version.split('-')[1] diff --git a/c_glib/test/dataset/test-scan-options.rb b/c_glib/test/dataset/test-scan-options.rb index c5657fe7e17..1f5b77f2e9f 100644 --- a/c_glib/test/dataset/test-scan-options.rb +++ b/c_glib/test/dataset/test-scan-options.rb @@ -28,7 +28,7 @@ def test_schema end def test_batch_size - assert_equal(1<<15, + assert_equal(1<<20, @scan_options.batch_size) @scan_options.batch_size = 42 assert_equal(42, diff --git a/c_glib/test/test-codec.rb b/c_glib/test/test-codec.rb index 6617815df9b..a32ec4dc757 100644 --- a/c_glib/test/test-codec.rb +++ b/c_glib/test/test-codec.rb @@ -20,4 +20,14 @@ def test_name codec = Arrow::Codec.new(:gzip) assert_equal("gzip", codec.name) end + + def test_compression_type + codec = Arrow::Codec.new(:gzip) + assert_equal(Arrow::CompressionType::GZIP, codec.compression_type) + end + + def test_compression_level + codec = 
Arrow::Codec.new(:gzip) + assert_equal(9, codec.compression_level) + end end diff --git a/c_glib/test/test-decimal128.rb b/c_glib/test/test-decimal128.rb index 0e4bc8264d5..98789d3812e 100644 --- a/c_glib/test/test-decimal128.rb +++ b/c_glib/test/test-decimal128.rb @@ -214,7 +214,7 @@ def test_rescale_fail decimal = Arrow::Decimal128.new(10) message = "[decimal128][rescale]: Invalid: " + - "Rescaling decimal value would cause data loss" + "Rescaling Decimal128 value would cause data loss" assert_raise(Arrow::Error::Invalid.new(message)) do decimal.rescale(1, -1) end diff --git a/c_glib/test/test-large-list-array.rb b/c_glib/test/test-large-list-array.rb index 9840989ab89..2f7efab5a07 100644 --- a/c_glib/test/test-large-list-array.rb +++ b/c_glib/test/test-large-list-array.rb @@ -36,21 +36,11 @@ def test_new end def test_value - field = Arrow::Field.new("value", Arrow::Int64DataType.new) - data_type = Arrow::LargeListDataType.new(field) - builder = Arrow::LargeListArrayBuilder.new(data_type) - value_builder = builder.value_builder - - builder.append_value - value_builder.append_value(-29) - value_builder.append_value(29) - - builder.append_value - value_builder.append_value(-1) - value_builder.append_value(0) - value_builder.append_value(1) - - array = builder.finish + array = build_large_list_array(Arrow::Int8DataType.new, + [ + [-29, 29], + [-1, 0, 1], + ]) value = array.get_value(1) assert_equal([-1, 0, 1], value.length.times.collect {|i| value.get_value(i)}) @@ -63,4 +53,46 @@ def test_value_type array = builder.finish assert_equal(Arrow::Int64DataType.new, array.value_type) end + + + def test_values + array = build_large_list_array(Arrow::Int8DataType.new, + [ + [-29, 29], + [-1, 0, 1], + ]) + values = array.values + assert_equal([-29, 29, -1, 0, 1], + values.length.times.collect {|i| values.get_value(i)}) + end + + def test_value_offset + array = build_large_list_array(Arrow::Int8DataType.new, + [ + [-29, 29], + [-1, 0, 1], + ]) + assert_equal([0, 2], + array.length.times.collect {|i| array.get_value_offset(i)}) + end + + def test_value_length + array = build_large_list_array(Arrow::Int8DataType.new, + [ + [-29, 29], + [-1, 0, 1], + ]) + assert_equal([2, 3], + array.length.times.collect {|i| array.get_value_length(i)}) + end + + def test_value_offsets + array = build_large_list_array(Arrow::Int8DataType.new, + [ + [-29, 29], + [-1, 0, 1], + ]) + assert_equal([0, 2, 5], + array.value_offsets) + end end diff --git a/c_glib/test/test-list-array.rb b/c_glib/test/test-list-array.rb index eea16ccbbb3..f94b28dd1cd 100644 --- a/c_glib/test/test-list-array.rb +++ b/c_glib/test/test-list-array.rb @@ -36,21 +36,11 @@ def test_new end def test_value - field = Arrow::Field.new("value", Arrow::Int8DataType.new) - data_type = Arrow::ListDataType.new(field) - builder = Arrow::ListArrayBuilder.new(data_type) - value_builder = builder.value_builder - - builder.append_value - value_builder.append_value(-29) - value_builder.append_value(29) - - builder.append_value - value_builder.append_value(-1) - value_builder.append_value(0) - value_builder.append_value(1) - - array = builder.finish + array = build_list_array(Arrow::Int8DataType.new, + [ + [-29, 29], + [-1, 0, 1], + ]) value = array.get_value(1) assert_equal([-1, 0, 1], value.length.times.collect {|i| value.get_value(i)}) @@ -63,4 +53,45 @@ def test_value_type array = builder.finish assert_equal(Arrow::Int8DataType.new, array.value_type) end + + def test_values + array = build_list_array(Arrow::Int8DataType.new, + [ + [-29, 29], + [-1, 0, 1], + ]) + 
values = array.values + assert_equal([-29, 29, -1, 0, 1], + values.length.times.collect {|i| values.get_value(i)}) + end + + def test_value_offset + array = build_list_array(Arrow::Int8DataType.new, + [ + [-29, 29], + [-1, 0, 1], + ]) + assert_equal([0, 2], + array.length.times.collect {|i| array.get_value_offset(i)}) + end + + def test_value_length + array = build_list_array(Arrow::Int8DataType.new, + [ + [-29, 29], + [-1, 0, 1], + ]) + assert_equal([2, 3], + array.length.times.collect {|i| array.get_value_length(i)}) + end + + def test_value_offsets + array = build_list_array(Arrow::Int8DataType.new, + [ + [-29, 29], + [-1, 0, 1], + ]) + assert_equal([0, 2, 5], + array.value_offsets) + end end diff --git a/c_glib/test/test-write-options.rb b/c_glib/test/test-write-options.rb index d30b78b9cdb..c528ce673d4 100644 --- a/c_glib/test/test-write-options.rb +++ b/c_glib/test/test-write-options.rb @@ -73,27 +73,15 @@ def test_accessor end end - sub_test_case("compression") do + sub_test_case("codec") do def test_default - assert_equal(Arrow::CompressionType::UNCOMPRESSED, - @options.compression) + assert_nil(@options.codec) end def test_accessor - @options.compression = :zstd - assert_equal(Arrow::CompressionType::ZSTD, - @options.compression) - end - end - - sub_test_case("compression-level") do - def test_default - assert_equal(-(2 ** 31), @options.compression_level) - end - - def test_accessor - @options.compression_level = 8 - assert_equal(8, @options.compression_level) + @options.codec = Arrow::Codec.new(:zstd) + assert_equal("zstd", + @options.codec.name) end end diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index a2ce0765617..6b930939660 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -89,7 +89,7 @@ pushd cpp\build @rem and enable runtime assertions. cmake -G "%GENERATOR%" %CMAKE_ARGS% ^ - -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BOOST_USE_SHARED=ON ^ -DARROW_BUILD_EXAMPLES=ON ^ -DARROW_BUILD_STATIC=OFF ^ -DARROW_BUILD_TESTS=ON ^ diff --git a/ci/appveyor-cpp-setup.bat b/ci/appveyor-cpp-setup.bat index 14bc0fd7a21..616232d202c 100644 --- a/ci/appveyor-cpp-setup.bat +++ b/ci/appveyor-cpp-setup.bat @@ -62,10 +62,10 @@ if "%JOB%" NEQ "Build_Debug" ( --file=ci\conda_env_python.yml ^ %CONDA_PACKAGES% ^ "cmake=3.17" ^ - "boost-cpp<1.70" ^ "ninja" ^ "nomkl" ^ "pandas" ^ + "fsspec" ^ "python=%PYTHON%" ^ || exit /B ) diff --git a/ci/conda_env_archery.yml b/ci/conda_env_archery.yml index 3eb8003e274..81c314eb4b2 100644 --- a/ci/conda_env_archery.yml +++ b/ci/conda_env_archery.yml @@ -16,5 +16,7 @@ # under the License. click +gitpython pygithub ruamel.yaml +semver diff --git a/ci/conda_env_cpp.yml b/ci/conda_env_cpp.yml index a0c2e99aca7..90cef3ea2d1 100644 --- a/ci/conda_env_cpp.yml +++ b/ci/conda_env_cpp.yml @@ -25,7 +25,7 @@ cmake gflags glog gmock>=1.8.1 -grpc-cpp>=1.21.4 +grpc-cpp>=1.27.3 gtest=1.8.1 libprotobuf libutf8proc diff --git a/ci/conda_env_gandiva.yml b/ci/conda_env_gandiva.yml index fa15d77df83..5056456fc66 100644 --- a/ci/conda_env_gandiva.yml +++ b/ci/conda_env_gandiva.yml @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. -clangdev=10 -llvmdev=10 +clangdev=11 +llvmdev=11 re2 diff --git a/ci/conda_env_python.yml b/ci/conda_env_python.yml index f6c89923870..f2f46c84436 100644 --- a/ci/conda_env_python.yml +++ b/ci/conda_env_python.yml @@ -16,6 +16,7 @@ # under the License. 
# don't add pandas here, because it is not a mandatory test dependency +boto3 # not a direct dependency of s3fs, but needed for our s3fs fixture cffi cython cloudpickle @@ -26,5 +27,6 @@ pytest pytest-faulthandler pytest-lazy-fixture pytz +s3fs>=0.4 setuptools setuptools_scm diff --git a/ci/conda_env_r.yml b/ci/conda_env_r.yml index b9a13fdbe33..03d5f3b625c 100644 --- a/ci/conda_env_r.yml +++ b/ci/conda_env_r.yml @@ -21,7 +21,7 @@ r-bit64 r-dplyr r-purrr r-r6 -r-rcpp >=1.0.1 +r-cpp11 r-rlang r-tidyselect r-vctrs diff --git a/ci/conda_env_sphinx.yml b/ci/conda_env_sphinx.yml index 318ef75e46c..8654d231065 100644 --- a/ci/conda_env_sphinx.yml +++ b/ci/conda_env_sphinx.yml @@ -19,6 +19,6 @@ breathe doxygen ipython -# Pinned per ARROW-8340 -sphinx=2.4.4 +# Pinned per ARROW-9693 +sphinx=3.1.2 sphinx_rtd_theme diff --git a/ci/docker/conda-integration.dockerfile b/ci/docker/conda-integration.dockerfile index 088d93e3921..e6e5ac859dd 100644 --- a/ci/docker/conda-integration.dockerfile +++ b/ci/docker/conda-integration.dockerfile @@ -21,7 +21,7 @@ FROM ${repo}:${arch}-conda-cpp ARG arch=amd64 ARG maven=3.5 -ARG node=11 +ARG node=14 ARG jdk=8 ARG go=1.12 diff --git a/ci/docker/conda-python-kartothek.dockerfile b/ci/docker/conda-python-kartothek.dockerfile index aa013fe5f3d..d523161822c 100644 --- a/ci/docker/conda-python-kartothek.dockerfile +++ b/ci/docker/conda-python-kartothek.dockerfile @@ -22,11 +22,17 @@ FROM ${repo}:${arch}-conda-python-${python} # install kartothek dependencies from conda-forge RUN conda install -c conda-forge -q \ + attrs \ + click \ + cloudpickle \ dask \ decorator \ + freezegun \ msgpack-python \ + prompt-toolkit \ pytest-mock \ pytest-xdist \ + pyyaml \ simplejson \ simplekv \ storefact \ diff --git a/ci/docker/conda-python-spark.dockerfile b/ci/docker/conda-python-spark.dockerfile index a20f1ff3521..d3f0a224582 100644 --- a/ci/docker/conda-python-spark.dockerfile +++ b/ci/docker/conda-python-spark.dockerfile @@ -36,10 +36,6 @@ ARG spark=master COPY ci/scripts/install_spark.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_spark.sh ${spark} /spark -# patch spark to build with current Arrow Java -COPY ci/etc/integration_spark_ARROW-9438.patch /arrow/ci/etc/ -RUN patch -d /spark -p1 -i /arrow/ci/etc/integration_spark_ARROW-9438.patch - # build cpp with tests ENV CC=gcc \ CXX=g++ \ diff --git a/ci/docker/conda-r.dockerfile b/ci/docker/conda-r.dockerfile deleted file mode 100644 index 79b6ebc3b2d..00000000000 --- a/ci/docker/conda-r.dockerfile +++ /dev/null @@ -1,53 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -ARG repo -ARG arch -FROM ${repo}:${arch}-conda-cpp - -# Need locales so we can set UTF-8 -RUN apt-get update -y && \ - apt-get install -y locales && \ - locale-gen en_US.UTF-8 && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# install R specific packages -ARG r=3.6.1 -COPY ci/conda_env_r.yml /arrow/ci/ -RUN conda install -q \ - --file arrow/ci/conda_env_r.yml \ - r-base=$r \ - nomkl && \ - conda clean --all - -# Ensure parallel compilation of of C/C++ code -RUN echo "MAKEFLAGS=-j$(R -s -e 'cat(parallel::detectCores())')" >> $CONDA_PREFIX/lib/R/etc/Makeconf - -ENV ARROW_BUILD_STATIC=OFF \ - ARROW_BUILD_TESTS=OFF \ - ARROW_BUILD_UTILITIES=OFF \ - ARROW_DEPENDENCY_SOURCE=SYSTEM \ - ARROW_FLIGHT=OFF \ - ARROW_GANDIVA=OFF \ - ARROW_NO_DEPRECATED_API=ON \ - ARROW_ORC=OFF \ - ARROW_PARQUET=ON \ - ARROW_PLASMA=OFF \ - ARROW_USE_CCACHE=ON \ - ARROW_USE_GLOG=OFF \ - LC_ALL=en_US.UTF-8 diff --git a/ci/docker/debian-10-cpp.dockerfile b/ci/docker/debian-10-cpp.dockerfile index f86c009b57b..74143dcbfa4 100644 --- a/ci/docker/debian-10-cpp.dockerfile +++ b/ci/docker/debian-10-cpp.dockerfile @@ -17,6 +17,7 @@ ARG arch=amd64 FROM ${arch}/debian:10 +ARG arch ENV DEBIAN_FRONTEND noninteractive @@ -26,7 +27,7 @@ RUN \ ARG llvm RUN apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ + apt-get install -y -q --no-install-recommends \ apt-transport-https \ ca-certificates \ gnupg \ @@ -49,6 +50,7 @@ RUN apt-get update -y -q && \ libbrotli-dev \ libbz2-dev \ libc-ares-dev \ + libcurl4-openssl-dev \ libgflags-dev \ libgmock-dev \ libgoogle-glog-dev \ @@ -71,6 +73,10 @@ RUN apt-get update -y -q && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +COPY ci/scripts/install_minio.sh \ + /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local + ENV ARROW_BUILD_TESTS=ON \ ARROW_DEPENDENCY_SOURCE=SYSTEM \ ARROW_DATASET=ON \ @@ -80,6 +86,7 @@ ENV ARROW_BUILD_TESTS=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_PLASMA=ON \ + ARROW_S3=ON \ ARROW_USE_CCACHE=ON \ ARROW_WITH_BROTLI=ON \ ARROW_WITH_BZ2=ON \ @@ -87,6 +94,7 @@ ENV ARROW_BUILD_TESTS=ON \ ARROW_WITH_SNAPPY=ON \ ARROW_WITH_ZLIB=ON \ ARROW_WITH_ZSTD=ON \ + AWSSDK_SOURCE=BUNDLED \ cares_SOURCE=BUNDLED \ CC=gcc \ CXX=g++ \ diff --git a/ci/docker/debian-10-js.dockerfile b/ci/docker/debian-10-js.dockerfile index b43c45abdf4..5bb31f2e32e 100644 --- a/ci/docker/debian-10-js.dockerfile +++ b/ci/docker/debian-10-js.dockerfile @@ -16,7 +16,7 @@ # under the License. 
ARG arch=amd64 -ARG node=11 +ARG node=14 FROM ${arch}/node:${node} ENV NODE_NO_WARNINGS=1 diff --git a/ci/docker/debian-10-rust.dockerfile b/ci/docker/debian-10-rust.dockerfile index b23b03c9a1c..9c9c9b51048 100644 --- a/ci/docker/debian-10-rust.dockerfile +++ b/ci/docker/debian-10-rust.dockerfile @@ -58,14 +58,18 @@ RUN mkdir \ /arrow/rust/benchmarks/src \ /arrow/rust/datafusion/src \ /arrow/rust/integration-testing/src \ - /arrow/rust/parquet/src && \ + /arrow/rust/parquet/src \ + /arrow/rust/parquet_derive/src \ + /arrow/rust/parquet_derive_test/src && \ touch \ /arrow/rust/arrow-flight/src/lib.rs \ /arrow/rust/arrow/src/lib.rs \ /arrow/rust/benchmarks/src/lib.rs \ /arrow/rust/datafusion/src/lib.rs \ /arrow/rust/integration-testing/src/lib.rs \ - /arrow/rust/parquet/src/lib.rs + /arrow/rust/parquet/src/lib.rs \ + /arrow/rust/parquet_derive/src/lib.rs \ + /arrow/rust/parquet_derive_test/src/lib.rs # Compile dependencies for the whole workspace RUN cd /arrow/rust && cargo build --workspace --lib --all-features diff --git a/ci/docker/fedora-32-cpp.dockerfile b/ci/docker/fedora-33-cpp.dockerfile similarity index 84% rename from ci/docker/fedora-32-cpp.dockerfile rename to ci/docker/fedora-33-cpp.dockerfile index 535f8b4b761..9dde6999510 100644 --- a/ci/docker/fedora-32-cpp.dockerfile +++ b/ci/docker/fedora-33-cpp.dockerfile @@ -16,11 +16,12 @@ # under the License. ARG arch -FROM ${arch}/fedora:32 +FROM ${arch}/fedora:33 +ARG arch # install dependencies RUN dnf update -y && \ - dnf install -y \ + dnf install -y \ autoconf \ boost-devel \ brotli-devel \ @@ -29,18 +30,20 @@ RUN dnf update -y && \ ccache \ clang-devel \ cmake \ + curl-devel \ flatbuffers-devel \ - java-1.8.0-openjdk-devel \ - java-1.8.0-openjdk-headless \ gcc \ gcc-c++ \ - glog-devel \ gflags-devel \ + git \ + glog-devel \ gmock-devel \ google-benchmark-devel \ - protobuf-devel \ + grpc-devel \ + grpc-plugins \ gtest-devel \ - git \ + java-latest-openjdk-devel \ + java-latest-openjdk-headless \ libzstd-devel \ llvm-devel \ llvm-static \ @@ -48,16 +51,21 @@ RUN dnf update -y && \ make \ ninja-build \ openssl-devel \ + protobuf-devel \ python \ rapidjson-devel \ re2-devel \ snappy-devel \ thrift-devel \ utf8proc-devel \ + wget \ which \ zlib-devel -# * gRPC 1.26 in Fedora 32 may have a problem. arrow-flight-test is stuck. 
+COPY ci/scripts/install_minio.sh \ + /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local + ENV ARROW_BUILD_TESTS=ON \ ARROW_DEPENDENCY_SOURCE=SYSTEM \ ARROW_DATASET=ON \ @@ -67,6 +75,7 @@ ENV ARROW_BUILD_TESTS=ON \ ARROW_HOME=/usr/local \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ + ARROW_S3=ON \ ARROW_USE_CCACHE=ON \ ARROW_WITH_BROTLI=ON \ ARROW_WITH_BZ2=ON \ @@ -74,9 +83,9 @@ ENV ARROW_BUILD_TESTS=ON \ ARROW_WITH_SNAPPY=ON \ ARROW_WITH_ZLIB=ON \ ARROW_WITH_ZSTD=ON \ + AWSSDK_SOURCE=BUNDLED \ CC=gcc \ CXX=g++ \ - gRPC_SOURCE=BUNDLED \ ORC_SOURCE=BUNDLED \ PARQUET_BUILD_EXECUTABLES=ON \ PARQUET_BUILD_EXAMPLES=ON \ diff --git a/ci/docker/linux-apt-c-glib.dockerfile b/ci/docker/linux-apt-c-glib.dockerfile index 3d1658ff7b8..12c6e23a00d 100644 --- a/ci/docker/linux-apt-c-glib.dockerfile +++ b/ci/docker/linux-apt-c-glib.dockerfile @@ -25,6 +25,7 @@ RUN apt-get update -y -q && \ gtk-doc-tools \ libgirepository1.0-dev \ libglib2.0-doc \ + lsb-release \ luarocks \ pkg-config \ ruby-dev && \ @@ -54,7 +55,7 @@ RUN luarocks install lgi # ERROR: Command errored out with exit status 1: /usr/bin/python3 /usr/share/python-wheels/pep517-0.7.0-py2.py3-none-any.whl/pep517/_in_process.py get_requires_for_build_wheel /tmp/tmpsk4jveay Check the logs for full command output. RUN (python3 -m pip install meson || \ python3 -m pip install --no-use-pep517 meson) && \ - gem install bundler + gem install --no-document bundler COPY c_glib/Gemfile /arrow/c_glib/ RUN bundle install --gemfile /arrow/c_glib/Gemfile diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index d0d98d5cd30..ec474f99861 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -38,8 +38,11 @@ RUN apt-get update -y && \ gobject-introspection \ gtk-doc-tools \ libcurl4-openssl-dev \ + libfontconfig1-dev \ + libfribidi-dev \ libgirepository1.0-dev \ libglib2.0-doc \ + libharfbuzz-dev \ libtool \ libxml2-dev \ ninja-build \ @@ -62,23 +65,31 @@ RUN /arrow/ci/scripts/util_download_apache.sh \ ENV PATH=/opt/apache-maven-${maven}/bin:$PATH RUN mvn -version -ARG node=11 +ARG node=14 RUN wget -q -O - https://deb.nodesource.com/setup_${node}.x | bash - && \ apt-get install -y nodejs && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +# Sphinx is pinned because of ARROW-9693 RUN pip install \ meson \ breathe \ ipython \ - sphinx \ + sphinx==3.1.2 \ sphinx_rtd_theme COPY c_glib/Gemfile /arrow/c_glib/ -RUN gem install bundler && \ +RUN gem install --no-document bundler && \ bundle install --gemfile /arrow/c_glib/Gemfile +# Ensure parallel R package installation, set CRAN repo mirror, +# and use pre-built binaries where possible +COPY ci/etc/rprofile /arrow/ci/etc/ +RUN cat /arrow/ci/etc/rprofile >> $(R RHOME)/etc/Rprofile.site +# Also ensure parallel compilation of C/C++ code +RUN echo "MAKEFLAGS=-j$(R -s -e 'cat(parallel::detectCores())')" >> $(R RHOME)/etc/Makeconf + COPY ci/scripts/r_deps.sh /arrow/ci/scripts/ COPY r/DESCRIPTION /arrow/r/ RUN /arrow/ci/scripts/r_deps.sh /arrow && \ diff --git a/ci/docker/linux-apt-r.dockerfile b/ci/docker/linux-apt-r.dockerfile index 85827358dfb..f47044e334b 100644 --- a/ci/docker/linux-apt-r.dockerfile +++ b/ci/docker/linux-apt-r.dockerfile @@ -17,6 +17,7 @@ ARG base FROM ${base} +ARG arch # Build R # [1] https://www.digitalocean.com/community/tutorials/how-to-install-r-on-ubuntu-18-04 @@ -70,6 +71,10 @@ COPY ci/scripts/r_deps.sh /arrow/ci/scripts/ COPY r/DESCRIPTION /arrow/r/ RUN /arrow/ci/scripts/r_deps.sh /arrow 
+COPY ci/scripts/install_minio.sh \ + /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local + # Set up Python 3 and its dependencies RUN ln -s /usr/bin/python3 /usr/local/bin/python && \ ln -s /usr/bin/pip3 /usr/local/bin/pip @@ -89,6 +94,7 @@ ENV \ ARROW_PARQUET=ON \ ARROW_PLASMA=OFF \ ARROW_PYTHON=ON \ + ARROW_S3=ON \ ARROW_USE_CCACHE=ON \ ARROW_USE_GLOG=OFF \ LC_ALL=en_US.UTF-8 diff --git a/ci/docker/linux-r.dockerfile b/ci/docker/linux-r.dockerfile index 1d963a20d14..5223d7aafa5 100644 --- a/ci/docker/linux-r.dockerfile +++ b/ci/docker/linux-r.dockerfile @@ -24,12 +24,16 @@ FROM ${base} ARG r_bin=R ENV R_BIN=${r_bin} +ARG r_dev=FALSE +ENV ARROW_R_DEV=${r_dev} + # Make sure R is on the path for the R-hub devel versions (where RPREFIX is set in its dockerfile) ENV PATH "${RPREFIX}/bin:${PATH}" # Patch up some of the docker images COPY ci/scripts/r_docker_configure.sh /arrow/ci/scripts/ COPY ci/etc/rprofile /arrow/ci/etc/ +COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/r_docker_configure.sh COPY ci/scripts/r_deps.sh /arrow/ci/scripts/ diff --git a/ci/docker/ubuntu-18.04-cpp.dockerfile b/ci/docker/ubuntu-18.04-cpp.dockerfile index a0fe1b3f6be..bfff20b441c 100644 --- a/ci/docker/ubuntu-18.04-cpp.dockerfile +++ b/ci/docker/ubuntu-18.04-cpp.dockerfile @@ -70,6 +70,7 @@ RUN apt-get update -y -q && \ libboost-system-dev \ libbrotli-dev \ libbz2-dev \ + libcurl4-openssl-dev \ libgflags-dev \ libgoogle-glog-dev \ liblz4-dev \ @@ -96,6 +97,7 @@ RUN apt-get update -y -q && \ # - libgtest-dev only provide sources # - libprotobuf-dev only provide sources # - thrift is too old +# - s3 tests would require boost-asio that is included since Boost 1.66.0 ENV ARROW_BUILD_TESTS=ON \ ARROW_DEPENDENCY_SOURCE=SYSTEM \ ARROW_DATASET=ON \ @@ -117,6 +119,7 @@ ENV ARROW_BUILD_TESTS=ON \ ARROW_WITH_SNAPPY=ON \ ARROW_WITH_ZLIB=ON \ ARROW_WITH_ZSTD=ON \ + AWSSDK_SOURCE=BUNDLED \ GTest_SOURCE=BUNDLED \ ORC_SOURCE=BUNDLED \ PARQUET_BUILD_EXECUTABLES=ON \ diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index ce738f5e554..fbcda444915 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -17,6 +17,7 @@ ARG base=amd64/ubuntu:20.04 FROM ${base} +ARG arch SHELL ["/bin/bash", "-o", "pipefail", "-c"] @@ -29,7 +30,22 @@ RUN echo "debconf debconf/frontend select Noninteractive" | \ # while debugging package list with docker build. 
ARG clang_tools ARG llvm -RUN apt-get update -y -q && \ +RUN if [ "${llvm}" -gt "10" ]; then \ + apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends \ + apt-transport-https \ + ca-certificates \ + gnupg \ + wget && \ + wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ + echo "deb https://apt.llvm.org/focal/ llvm-toolchain-focal-${llvm} main" > \ + /etc/apt/sources.list.d/llvm.list && \ + if [ "${clang_tools}" != "${llvm}" -a "${clang_tools}" -gt 10 ]; then \ + echo "deb https://apt.llvm.org/focal/ llvm-toolchain-focal-${clang_tools} main" > \ + /etc/apt/sources.list.d/clang-tools.list; \ + fi \ + fi && \ + apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ clang-${clang_tools} \ clang-${llvm} \ @@ -57,6 +73,7 @@ RUN apt-get update -y -q && \ libbrotli-dev \ libbz2-dev \ libgflags-dev \ + libcurl4-openssl-dev \ libgoogle-glog-dev \ liblz4-dev \ libprotobuf-dev \ @@ -72,10 +89,15 @@ RUN apt-get update -y -q && \ pkg-config \ protobuf-compiler \ rapidjson-dev \ - tzdata && \ + tzdata \ + wget && \ apt-get clean && \ rm -rf /var/lib/apt/lists* +COPY ci/scripts/install_minio.sh \ + /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local + # Prioritize system packages and local installation # The following dependencies will be downloaded due to missing/invalid packages # provided by the distribution: @@ -95,6 +117,7 @@ ENV ARROW_BUILD_TESTS=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_PLASMA=ON \ + ARROW_S3=ON \ ARROW_USE_ASAN=OFF \ ARROW_USE_CCACHE=ON \ ARROW_USE_UBSAN=OFF \ @@ -104,6 +127,7 @@ ENV ARROW_BUILD_TESTS=ON \ ARROW_WITH_SNAPPY=ON \ ARROW_WITH_ZLIB=ON \ ARROW_WITH_ZSTD=ON \ + AWSSDK_SOURCE=BUNDLED \ GTest_SOURCE=BUNDLED \ ORC_SOURCE=BUNDLED \ PARQUET_BUILD_EXAMPLES=ON \ diff --git a/ci/etc/integration_spark_ARROW-9438.patch b/ci/etc/integration_spark_ARROW-9438.patch deleted file mode 100644 index 2baed303717..00000000000 --- a/ci/etc/integration_spark_ARROW-9438.patch +++ /dev/null @@ -1,72 +0,0 @@ -From 0b5388a945a7e5c5706cf00d0754540a6c68254d Mon Sep 17 00:00:00 2001 -From: Bryan Cutler -Date: Mon, 13 Jul 2020 23:12:25 -0700 -Subject: [PATCH] Update Arrow Java for 1.0.0 - ---- - pom.xml | 17 ++++++++++++++--- - sql/catalyst/pom.xml | 4 ++++ - 2 files changed, 18 insertions(+), 3 deletions(-) - -diff --git a/pom.xml b/pom.xml -index 08ca13bfe9..6619fca200 100644 ---- a/pom.xml -+++ b/pom.xml -@@ -199,7 +199,7 @@ - If you are changing Arrow version specification, please check ./python/pyspark/sql/utils.py, - and ./python/setup.py too. 
- --> -- 0.15.1 -+ 1.0.0-SNAPSHOT - - org.fusesource.leveldbjni - -@@ -2288,7 +2288,7 @@ - - - com.fasterxml.jackson.core -- jackson-databind -+ jackson-core - - - io.netty -@@ -2298,9 +2298,20 @@ - io.netty - netty-common - -+ -+ -+ -+ org.apache.arrow -+ arrow-memory-netty -+ ${arrow.version} -+ - - io.netty -- netty-handler -+ netty-buffer -+ -+ -+ io.netty -+ netty-common - - - -diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml -index 9edbb7fec9..6b79eb722f 100644 ---- a/sql/catalyst/pom.xml -+++ b/sql/catalyst/pom.xml -@@ -117,6 +117,10 @@ - org.apache.arrow - arrow-vector - -+ -+ org.apache.arrow -+ arrow-memory-netty -+ - - - target/scala-${scala.binary.version}/classes --- -2.17.1 - diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index df5f5860445..2bfbcafbaf7 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,13 +18,14 @@ _realname=arrow pkgbase=mingw-w64-${_realname} pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=1.0.0.9000 +pkgver=2.0.0.9000 pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") url="https://arrow.apache.org/" license=("Apache-2.0") -depends=("${MINGW_PACKAGE_PREFIX}-thrift" +depends=("${MINGW_PACKAGE_PREFIX}-aws-sdk-cpp" + "${MINGW_PACKAGE_PREFIX}-thrift" "${MINGW_PACKAGE_PREFIX}-snappy" "${MINGW_PACKAGE_PREFIX}-zlib" "${MINGW_PACKAGE_PREFIX}-lz4" @@ -74,6 +75,9 @@ build() { export PATH="/C/Rtools${MINGW_PREFIX/mingw/mingw_}/bin:$PATH" export CPPFLAGS="${CPPFLAGS} -I${MINGW_PREFIX}/include" export LIBS="-L${MINGW_PREFIX}/libs" + export ARROW_S3=OFF + else + export ARROW_S3=ON fi MSYS2_ARG_CONV_EXCL="-DCMAKE_INSTALL_PREFIX=" \ @@ -94,6 +98,7 @@ build() { -DARROW_MIMALLOC=ON \ -DARROW_PACKAGE_PREFIX="${MINGW_PREFIX}" \ -DARROW_PARQUET=ON \ + -DARROW_S3="${ARROW_S3}" \ -DARROW_SNAPPY_USE_SHARED=OFF \ -DARROW_USE_GLOG=OFF \ -DARROW_WITH_LZ4=ON \ diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index aebc02ed2cf..fe109b77b09 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -82,6 +82,7 @@ cmake -G "${CMAKE_GENERATOR:-Ninja}" \ -DARROW_PLASMA_JAVA_CLIENT=${ARROW_PLASMA_JAVA_CLIENT:-OFF} \ -DARROW_PLASMA=${ARROW_PLASMA:-OFF} \ -DARROW_PYTHON=${ARROW_PYTHON:-OFF} \ + -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \ -DARROW_S3=${ARROW_S3:-OFF} \ -DARROW_TEST_LINKAGE=${ARROW_TEST_LINKAGE:-shared} \ -DARROW_TEST_MEMCHECK=${ARROW_TEST_MEMCHECK:-OFF} \ @@ -100,6 +101,7 @@ cmake -G "${CMAKE_GENERATOR:-Ninja}" \ -DARROW_WITH_UTF8PROC=${ARROW_WITH_UTF8PROC:-ON} \ -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB:-OFF} \ -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD:-OFF} \ + -DAWSSDK_SOURCE=${AWSSDK_SOURCE:-} \ -Dbenchmark_SOURCE=${benchmark_SOURCE:-} \ -DBOOST_SOURCE=${BOOST_SOURCE:-} \ -DBrotli_SOURCE=${Brotli_SOURCE:-} \ diff --git a/ci/scripts/install_minio.sh b/ci/scripts/install_minio.sh index 9ed70afc03b..42f7ce040e0 100755 --- a/ci/scripts/install_minio.sh +++ b/ci/scripts/install_minio.sh @@ -20,11 +20,14 @@ set -e declare -A archs -archs=([amd64]=amd64) +archs=([amd64]=amd64 + [arm64v8]=arm64 + [arm32v7]=arm + [s390x]=s390x) declare -A platforms -platforms=([macos]=darwin - [linux]=linux) +platforms=([linux]=linux + [macos]=darwin) arch=${archs[$1]} platform=${platforms[$2]} @@ -34,10 +37,10 @@ prefix=$4 if [ "$#" -ne 4 ]; then echo "Usage: $0 <architecture> <platform> <version> <prefix>" exit 1 -elif [[ -z ${archs[$1]} ]]; then +elif [[ -z ${arch} ]]; then echo "Unexpected architecture: ${1}" exit 1 -elif [[ -z ${platforms[$2]} ]]; then +elif [[ -z ${platform} ]]; then echo "Unexpected
platform: ${2}" exit 1 elif [[ ${version} != "latest" ]]; then @@ -45,5 +48,5 @@ elif [[ ${version} != "latest" ]]; then exit 1 fi -wget -nv -P ${prefix}/bin https://dl.min.io/server/minio/release/linux-${arch}/minio +wget -nv -P ${prefix}/bin https://dl.min.io/server/minio/release/${platform}-${arch}/minio chmod +x ${prefix}/bin/minio diff --git a/ci/scripts/integration_kartothek.sh b/ci/scripts/integration_kartothek.sh index f1465ba40e6..6e89f726339 100755 --- a/ci/scripts/integration_kartothek.sh +++ b/ci/scripts/integration_kartothek.sh @@ -27,4 +27,4 @@ python -c "import pyarrow.parquet" python -c "import kartothek" pushd /kartothek -pytest -n0 +pytest -n0 --ignore tests/cli/test_query.py diff --git a/ci/scripts/integration_spark.sh b/ci/scripts/integration_spark.sh index 9828a28a1ec..a45ed7a7125 100755 --- a/ci/scripts/integration_spark.sh +++ b/ci/scripts/integration_spark.sh @@ -22,6 +22,9 @@ source_dir=${1} spark_dir=${2} spark_version=${SPARK_VERSION:-master} +# Use old behavior that always dropped tiemzones. +export PYARROW_IGNORE_TIMEZONE=1 + if [ "${SPARK_VERSION:0:2}" == "2." ]; then # https://github.com/apache/spark/blob/master/docs/sql-pyspark-pandas-with-arrow.md#compatibility-setting-for-pyarrow--0150-and-spark-23x-24x export ARROW_PRE_0_15_IPC_FORMAT=1 diff --git a/ci/scripts/java_build.sh b/ci/scripts/java_build.sh index b5643f77840..a2deafa17ba 100755 --- a/ci/scripts/java_build.sh +++ b/ci/scripts/java_build.sh @@ -23,6 +23,62 @@ source_dir=${1}/java cpp_build_dir=${2}/cpp/${ARROW_BUILD_TYPE:-debug} with_docs=${3:-false} +if [[ "$(uname -s)" == "Linux" ]] && [[ "$(uname -m)" == "s390x" ]]; then + # Since some files for s390_64 are not available at maven central, + # download pre-build files from bintray and install them explicitly + mvn_install="mvn install:install-file" + wget="wget" + bintray_base_url="https://dl.bintray.com/apache/arrow" + + bintray_dir="flatc-binary" + group="com.github.icexelloss" + artifact="flatc-linux-s390_64" + ver="1.9.0" + extension="exe" + target=${artifact}-${ver}.${extension} + ${wget} ${bintray_base_url}/${bintray_dir}/${ver}/${target} + ${mvn_install} -DgroupId=${group} -DartifactId=${artifact} -Dversion=${ver} -Dpackaging=${extension} -Dfile=$(pwd)/${target} + + bintray_dir="protoc-binary" + group="com.google.protobuf" + artifact="protoc" + ver="3.7.1" + classifier="linux-s390_64" + extension="exe" + target=${artifact}-${ver}-${classifier}.${extension} + ${wget} ${bintray_base_url}/${bintray_dir}/${ver}/${target} + ${mvn_install} -DgroupId=${group} -DartifactId=${artifact} -Dversion=${ver} -Dclassifier=${classifier} -Dpackaging=${extension} -Dfile=$(pwd)/${target} + # protoc requires libprotoc.so.18 libprotobuf.so.18 + ${wget} ${bintray_base_url}/${bintray_dir}/${ver}/libprotoc.so.18 + ${wget} ${bintray_base_url}/${bintray_dir}/${ver}/libprotobuf.so.18 + export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$(pwd) + + bintray_dir="protoc-gen-grpc-java-binary" + group="io.grpc" + artifact="protoc-gen-grpc-java" + ver="1.30.2" + classifier="linux-s390_64" + extension="exe" + target=${artifact}-${ver}-${classifier}.${extension} + ${wget} ${bintray_base_url}/${bintray_dir}/${ver}/${target} + ${mvn_install} -DgroupId=${group} -DartifactId=${artifact} -Dversion=${ver} -Dclassifier=${classifier} -Dpackaging=${extension} -Dfile=$(pwd)/${target} + + bintray_dir="netty-binary" + group="io.netty" + artifact="netty-transport-native-unix-common" + ver="4.1.48.Final" + classifier="linux-s390_64" + extension="jar" + 
target=${artifact}-${ver}-${classifier}.${extension} + ${wget} ${bintray_base_url}/${bintray_dir}/${ver}/${target} + ${mvn_install} -DgroupId=${group} -DartifactId=${artifact} -Dversion=${ver} -Dclassifier=${classifier} -Dpackaging=${extension} -Dfile=$(pwd)/${target} + artifact="netty-transport-native-epoll" + extension="jar" + target=${artifact}-${ver}-${classifier}.${extension} + ${wget} ${bintray_base_url}/${bintray_dir}/${ver}/${target} + ${mvn_install} -DgroupId=${group} -DartifactId=${artifact} -Dversion=${ver} -Dclassifier=${classifier} -Dpackaging=${extension} -Dfile=$(pwd)/${target} +fi + mvn="mvn -B -DskipTests -Drat.skip=true -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn" # Use `2 * ncores` threads mvn="${mvn} -T 2C" diff --git a/ci/scripts/msys2_setup.sh b/ci/scripts/msys2_setup.sh index a2845d1473b..51dc45fc3a2 100755 --- a/ci/scripts/msys2_setup.sh +++ b/ci/scripts/msys2_setup.sh @@ -24,25 +24,26 @@ target=$1 packages=() case "${target}" in cpp|c_glib|ruby) - packages+=(make) - packages+=(${MINGW_PACKAGE_PREFIX}-ccache) + packages+=(${MINGW_PACKAGE_PREFIX}-aws-sdk-cpp) packages+=(${MINGW_PACKAGE_PREFIX}-boost) packages+=(${MINGW_PACKAGE_PREFIX}-brotli) + packages+=(${MINGW_PACKAGE_PREFIX}-ccache) packages+=(${MINGW_PACKAGE_PREFIX}-clang) packages+=(${MINGW_PACKAGE_PREFIX}-cmake) packages+=(${MINGW_PACKAGE_PREFIX}-gcc) packages+=(${MINGW_PACKAGE_PREFIX}-gflags) packages+=(${MINGW_PACKAGE_PREFIX}-grpc) packages+=(${MINGW_PACKAGE_PREFIX}-gtest) + packages+=(${MINGW_PACKAGE_PREFIX}-libutf8proc) packages+=(${MINGW_PACKAGE_PREFIX}-llvm) packages+=(${MINGW_PACKAGE_PREFIX}-lz4) + packages+=(${MINGW_PACKAGE_PREFIX}-ninja) packages+=(${MINGW_PACKAGE_PREFIX}-polly) packages+=(${MINGW_PACKAGE_PREFIX}-protobuf) packages+=(${MINGW_PACKAGE_PREFIX}-python3-numpy) packages+=(${MINGW_PACKAGE_PREFIX}-rapidjson) packages+=(${MINGW_PACKAGE_PREFIX}-snappy) packages+=(${MINGW_PACKAGE_PREFIX}-thrift) - packages+=(${MINGW_PACKAGE_PREFIX}-libutf8proc) packages+=(${MINGW_PACKAGE_PREFIX}-zlib) packages+=(${MINGW_PACKAGE_PREFIX}-zstd) ;; diff --git a/ci/scripts/msys2_system_upgrade_phase1.sh b/ci/scripts/msys2_system_upgrade_phase1.sh index 0839228f419..aecd3089332 100755 --- a/ci/scripts/msys2_system_upgrade_phase1.sh +++ b/ci/scripts/msys2_system_upgrade_phase1.sh @@ -20,6 +20,9 @@ set -eux # https://www.msys2.org/news/#2020-06-29-new-packagers +msys2_repo_base_url=https://repo.msys2.org/msys +# Mirror +msys2_repo_base_url=https://sourceforge.net/projects/msys2/files/REPOS/MSYS2 msys2_keyring_pkg=msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz for suffix in "" ".sig"; do curl \ @@ -27,7 +30,7 @@ for suffix in "" ".sig"; do --remote-name \ --show-error \ --silent \ - https://repo.msys2.org/msys/x86_64/${msys2_keyring_pkg}${suffix} + ${msys2_repo_base_url}/x86_64/${msys2_keyring_pkg}${suffix} done pacman-key --verify ${msys2_keyring_pkg}.sig pacman \ diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh index 6f961d2f8e0..80a9cdef4a3 100755 --- a/ci/scripts/python_test.sh +++ b/ci/scripts/python_test.sh @@ -29,4 +29,4 @@ export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} # Enable some checks inside Python itself export PYTHONDEVMODE=1 -pytest -r s --pyargs pyarrow +pytest -r s ${PYTEST_ARGS} --pyargs pyarrow diff --git a/ci/scripts/r_deps.sh b/ci/scripts/r_deps.sh index a2dc58fd97b..7e9d2eac7a9 100755 --- a/ci/scripts/r_deps.sh +++ b/ci/scripts/r_deps.sh @@ -25,7 +25,7 @@ source_dir=${1}/r pushd ${source_dir} # Install R package dependencies 
-${R_BIN} -e "install.packages('remotes'); remotes::install_cran(c('glue', 'rcmdcheck'))" +${R_BIN} -e "install.packages('remotes'); remotes::install_cran(c('glue', 'rcmdcheck', 'sys'))" ${R_BIN} -e "remotes::install_deps(dependencies = TRUE)" popd diff --git a/ci/scripts/r_docker_configure.sh b/ci/scripts/r_docker_configure.sh index 1d7e8de8bf5..e6594e03a88 100755 --- a/ci/scripts/r_docker_configure.sh +++ b/ci/scripts/r_docker_configure.sh @@ -39,6 +39,25 @@ if [ "$RHUB_PLATFORM" = "linux-x86_64-fedora-clang" ]; then rm -rf $(${R_BIN} RHOME)/etc/Makeconf.bak fi +# Install openssl for S3 support +if [ "$ARROW_S3" == "ON" ] || [ "$ARROW_R_DEV" == "TRUE" ]; then + if [ "`which dnf`" ]; then + dnf install -y libcurl-devel openssl-devel + elif [ "`which yum`" ]; then + yum install -y libcurl-devel openssl-devel + elif [ "`which zypper`" ]; then + zypper install -y libcurl-devel libopenssl-devel + else + apt-get update + apt-get install -y libcurl4-openssl-dev libssl-dev + fi + + # The Dockerfile should have put this file here + if [ -f "/arrow/ci/scripts/install_minio.sh" ] && [ "`which wget`" ]; then + /arrow/ci/scripts/install_minio.sh amd64 linux latest /usr/local + fi +fi + # Workaround for html help install failure; see https://github.com/r-lib/devtools/issues/2084#issuecomment-530912786 Rscript -e 'x <- file.path(R.home("doc"), "html"); if (!file.exists(x)) {dir.create(x, recursive=TRUE); file.copy(system.file("html/R.css", package="stats"), x)}' diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh index 05c70d8a560..a2428e912be 100755 --- a/ci/scripts/r_test.sh +++ b/ci/scripts/r_test.sh @@ -59,6 +59,13 @@ ${R_BIN} -e "as_cran <- !identical(tolower(Sys.getenv('NOT_CRAN')), 'true') if (as_cran) { rcmdcheck::rcmdcheck(args = c('--as-cran', '--run-donttest'), error_on = 'warning', check_dir = 'check') } else { + if (nzchar(Sys.which('minio'))) { + message('Running minio for S3 tests (if build supports them)') + minio_dir <- tempfile() + dir.create(minio_dir) + pid <- sys::exec_background('minio', c('server', minio_dir)) + on.exit(tools::pskill(pid)) + } rcmdcheck::rcmdcheck(build_args = '--no-build-vignettes', args = c('--no-manual', '--ignore-vignettes', '--run-donttest'), error_on = 'warning', check_dir = 'check') }" diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh index ed9e211f9ac..cb33e676a7d 100755 --- a/ci/scripts/r_windows_build.sh +++ b/ci/scripts/r_windows_build.sh @@ -27,13 +27,25 @@ if [ "$RTOOLS_VERSION" = "35" ]; then # Use rtools-backports if building with rtools35 curl https://raw.githubusercontent.com/r-windows/rtools-backports/master/pacman.conf > /etc/pacman.conf # Update keys: https://www.msys2.org/news/#2020-06-29-new-packagers - curl -OSsl "http://repo.msys2.org/msys/x86_64/msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz" + msys2_repo_base_url=https://repo.msys2.org/msys + # Mirror + msys2_repo_base_url=https://sourceforge.net/projects/msys2/files/REPOS/MSYS2 + curl -OSsL "${msys2_repo_base_url}/x86_64/msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz" pacman -U --noconfirm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz && rm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz + # Use sf.net instead of http://repo.msys2.org/ temporary. 
+ sed -i -e "s,^Server = http://repo\.msys2\.org/msys,Server = ${msys2_repo_base_url},g" \ + /etc/pacman.conf pacman --noconfirm -Scc pacman --noconfirm -Syy # lib-4.9.3 is for libraries compiled with gcc 4.9 (Rtools 3.5) RWINLIB_LIB_DIR="lib-4.9.3" else + # Uncomment L38-41 if you're testing a new rtools dependency that hasn't yet sync'd to CRAN + # curl https://raw.githubusercontent.com/r-windows/rtools-packages/master/pacman.conf > /etc/pacman.conf + # curl -OSsl "http://repo.msys2.org/msys/x86_64/msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz" + # pacman -U --noconfirm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz && rm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz + # pacman --noconfirm -Scc + pacman --noconfirm -Syy RWINLIB_LIB_DIR="lib" fi @@ -84,8 +96,8 @@ cp $MSYS_LIB_DIR/mingw64/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/x6 cp $MSYS_LIB_DIR/mingw32/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/i386 # These are from https://dl.bintray.com/rtools/mingw{32,64}/ -cp $MSYS_LIB_DIR/mingw64/lib/lib{zstd,lz4,crypto}.a $DST_DIR/lib/x64 -cp $MSYS_LIB_DIR/mingw32/lib/lib{zstd,lz4,crypto}.a $DST_DIR/lib/i386 +cp $MSYS_LIB_DIR/mingw64/lib/lib{zstd,lz4,crypto,aws*}.a $DST_DIR/lib/x64 +cp $MSYS_LIB_DIR/mingw32/lib/lib{zstd,lz4,crypto,aws*}.a $DST_DIR/lib/i386 # Create build artifact zip -r ${DST_DIR}.zip $DST_DIR diff --git a/ci/scripts/rust_coverage.sh b/ci/scripts/rust_coverage.sh new file mode 100755 index 00000000000..fbe5b0d853a --- /dev/null +++ b/ci/scripts/rust_coverage.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -ex + +arrow_dir=${1} +source_dir=${1}/rust +build_dir=${2}/rust +rust=${3} + +export ARROW_TEST_DATA=${arrow_dir}/testing/data +export PARQUET_TEST_DATA=${arrow_dir}/cpp/submodules/parquet-testing/data +export CARGO_TARGET_DIR=${build_dir} + +pushd ${source_dir} + +rustup default ${rust} +rustup component add rustfmt --toolchain ${rust}-x86_64-unknown-linux-gnu +cargo install cargo-tarpaulin + +cargo tarpaulin --out Xml + +popd diff --git a/cpp/Brewfile b/cpp/Brewfile index 6887956f273..7de6c7deabe 100644 --- a/cpp/Brewfile +++ b/cpp/Brewfile @@ -28,13 +28,16 @@ brew "grpc" brew "llvm" brew "llvm@8" brew "lz4" +brew "minio" brew "ninja" brew "numpy" brew "openssl@1.1" brew "protobuf" brew "python" brew "rapidjson" -brew "re2" +# grpc bundles re2 and causes a conflict when Homebrew tries to install it, +# so temporarily skip installing re2. See ARROW-9972. 
+# brew "re2" brew "snappy" brew "thrift" brew "wget" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 721ed22811f..f9ab1548fbd 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -54,7 +54,7 @@ if(POLICY CMP0063) cmake_policy(SET CMP0063 NEW) endif() -set(ARROW_VERSION "2.0.0-SNAPSHOT") +set(ARROW_VERSION "3.0.0-SNAPSHOT") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") @@ -109,7 +109,12 @@ set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support") set(ARROW_CMAKE_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}") set(ARROW_DOC_DIR "share/doc/${PROJECT_NAME}") -set(ARROW_LLVM_VERSIONS "10" "9" "8" "7") +set(ARROW_LLVM_VERSIONS + "11" + "10" + "9" + "8" + "7") list(GET ARROW_LLVM_VERSIONS 0 ARROW_LLVM_VERSION_PRIMARY) string(REGEX REPLACE "^([0-9]+)(\\..+)?" "\\1" ARROW_LLVM_VERSION_PRIMARY_MAJOR @@ -148,6 +153,13 @@ if(APPLE) endif() endif() +if(WIN32 AND NOT MINGW) + # This is used to handle builds using e.g. clang in an MSVC setting. + set(MSVC_TOOLCHAIN TRUE) +else() + set(MSVC_TOOLCHAIN FALSE) +endif() + find_package(ClangTools) find_package(InferTools) if("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" OR CLANG_TIDY_FOUND OR INFER_FOUND) @@ -315,6 +327,7 @@ if(ARROW_BUILD_BENCHMARKS OR ARROW_BUILD_INTEGRATION OR ARROW_FUZZING) set(ARROW_JSON ON) + set(ARROW_TESTING ON) endif() if(ARROW_CUDA @@ -343,7 +356,7 @@ if(ARROW_PYTHON) set(ARROW_JSON ON) endif() -if(MSVC) +if(MSVC_TOOLCHAIN) # ORC doesn't build on windows set(ARROW_ORC OFF) # Plasma using glog is not fully tested on windows. @@ -410,6 +423,10 @@ if(ARROW_TEST_MEMCHECK) add_definitions(-DARROW_VALGRIND) endif() +if(ARROW_USE_UBSAN) + add_definitions(-DARROW_UBSAN) +endif() + # # Compiler flags # @@ -718,6 +735,7 @@ endif() if(ARROW_S3) list(APPEND ARROW_LINK_LIBS ${AWSSDK_LINK_LIBRARIES}) + list(APPEND ARROW_STATIC_LINK_LIBS ${AWSSDK_LINK_LIBRARIES}) endif() if(ARROW_WITH_UTF8PROC) @@ -741,7 +759,7 @@ add_dependencies(arrow_test_dependencies toolchain-tests) if(ARROW_STATIC_LINK_LIBS) add_dependencies(arrow_dependencies ${ARROW_STATIC_LINK_LIBS}) if(ARROW_ORC) - if(NOT MSVC) + if(NOT MSVC_TOOLCHAIN) list(APPEND ARROW_STATIC_LINK_LIBS ${CMAKE_DL_LIBS}) list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS ${CMAKE_DL_LIBS}) endif() @@ -755,7 +773,7 @@ if(((ARROW_FLIGHT OR ARROW_S3) AND (ARROW_BUILD_TESTS OR ARROW_BUILD_INTEGRATION list(APPEND ARROW_TEST_LINK_LIBS ${BOOST_FILESYSTEM_LIBRARY} ${BOOST_SYSTEM_LIBRARY}) endif() -if(NOT MSVC) +if(NOT MSVC_TOOLCHAIN) list(APPEND ARROW_LINK_LIBS ${CMAKE_DL_LIBS}) list(APPEND ARROW_SHARED_INSTALL_INTERFACE_LIBS ${CMAKE_DL_LIBS}) endif() diff --git a/cpp/build-support/run-test.sh b/cpp/build-support/run-test.sh index 7e2a22f069a..1cda600d154 100755 --- a/cpp/build-support/run-test.sh +++ b/cpp/build-support/run-test.sh @@ -96,12 +96,14 @@ function run_test() { # even when retries are successful. rm -f $XMLFILE - $TEST_EXECUTABLE "$@" 2>&1 \ + $TEST_EXECUTABLE "$@" > $LOGFILE.raw 2>&1 + STATUS=$? + cat $LOGFILE.raw \ | ${PYTHON:-python} $ROOT/build-support/asan_symbolize.py \ | ${CXXFILT:-c++filt} \ | $ROOT/build-support/stacktrace_addr2line.pl $TEST_EXECUTABLE \ | $pipe_cmd 2>&1 | tee $LOGFILE - STATUS=$? + rm -f $LOGFILE.raw # TSAN doesn't always exit with a non-zero exit code due to a bug: # mutex errors don't get reported through the normal error reporting infrastructure. 
diff --git a/cpp/build-support/sanitizer-disallowed-entries.txt b/cpp/build-support/sanitizer-disallowed-entries.txt index f6900c643db..636cfda233a 100644 --- a/cpp/build-support/sanitizer-disallowed-entries.txt +++ b/cpp/build-support/sanitizer-disallowed-entries.txt @@ -20,3 +20,6 @@ # Seen error: # thirdparty/gmock-1.7.0/include/gmock/gmock-spec-builders.h:1529:12: runtime error: member call on null pointer of type 'testing::internal::ActionResultHolder' fun:*testing*internal*InvokeWith* + +# Workaround for RapidJSON https://github.com/Tencent/rapidjson/issues/1724 +src:*/rapidjson/internal/* diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index f92966f78eb..2fd897b5d1d 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -30,6 +30,22 @@ set(ARROW_LIBRARY_PATH_SUFFIXES "Library/bin") set(ARROW_INCLUDE_PATH_SUFFIXES "include" "Library" "Library/include") +set(ARROW_BOOST_PROCESS_COMPILE_DEFINITIONS) +if(WIN32 AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # boost/process/detail/windows/handle_workaround.hpp doesn't work + # without BOOST_USE_WINDOWS_H with MinGW because MinGW doesn't + # provide __kernel_entry without winternl.h. + # + # See also: + # https://github.com/boostorg/process/blob/develop/include/boost/process/detail/windows/handle_workaround.hpp + # + # You can use this like the following: + # + # target_compile_definitions(target PRIVATE + # ${ARROW_BOOST_PROCESS_COMPILE_DEFINITIONS}) + list(APPEND ARROW_BOOST_PROCESS_COMPILE_DEFINITIONS "BOOST_USE_WINDOWS_H=1") +endif() + function(ADD_THIRDPARTY_LIB LIB_NAME) set(options) set(one_value_args SHARED_LIB STATIC_LIB) @@ -161,7 +177,7 @@ function(create_merged_static_lib output_target) "-o" ${output_lib_path} ${all_library_paths}) - elseif(CMAKE_CXX_COMPILER_ID MATCHES "^(Clang|GNU)$") + elseif(CMAKE_CXX_COMPILER_ID MATCHES "^(Clang|GNU|Intel)$") set(ar_script_path ${CMAKE_BINARY_DIR}/${ARG_NAME}.ar) file(WRITE ${ar_script_path}.in "CREATE ${output_lib_path}\n") @@ -424,7 +440,7 @@ function(ADD_ARROW_LIB LIB_NAME) target_include_directories(${LIB_NAME}_static PRIVATE ${ARG_PRIVATE_INCLUDES}) endif() - if(MSVC) + if(MSVC_TOOLCHAIN) set(LIB_NAME_STATIC ${LIB_NAME}_static) else() set(LIB_NAME_STATIC ${LIB_NAME}) diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index cf860cb4189..a68c3a92cc7 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -105,15 +105,27 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") OFF) define_option_string(ARROW_SIMD_LEVEL - "SIMD compiler optimization level" + "Compile-time SIMD optimization level" "SSE4_2" # default to SSE4.2 "NONE" "SSE4_2" "AVX2" "AVX512") + define_option_string(ARROW_RUNTIME_SIMD_LEVEL + "Max runtime SIMD optimization level" + "MAX" # default to max supported by compiler + "NONE" + "SSE4_2" + "AVX2" + "AVX512" + "MAX") + # Arm64 architectures and extensions can lead to exploding combinations. # So set it directly through cmake command line. + # + # If you change this, you need to change the definition in + # python/CMakeLists.txt too. 
define_option_string(ARROW_ARMV8_ARCH "Arm64 arch and extensions" "armv8-a" # Default @@ -244,6 +256,8 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") define_option(ARROW_TENSORFLOW "Build Arrow with TensorFlow support enabled" OFF) + define_option(ARROW_TESTING "Build the Arrow testing libraries" OFF) + #---------------------------------------------------------------------- set_option_category("Thirdparty toolchain") @@ -304,6 +318,9 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") define_option(ARROW_LZ4_USE_SHARED "Rely on lz4 shared libraries where relevant" ${ARROW_DEPENDENCY_USE_SHARED}) + define_option(ARROW_OPENSSL_USE_SHARED "Rely on OpenSSL shared libraries where relevant" + ${ARROW_DEPENDENCY_USE_SHARED}) + define_option(ARROW_PROTOBUF_USE_SHARED "Rely on Protocol Buffers shared libraries where relevant" ${ARROW_DEPENDENCY_USE_SHARED}) @@ -348,7 +365,7 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Build with support for Unicode properties using the utf8proc library" ON) #---------------------------------------------------------------------- - if(MSVC) + if(MSVC_TOOLCHAIN) set_option_category("MSVC") define_option(MSVC_LINK_VERBOSE diff --git a/cpp/cmake_modules/FindArrow.cmake b/cpp/cmake_modules/FindArrow.cmake index 02fd9a15801..9c987665896 100644 --- a/cpp/cmake_modules/FindArrow.cmake +++ b/cpp/cmake_modules/FindArrow.cmake @@ -39,6 +39,13 @@ endif() include(FindPkgConfig) include(FindPackageHandleStandardArgs) +if(WIN32 AND NOT MINGW) + # This is used to handle builds using e.g. clang in an MSVC setting. + set(MSVC_TOOLCHAIN TRUE) +else() + set(MSVC_TOOLCHAIN FALSE) +endif() + set(ARROW_SEARCH_LIB_PATH_SUFFIXES) if(CMAKE_LIBRARY_ARCHITECTURE) list(APPEND ARROW_SEARCH_LIB_PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}") @@ -61,7 +68,7 @@ if(CMAKE_BUILD_TYPE) endif() if(NOT DEFINED ARROW_MSVC_STATIC_LIB_SUFFIX) - if(MSVC) + if(MSVC_TOOLCHAIN) set(ARROW_MSVC_STATIC_LIB_SUFFIX "_static") else() set(ARROW_MSVC_STATIC_LIB_SUFFIX "") @@ -147,7 +154,7 @@ macro(arrow_find_package_home) set(include_dir "${${prefix}_include_dir}") set(${prefix}_INCLUDE_DIR "${include_dir}" PARENT_SCOPE) - if(MSVC) + if(MSVC_TOOLCHAIN) set(CMAKE_SHARED_LIBRARY_SUFFIXES_ORIGINAL ${CMAKE_FIND_LIBRARY_SUFFIXES}) # .dll isn't found by find_library with MSVC because .dll isn't included in # CMAKE_FIND_LIBRARY_SUFFIXES. 
@@ -158,7 +165,7 @@ macro(arrow_find_package_home) PATHS "${home}" PATH_SUFFIXES ${ARROW_SEARCH_LIB_PATH_SUFFIXES} NO_DEFAULT_PATH) - if(MSVC) + if(MSVC_TOOLCHAIN) set(CMAKE_SHARED_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES_ORIGINAL}) endif() set(shared_lib "${${prefix}_shared_lib}") diff --git a/cpp/cmake_modules/FindBoostAlt.cmake b/cpp/cmake_modules/FindBoostAlt.cmake index 8f16439bf86..300080d4fb1 100644 --- a/cpp/cmake_modules/FindBoostAlt.cmake +++ b/cpp/cmake_modules/FindBoostAlt.cmake @@ -52,7 +52,7 @@ endif() if(Boost_FOUND) set(BoostAlt_FOUND ON) - if(MSVC) + if(MSVC_TOOLCHAIN) # disable autolinking in boost add_definitions(-DBOOST_ALL_NO_LIB) if(ARROW_BOOST_USE_SHARED) diff --git a/cpp/cmake_modules/FindGTest.cmake b/cpp/cmake_modules/FindGTest.cmake index 4be2cf57046..8581d921b1c 100644 --- a/cpp/cmake_modules/FindGTest.cmake +++ b/cpp/cmake_modules/FindGTest.cmake @@ -171,7 +171,7 @@ if(NOT DEFINED GTEST_MSVC_SEARCH) endif() set(_gtest_libpath_suffixes lib) -if(MSVC) +if(MSVC_TOOLCHAIN) if(GTEST_MSVC_SEARCH STREQUAL "MD") list(APPEND _gtest_libpath_suffixes msvc/gtest-md/Debug @@ -198,7 +198,7 @@ find_path(GTEST_INCLUDE_DIR gtest/gtest.h ) mark_as_advanced(GTEST_INCLUDE_DIR) -if(MSVC AND GTEST_MSVC_SEARCH STREQUAL "MD") +if(MSVC_TOOLCHAIN AND GTEST_MSVC_SEARCH STREQUAL "MD") # The provided /MD project files for Google Test add -md suffixes to the # library names. __gtest_find_library(GTEST_LIBRARY gtest-md gtest) diff --git a/cpp/cmake_modules/FindLz4.cmake b/cpp/cmake_modules/FindLz4.cmake index dbc6c5f8d3b..7159f96f70f 100644 --- a/cpp/cmake_modules/FindLz4.cmake +++ b/cpp/cmake_modules/FindLz4.cmake @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -if(MSVC AND NOT DEFINED LZ4_MSVC_LIB_PREFIX) +if(MSVC_TOOLCHAIN AND NOT DEFINED LZ4_MSVC_LIB_PREFIX) set(LZ4_MSVC_LIB_PREFIX "lib") endif() set(LZ4_LIB_NAME_BASE "${LZ4_MSVC_LIB_PREFIX}lz4") diff --git a/cpp/cmake_modules/FindPython3Alt.cmake b/cpp/cmake_modules/FindPython3Alt.cmake index 11f26f5a1e4..131a0d395fc 100644 --- a/cpp/cmake_modules/FindPython3Alt.cmake +++ b/cpp/cmake_modules/FindPython3Alt.cmake @@ -36,16 +36,26 @@ if(${CMAKE_VERSION} VERSION_LESS "3.15.0") find_package_handle_standard_args(Python3Alt REQUIRED_VARS PYTHON_EXECUTABLE - PYTHON_LIBRARIES PYTHON_INCLUDE_DIRS NUMPY_INCLUDE_DIRS) return() endif() -if(Python3Alt_FIND_REQUIRED) - find_package(Python3 COMPONENTS Interpreter Development NumPy REQUIRED) +if(${CMAKE_VERSION} VERSION_LESS "3.18.0" OR ARROW_BUILD_TESTS) + # When building arrow-python-test, we need libpython to be present, so ask for + # the full "Development" component. Also ask for it on CMake < 3.18, + # where "Development.Module" is not available. 
+ if(Python3Alt_FIND_REQUIRED) + find_package(Python3 COMPONENTS Interpreter Development NumPy REQUIRED) + else() + find_package(Python3 COMPONENTS Interpreter Development NumPy) + endif() else() - find_package(Python3 COMPONENTS Interpreter Development NumPy) + if(Python3Alt_FIND_REQUIRED) + find_package(Python3 COMPONENTS Interpreter Development.Module NumPy REQUIRED) + else() + find_package(Python3 COMPONENTS Interpreter Development.Module NumPy) + endif() endif() if(NOT Python3_FOUND) @@ -85,6 +95,5 @@ endfunction() find_package_handle_standard_args(Python3Alt REQUIRED_VARS PYTHON_EXECUTABLE - PYTHON_LIBRARIES PYTHON_INCLUDE_DIRS NUMPY_INCLUDE_DIRS) diff --git a/cpp/cmake_modules/FindRE2.cmake b/cpp/cmake_modules/FindRE2.cmake index 4562e3ae89b..645a20f7c09 100644 --- a/cpp/cmake_modules/FindRE2.cmake +++ b/cpp/cmake_modules/FindRE2.cmake @@ -63,8 +63,17 @@ endif() find_package_handle_standard_args(RE2 REQUIRED_VARS RE2_LIB RE2_INCLUDE_DIR) if(RE2_FOUND) - add_library(RE2::re2 UNKNOWN IMPORTED) - set_target_properties(RE2::re2 - PROPERTIES IMPORTED_LOCATION "${RE2_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${RE2_INCLUDE_DIR}") + if(NOT TARGET RE2::re2) + add_library(RE2::re2 UNKNOWN IMPORTED) + set_target_properties(RE2::re2 + PROPERTIES IMPORTED_LOCATION "${RE2_LIB}" + INTERFACE_INCLUDE_DIRECTORIES "${RE2_INCLUDE_DIR}") + endif() + # Some third-party dependencies (namely gRPC) are on the look-out for a lower-case re2 Target. + if(NOT TARGET re2::re2) + add_library(re2::re2 UNKNOWN IMPORTED) + set_target_properties(re2::re2 + PROPERTIES IMPORTED_LOCATION "${RE2_LIB}" + INTERFACE_INCLUDE_DIRECTORIES "${RE2_INCLUDE_DIR}") + endif() endif() diff --git a/cpp/cmake_modules/FindThrift.cmake b/cpp/cmake_modules/FindThrift.cmake index d266f02e29b..273d907ed07 100644 --- a/cpp/cmake_modules/FindThrift.cmake +++ b/cpp/cmake_modules/FindThrift.cmake @@ -39,7 +39,7 @@ function(EXTRACT_THRIFT_VERSION) endif() endfunction(EXTRACT_THRIFT_VERSION) -if(MSVC AND NOT DEFINED THRIFT_MSVC_LIB_SUFFIX) +if(MSVC_TOOLCHAIN AND NOT DEFINED THRIFT_MSVC_LIB_SUFFIX) if(NOT ARROW_THRIFT_USE_SHARED) if(ARROW_USE_STATIC_CRT) set(THRIFT_MSVC_LIB_SUFFIX "mt") @@ -133,7 +133,7 @@ if(Thrift_FOUND OR THRIFT_FOUND) set_target_properties(thrift::thrift PROPERTIES IMPORTED_LOCATION "${THRIFT_LIB}" INTERFACE_INCLUDE_DIRECTORIES "${THRIFT_INCLUDE_DIR}") - if(WIN32 AND NOT MSVC) + if(WIN32 AND NOT MSVC_TOOLCHAIN) # We don't need this for Visual C++ because Thrift uses # "#pragma comment(lib, "Ws2_32.lib")" in # thrift/windows/config.h for Visual C++. 
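The FindRE2.cmake hunk above wraps each add_library(... IMPORTED) in an if(NOT TARGET ...) guard: CMake treats redefining an existing target as a fatal error, and per the patch's comment a bundled gRPC may already have created the lower-case re2::re2 target by the time this module runs. A small, hedged sketch of the guard pattern; the library and include paths below are placeholders, not values taken from this patch:

    # Only create the IMPORTED target if nothing else (e.g. gRPC's own
    # CMake config) has already defined it; redefinition would be fatal.
    if(NOT TARGET re2::re2)
      add_library(re2::re2 UNKNOWN IMPORTED)
      set_target_properties(re2::re2
                            PROPERTIES IMPORTED_LOCATION "/usr/lib/libre2.so"
                                       INTERFACE_INCLUDE_DIRECTORIES "/usr/include")
    endif()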
diff --git a/cpp/cmake_modules/Findzstd.cmake b/cpp/cmake_modules/Findzstd.cmake index f7c68134e9d..6659a682da7 100644 --- a/cpp/cmake_modules/Findzstd.cmake +++ b/cpp/cmake_modules/Findzstd.cmake @@ -23,18 +23,24 @@ set(ZSTD_LIB_NAME_BASE "${ZSTD_MSVC_LIB_PREFIX}zstd") if(ARROW_ZSTD_USE_SHARED) set(ZSTD_LIB_NAMES) if(CMAKE_IMPORT_LIBRARY_SUFFIX) - list(APPEND ZSTD_LIB_NAMES - "${CMAKE_IMPORT_LIBRARY_PREFIX}${ZSTD_LIB_NAME_BASE}${CMAKE_IMPORT_LIBRARY_SUFFIX}") + list( + APPEND + ZSTD_LIB_NAMES + "${CMAKE_IMPORT_LIBRARY_PREFIX}${ZSTD_LIB_NAME_BASE}${CMAKE_IMPORT_LIBRARY_SUFFIX}" + ) endif() - list(APPEND ZSTD_LIB_NAMES - "${CMAKE_SHARED_LIBRARY_PREFIX}${ZSTD_LIB_NAME_BASE}${CMAKE_SHARED_LIBRARY_SUFFIX}") + list( + APPEND + ZSTD_LIB_NAMES + "${CMAKE_SHARED_LIBRARY_PREFIX}${ZSTD_LIB_NAME_BASE}${CMAKE_SHARED_LIBRARY_SUFFIX}") else() if(MSVC AND NOT DEFINED ZSTD_MSVC_STATIC_LIB_SUFFIX) set(ZSTD_MSVC_STATIC_LIB_SUFFIX "_static") endif() set(ZSTD_STATIC_LIB_SUFFIX "${ZSTD_MSVC_STATIC_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(ZSTD_LIB_NAMES "${CMAKE_STATIC_LIBRARY_PREFIX}${ZSTD_LIB_NAME_BASE}${ZSTD_STATIC_LIB_SUFFIX}") + set(ZSTD_LIB_NAMES + "${CMAKE_STATIC_LIBRARY_PREFIX}${ZSTD_LIB_NAME_BASE}${ZSTD_STATIC_LIB_SUFFIX}") endif() # First, find via if specified ZTD_ROOT @@ -66,7 +72,9 @@ else() PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) else() # Third, check all other CMake paths - find_library(ZSTD_LIB NAMES ${ZSTD_LIB_NAMES} PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) + find_library(ZSTD_LIB + NAMES ${ZSTD_LIB_NAMES} + PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) find_path(ZSTD_INCLUDE_DIR NAMES zstd.h PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) endif() endif() diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index bf1767260db..a5cd95bd7ab 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -18,6 +18,7 @@ # Check if the target architecture and compiler supports some special # instruction sets that would boost performance. include(CheckCXXCompilerFlag) +include(CheckCXXSourceCompiles) # Get cpu architecture message(STATUS "System processor: ${CMAKE_SYSTEM_PROCESSOR}") @@ -46,9 +47,13 @@ if(ARROW_CPU_FLAG STREQUAL "x86") set(CXX_SUPPORTS_SSE4_2 TRUE) else() set(ARROW_SSE4_2_FLAG "-msse4.2") - set(ARROW_AVX2_FLAG "-mavx2") + set(ARROW_AVX2_FLAG "-march=haswell") # skylake-avx512 consists of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ set(ARROW_AVX512_FLAG "-march=skylake-avx512 -mbmi2") + # Append the avx2/avx512 subset option also, fix issue ARROW-9877 for homebrew-cpp + set(ARROW_AVX2_FLAG "${ARROW_AVX2_FLAG} -mavx2") + set(ARROW_AVX512_FLAG + "${ARROW_AVX512_FLAG} -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw") check_cxx_compiler_flag(${ARROW_SSE4_2_FLAG} CXX_SUPPORTS_SSE4_2) endif() check_cxx_compiler_flag(${ARROW_AVX2_FLAG} CXX_SUPPORTS_AVX2) @@ -56,17 +61,37 @@ if(ARROW_CPU_FLAG STREQUAL "x86") # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782 message(STATUS "Disable AVX512 support on MINGW for now") else() - check_cxx_compiler_flag(${ARROW_AVX512_FLAG} CXX_SUPPORTS_AVX512) + # Check for AVX512 support in the compiler. 
+ set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${ARROW_AVX512_FLAG}") + check_cxx_source_compiles(" + #ifdef _MSC_VER + #include <intrin.h> + #else + #include <immintrin.h> + #endif + + int main() { + __m512i mask = _mm512_set1_epi32(0x1); + char out[32]; + _mm512_storeu_si512(out, mask); + return 0; + }" CXX_SUPPORTS_AVX512) + set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS}) endif() - # Runtime SIMD level it can get from compiler - if(CXX_SUPPORTS_SSE4_2) + # Runtime SIMD level allowed by both compiler support and ARROW_RUNTIME_SIMD_LEVEL + if(CXX_SUPPORTS_SSE4_2 + AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(SSE4_2|AVX2|AVX512|MAX)$") + set(ARROW_HAVE_RUNTIME_SSE4_2 ON) add_definitions(-DARROW_HAVE_RUNTIME_SSE4_2) endif() - if(CXX_SUPPORTS_AVX2) - add_definitions(-DARROW_HAVE_RUNTIME_AVX2) + if(CXX_SUPPORTS_AVX2 AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(AVX2|AVX512|MAX)$") + set(ARROW_HAVE_RUNTIME_AVX2 ON) + add_definitions(-DARROW_HAVE_RUNTIME_AVX2 -DARROW_HAVE_RUNTIME_BMI2) endif() - if(CXX_SUPPORTS_AVX512) - add_definitions(-DARROW_HAVE_RUNTIME_AVX512) + if(CXX_SUPPORTS_AVX512 AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(AVX512|MAX)$") + set(ARROW_HAVE_RUNTIME_AVX512 ON) + add_definitions(-DARROW_HAVE_RUNTIME_AVX512 -DARROW_HAVE_RUNTIME_BMI2) endif() elseif(ARROW_CPU_FLAG STREQUAL "ppc") # power compiler flags, gcc/clang only @@ -101,7 +126,7 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE) set(UNKNOWN_COMPILER_MESSAGE - "Unknown compiler: ${CMAKE_CXX_COMPILER_VERSION} ${CMAKE_CXX_COMPILER_VERSION}") + "Unknown compiler: ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") # compiler flags that are common across debug/release builds if(WIN32) @@ -258,7 +283,8 @@ else() set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /W3") elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" - OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU" + OR CMAKE_CXX_COMPILER_ID STREQUAL "Intel") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wall") else() message(FATAL_ERROR "${UNKNOWN_COMPILER_MESSAGE}") @@ -283,6 +309,13 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set(CXX_ONLY_FLAGS "${CXX_ONLY_FLAGS} -Wno-noexcept-type") endif() + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.2") + # Disabling semantic interposition allows faster calling conventions + # when calling global functions internally, and can also help inlining.
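The ARROW_HAVE_RUNTIME_* definitions above gate translation units that are compiled with the extra SIMD flags but only executed after a runtime CPU check. A minimal sketch of that dispatch pattern, assuming a SumAvx2 kernel defined in a separate file built with ${ARROW_AVX2_FLAG} (the function names are illustrative, not Arrow's):

    // Compile-time-gated runtime dispatch: the AVX2 path exists only when
    // the toolchain could build it, and is taken only when the host CPU
    // supports it (__builtin_cpu_supports is a GCC/Clang builtin).
    #include <cstddef>
    #include <cstdint>
    #include <iostream>

    int64_t SumScalar(const int64_t* values, size_t n) {
      int64_t total = 0;
      for (size_t i = 0; i < n; ++i) total += values[i];
      return total;
    }

    #if defined(ARROW_HAVE_RUNTIME_AVX2)
    // Assumed to be defined in a separate TU compiled with the AVX2 flags.
    int64_t SumAvx2(const int64_t* values, size_t n);
    #endif

    int64_t Sum(const int64_t* values, size_t n) {
    #if defined(ARROW_HAVE_RUNTIME_AVX2)
      if (__builtin_cpu_supports("avx2")) return SumAvx2(values, n);
    #endif
      return SumScalar(values, n);
    }

    int main() {
      const int64_t data[] = {1, 2, 3, 4};
      std::cout << Sum(data, 4) << std::endl;
      return 0;
    }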
+ # See https://stackoverflow.com/questions/35745543/new-option-in-gcc-5-3-fno-semantic-interposition + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -fno-semantic-interposition") + endif() + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9") # Add colors when paired with ninja set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdiagnostics-color=always") diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index f983d58f9b9..d084f346803 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -249,6 +249,10 @@ if(ARROW_ORC OR ARROW_FLIGHT OR ARROW_GANDIVA) set(ARROW_WITH_PROTOBUF ON) endif() +if(ARROW_S3) + set(ARROW_WITH_ZLIB ON) +endif() + if(NOT ARROW_COMPUTE) # utf8proc is only potentially used in kernels for now set(ARROW_WITH_UTF8PROC OFF) @@ -300,6 +304,33 @@ else() "https://github.com/abseil/abseil-cpp/archive/${ARROW_ABSL_BUILD_VERSION}.tar.gz") endif() +if(DEFINED ENV{ARROW_AWS_C_COMMON_URL}) + set(AWS_C_COMMON_SOURCE_URL "$ENV{ARROW_AWS_C_COMMON_URL}") +else() + set_urls( + AWS_C_COMMON_SOURCE_URL + "https://github.com/awslabs/aws-c-common/archive/${ARROW_AWS_C_COMMON_BUILD_VERSION}.tar.gz" + ) +endif() + +if(DEFINED ENV{ARROW_AWS_CHECKSUMS_URL}) + set(AWS_CHECKSUMS_SOURCE_URL "$ENV{ARROW_AWS_CHECKSUMS_URL}") +else() + set_urls( + AWS_CHECKSUMS_SOURCE_URL + "https://github.com/awslabs/aws-checksums/archive/${ARROW_AWS_CHECKSUMS_BUILD_VERSION}.tar.gz" + ) +endif() + +if(DEFINED ENV{ARROW_AWS_C_EVENT_STREAM_URL}) + set(AWS_C_EVENT_STREAM_SOURCE_URL "$ENV{ARROW_AWS_C_EVENT_STREAM_URL}") +else() + set_urls( + AWS_C_EVENT_STREAM_SOURCE_URL + "https://github.com/awslabs/aws-c-event-stream/archive/${ARROW_AWS_C_EVENT_STREAM_BUILD_VERSION}.tar.gz" + ) +endif() + if(DEFINED ENV{ARROW_AWSSDK_URL}) set(AWSSDK_SOURCE_URL "$ENV{ARROW_AWSSDK_URL}") else() @@ -552,7 +583,7 @@ endif() set(EP_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}}") set(EP_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}}") -if(NOT MSVC) +if(NOT MSVC_TOOLCHAIN) # Set -fPIC on all external projects set(EP_CXX_FLAGS "${EP_CXX_FLAGS} -fPIC") set(EP_C_FLAGS "${EP_C_FLAGS} -fPIC") @@ -971,8 +1002,24 @@ endif() set(ARROW_USE_OPENSSL OFF) if(PARQUET_REQUIRE_ENCRYPTION OR ARROW_FLIGHT OR ARROW_S3) - # This must work - find_package(OpenSSL ${ARROW_OPENSSL_REQUIRED_VERSION} REQUIRED) + # OpenSSL is required + if(ARROW_OPENSSL_USE_SHARED) + # Find shared OpenSSL libraries. + set(OpenSSL_USE_STATIC_LIBS OFF) + # Seems that different envs capitalize this differently? 
+ set(OPENSSL_USE_STATIC_LIBS OFF) + set(BUILD_SHARED_LIBS_KEEP ${BUILD_SHARED_LIBS}) + set(BUILD_SHARED_LIBS ON) + + find_package(OpenSSL ${ARROW_OPENSSL_REQUIRED_VERSION} REQUIRED) + set(BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS_KEEP}) + unset(BUILD_SHARED_LIBS_KEEP) + else() + # Find static OpenSSL headers and libs + set(OpenSSL_USE_STATIC_LIBS ON) + set(OPENSSL_USE_STATIC_LIBS ON) + find_package(OpenSSL ${ARROW_OPENSSL_REQUIRED_VERSION} REQUIRED) + endif() set(ARROW_USE_OPENSSL ON) endif() @@ -1348,7 +1395,7 @@ if(ARROW_WITH_PROTOBUF) endif() resolve_dependency(Protobuf REQUIRED_VERSION ${ARROW_PROTOBUF_REQUIRED_VERSION}) - if(ARROW_PROTOBUF_USE_SHARED AND MSVC) + if(ARROW_PROTOBUF_USE_SHARED AND MSVC_TOOLCHAIN) add_definitions(-DPROTOBUF_USE_DLLS) endif() @@ -1454,7 +1501,7 @@ if(ARROW_JEMALLOC) BUILD_IN_SOURCE 1 BUILD_COMMAND ${JEMALLOC_BUILD_COMMAND} BUILD_BYPRODUCTS "${JEMALLOC_STATIC_LIB}" - INSTALL_COMMAND ${MAKE} install) + INSTALL_COMMAND ${MAKE} -j1 install) # Don't use the include directory directly so that we can point to a path # that is unique to our codebase. @@ -1647,10 +1694,7 @@ macro(build_gtest) add_dependencies(GTest::gmock googletest_ep) endmacro() -if(ARROW_BUILD_TESTS - OR ARROW_BUILD_BENCHMARKS - OR ARROW_BUILD_INTEGRATION - OR ARROW_FUZZING) +if(ARROW_TESTING) resolve_dependency(GTest) if(NOT GTEST_VENDORED) @@ -1780,6 +1824,7 @@ macro(build_rapidjson) set(RAPIDJSON_INCLUDE_DIR "${RAPIDJSON_PREFIX}/include") add_dependencies(toolchain rapidjson_ep) + add_dependencies(toolchain-tests rapidjson_ep) add_dependencies(rapidjson rapidjson_ep) set(RAPIDJSON_VENDORED TRUE) @@ -1850,9 +1895,11 @@ macro(build_zlib) file(MAKE_DIRECTORY "${ZLIB_PREFIX}/include") add_library(ZLIB::ZLIB STATIC IMPORTED) + set(ZLIB_LIBRARIES ${ZLIB_STATIC_LIB}) + set(ZLIB_INCLUDE_DIRS "${ZLIB_PREFIX}/include") set_target_properties(ZLIB::ZLIB - PROPERTIES IMPORTED_LOCATION "${ZLIB_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${ZLIB_PREFIX}/include") + PROPERTIES IMPORTED_LOCATION ${ZLIB_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${ZLIB_INCLUDE_DIRS}) add_dependencies(toolchain zlib_ep) add_dependencies(ZLIB::ZLIB zlib_ep) @@ -2195,6 +2242,13 @@ macro(build_cares) INTERFACE_INCLUDE_DIRECTORIES "${CARES_INCLUDE_DIR}") add_dependencies(c-ares::cares cares_ep) + if(APPLE) + # libresolv must be linked from c-ares version 1.16.1 + find_library(LIBRESOLV_LIBRARY NAMES resolv libresolv REQUIRED) + set_target_properties(c-ares::cares + PROPERTIES INTERFACE_LINK_LIBRARIES "${LIBRESOLV_LIBRARY}") + endif() + set(CARES_VENDORED TRUE) list(APPEND ARROW_BUNDLED_STATIC_LIBS c-ares::cares) @@ -2612,12 +2666,14 @@ endif() # AWS SDK for C++ macro(build_awssdk) - message( - FATAL_ERROR "FIXME: Building AWS C++ SDK from source will link with wrong libcrypto") message("Building AWS C++ SDK from source") - + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" + AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9") + message(FATAL_ERROR "AWS C++ SDK requires gcc >= 4.9") + endif() set(AWSSDK_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/awssdk_ep-install") set(AWSSDK_INCLUDE_DIR "${AWSSDK_PREFIX}/include") + set(AWSSDK_LIB_DIR "lib") if(WIN32) # On Windows, need to match build types @@ -2626,56 +2682,178 @@ macro(build_awssdk) # Otherwise, always build in release mode. 
# Especially with gcc, debug builds can fail with "asm constraint" errors: # https://github.com/TileDB-Inc/TileDB/issues/1351 - set(AWSSDK_BUILD_TYPE Release) + set(AWSSDK_BUILD_TYPE release) endif() - set(AWSSDK_CMAKE_ARGS - -DCMAKE_BUILD_TYPE=Release - -DCMAKE_INSTALL_LIBDIR=lib - -DBUILD_ONLY=s3;core;config - -DENABLE_UNITY_BUILD=on - -DENABLE_TESTING=off - "-DCMAKE_C_FLAGS=${EP_C_FLAGS}" - "-DCMAKE_INSTALL_PREFIX=${AWSSDK_PREFIX}") + set(AWSSDK_COMMON_CMAKE_ARGS + ${EP_COMMON_CMAKE_ARGS} + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_BUILD_TYPE=${AWSSDK_BUILD_TYPE} + -DCMAKE_INSTALL_LIBDIR=${AWSSDK_LIB_DIR} + -DENABLE_TESTING=OFF + -DENABLE_UNITY_BUILD=ON + "-DCMAKE_INSTALL_PREFIX=${AWSSDK_PREFIX}" + "-DCMAKE_PREFIX_PATH=${AWSSDK_PREFIX}") set( - AWSSDK_CORE_SHARED_LIB - "${AWSSDK_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}aws-cpp-sdk-core${CMAKE_SHARED_LIBRARY_SUFFIX}" - ) - set( - AWSSDK_S3_SHARED_LIB - "${AWSSDK_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}aws-cpp-sdk-s3${CMAKE_SHARED_LIBRARY_SUFFIX}" - ) - set(AWSSDK_SHARED_LIBS "${AWSSDK_CORE_SHARED_LIB}" "${AWSSDK_S3_SHARED_LIB}") + AWSSDK_CMAKE_ARGS + ${AWSSDK_COMMON_CMAKE_ARGS} -DBUILD_DEPS=OFF + -DBUILD_ONLY=config\\$s3\\$transfer\\$identity-management\\$sts + -DMINIMIZE_SIZE=ON) + if(UNIX AND TARGET zlib_ep) + list(APPEND AWSSDK_CMAKE_ARGS -DZLIB_INCLUDE_DIR=${ZLIB_INCLUDE_DIRS} + -DZLIB_LIBRARY=${ZLIB_LIBRARIES}) + endif() + + file(MAKE_DIRECTORY ${AWSSDK_INCLUDE_DIR}) + + # AWS C++ SDK related libraries to link statically + set(_AWSSDK_LIBS + aws-cpp-sdk-identity-management + aws-cpp-sdk-sts + aws-cpp-sdk-cognito-identity + aws-cpp-sdk-s3 + aws-cpp-sdk-core + aws-c-event-stream + aws-checksums + aws-c-common) + set(AWSSDK_LIBRARIES) + foreach(_AWSSDK_LIB ${_AWSSDK_LIBS}) + # aws-c-common -> AWS-C-COMMON + string(TOUPPER ${_AWSSDK_LIB} _AWSSDK_LIB_UPPER) + # AWS-C-COMMON -> AWS_C_COMMON + string(REPLACE "-" "_" _AWSSDK_LIB_NAME_PREFIX ${_AWSSDK_LIB_UPPER}) + set( + _AWSSDK_STATIC_LIBRARY + "${AWSSDK_PREFIX}/${AWSSDK_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${_AWSSDK_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + if(${_AWSSDK_LIB} MATCHES "^aws-cpp-sdk-") + set(_AWSSDK_TARGET_NAME ${_AWSSDK_LIB}) + else() + set(_AWSSDK_TARGET_NAME AWS::${_AWSSDK_LIB}) + endif() + add_library(${_AWSSDK_TARGET_NAME} STATIC IMPORTED) + set_target_properties( + ${_AWSSDK_TARGET_NAME} + PROPERTIES IMPORTED_LOCATION ${_AWSSDK_STATIC_LIBRARY} INTERFACE_INCLUDE_DIRECTORIES + "${AWSSDK_INCLUDE_DIR}") + set("${_AWSSDK_LIB_NAME_PREFIX}_STATIC_LIBRARY" ${_AWSSDK_STATIC_LIBRARY}) + list(APPEND AWSSDK_LIBRARIES ${_AWSSDK_TARGET_NAME}) + endforeach() + + externalproject_add(aws_c_common_ep + ${EP_LOG_OPTIONS} + URL ${AWS_C_COMMON_SOURCE_URL} + CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS} + BUILD_BYPRODUCTS ${AWS_C_COMMON_STATIC_LIBRARY}) + add_dependencies(AWS::aws-c-common aws_c_common_ep) + + externalproject_add(aws_checksums_ep + ${EP_LOG_OPTIONS} + URL ${AWS_CHECKSUMS_SOURCE_URL} + CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS} + BUILD_BYPRODUCTS ${AWS_CHECKSUMS_STATIC_LIBRARY}) + add_dependencies(AWS::aws-checksums aws_checksums_ep) + + externalproject_add(aws_c_event_stream_ep + ${EP_LOG_OPTIONS} + URL ${AWS_C_EVENT_STREAM_SOURCE_URL} + CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS} + BUILD_BYPRODUCTS ${AWS_C_EVENT_STREAM_STATIC_LIBRARY} + DEPENDS aws_c_common_ep aws_checksums_ep) + add_dependencies(AWS::aws-c-event-stream aws_c_event_stream_ep) externalproject_add(awssdk_ep ${EP_LOG_OPTIONS} URL ${AWSSDK_SOURCE_URL} CMAKE_ARGS ${AWSSDK_CMAKE_ARGS} - BUILD_BYPRODUCTS ${AWSSDK_SHARED_LIBS}) - - 
file(MAKE_DIRECTORY ${AWSSDK_INCLUDE_DIR}) - + BUILD_BYPRODUCTS ${AWS_CPP_SDK_COGNITO_IDENTITY_STATIC_LIBRARY} + ${AWS_CPP_SDK_CORE_STATIC_LIBRARY} + ${AWS_CPP_SDK_IDENTITY_MANAGEMENT_STATIC_LIBRARY} + ${AWS_CPP_SDK_S3_STATIC_LIBRARY} + ${AWS_CPP_SDK_STS_STATIC_LIBRARY} + DEPENDS aws_c_event_stream_ep) add_dependencies(toolchain awssdk_ep) - set(AWSSDK_LINK_LIBRARIES ${AWSSDK_SHARED_LIBS}) + foreach(_AWSSDK_LIB ${_AWSSDK_LIBS}) + if(${_AWSSDK_LIB} MATCHES "^aws-cpp-sdk-") + add_dependencies(${_AWSSDK_LIB} awssdk_ep) + endif() + endforeach() + set(AWSSDK_VENDORED TRUE) + list(APPEND ARROW_BUNDLED_STATIC_LIBS ${AWSSDK_LIBRARIES}) + set(AWSSDK_LINK_LIBRARIES ${AWSSDK_LIBRARIES}) + if(UNIX) + # on linux and macos curl seems to be required + find_package(CURL REQUIRED) + if(NOT TARGET CURL::libcurl) + # For old FindCURL.cmake + add_library(CURL::libcurl UNKNOWN IMPORTED) + set_target_properties(CURL::libcurl + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + "${CURL_INCLUDE_DIRS}" IMPORTED_LOCATION + "${CURL_LIBRARIES}") + endif() + set_property(TARGET aws-cpp-sdk-core + APPEND + PROPERTY INTERFACE_LINK_LIBRARIES CURL::libcurl) + set_property(TARGET CURL::libcurl + APPEND + PROPERTY INTERFACE_LINK_LIBRARIES OpenSSL::SSL) + if(TARGET zlib_ep) + set_property(TARGET aws-cpp-sdk-core + APPEND + PROPERTY INTERFACE_LINK_LIBRARIES ZLIB::ZLIB) + add_dependencies(awssdk_ep zlib_ep) + endif() + endif() - # AWSSDK is shared-only build + # AWSSDK is static-only build endmacro() if(ARROW_S3) # See https://aws.amazon.com/blogs/developer/developer-experience-of-the-aws-sdk-for-c-now-simplified-by-cmake/ + # Workaround to force AWS cmake configuration to look for shared libraries + if(DEFINED ENV{CONDA_PREFIX}) + if(DEFINED BUILD_SHARED_LIBS) + set(BUILD_SHARED_LIBS_WAS_SET TRUE) + set(BUILD_SHARED_LIBS_VALUE ${BUILD_SHARED_LIBS}) + else() + set(BUILD_SHARED_LIBS_WAS_SET FALSE) + endif() + set(BUILD_SHARED_LIBS "ON") + endif() + # Need to customize the find_package() call, so cannot call resolve_dependency() if(AWSSDK_SOURCE STREQUAL "AUTO") - find_package(AWSSDK COMPONENTS config s3 transfer) + find_package(AWSSDK + COMPONENTS config + s3 + transfer + identity-management + sts) if(NOT AWSSDK_FOUND) build_awssdk() endif() elseif(AWSSDK_SOURCE STREQUAL "BUNDLED") build_awssdk() elseif(AWSSDK_SOURCE STREQUAL "SYSTEM") - find_package(AWSSDK REQUIRED COMPONENTS config s3 transfer) + find_package(AWSSDK REQUIRED + COMPONENTS config + s3 + transfer + identity-management + sts) + endif() + + # Restore previous value of BUILD_SHARED_LIBS + if(DEFINED ENV{CONDA_PREFIX}) + if(BUILD_SHARED_LIBS_WAS_SET) + set(BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS_VALUE}) + else() + unset(BUILD_SHARED_LIBS) + endif() endif() include_directories(SYSTEM ${AWSSDK_INCLUDE_DIR}) @@ -2697,6 +2875,6 @@ message(STATUS "All bundled static libraries: ${ARROW_BUNDLED_STATIC_LIBS}") # Write out the package configurations. -configure_file("src/arrow/util/config.h.cmake" "src/arrow/util/config.h") +configure_file("src/arrow/util/config.h.cmake" "src/arrow/util/config.h" ESCAPE_QUOTES) install(FILES "${ARROW_BINARY_DIR}/src/arrow/util/config.h" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/util") diff --git a/cpp/cmake_modules/UseCython.cmake b/cpp/cmake_modules/UseCython.cmake index ccdeb4f3f03..0d4b17d3e57 100644 --- a/cpp/cmake_modules/UseCython.cmake +++ b/cpp/cmake_modules/UseCython.cmake @@ -22,7 +22,7 @@ # (this is an inherent limitation of Cython). 
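With ARROW_S3 enabled as configured above, the resulting filesystem is reached from C++ through arrow::fs::S3FileSystem. A hedged sketch, assuming anonymous access to a placeholder public bucket (the bucket, key, and region are not from this patch):

    // Sketch: read a few bytes from S3 via the filesystem layer built
    // when ARROW_S3=ON. Bucket/key/region are placeholders.
    #include <iostream>

    #include "arrow/buffer.h"
    #include "arrow/filesystem/s3fs.h"
    #include "arrow/io/interfaces.h"
    #include "arrow/result.h"
    #include "arrow/status.h"

    arrow::Status RunMain() {
      ARROW_RETURN_NOT_OK(arrow::fs::InitializeS3(arrow::fs::S3GlobalOptions{}));
      auto options = arrow::fs::S3Options::Anonymous();
      options.region = "us-east-1";
      ARROW_ASSIGN_OR_RAISE(auto fs, arrow::fs::S3FileSystem::Make(options));
      ARROW_ASSIGN_OR_RAISE(auto input, fs->OpenInputStream("bucket/key.bin"));
      ARROW_ASSIGN_OR_RAISE(auto buffer, input->Read(64));
      std::cout << "read " << buffer->size() << " bytes" << std::endl;
      return arrow::Status::OK();
    }

    int main() {
      arrow::Status status = RunMain();
      if (!status.ok()) {
        std::cerr << status.ToString() << std::endl;
        return 1;
      }
      return 0;
    }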
# # The sample paths set with the CMake include_directories() command will be used -# for include directories to search for *.pxd when running the Cython complire. +# for include directories to search for *.pxd when running the Cython compiler. # # Cache variables that effect the behavior include: # diff --git a/cpp/cmake_modules/san-config.cmake b/cpp/cmake_modules/san-config.cmake index 2e2807801fb..5eee6278009 100644 --- a/cpp/cmake_modules/san-config.cmake +++ b/cpp/cmake_modules/san-config.cmake @@ -35,14 +35,17 @@ endif() # - disable 'vptr' because of RTTI issues across shared libraries (?) # - disable 'alignment' because unaligned access is really OK on Nehalem and we do it # all over the place. -# - disable 'function' because it appears to give a false positive https://github.com/google/sanitizers/issues/911 +# - disable 'function' because it appears to give a false positive +# (https://github.com/google/sanitizers/issues/911) +# - disable 'float-divide-by-zero' on clang, which considers it UB +# (https://bugs.llvm.org/show_bug.cgi?id=17000#c1) # Note: GCC does not support the 'function' flag. if(${ARROW_USE_UBSAN}) if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") set( CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr,function -fno-sanitize-recover=all" + "${CMAKE_CXX_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr,function,float-divide-by-zero -fno-sanitize-recover=all" ) elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "5.1") diff --git a/cpp/examples/arrow/row-wise-conversion-example.cc b/cpp/examples/arrow/row-wise-conversion-example.cc index 42cab6cc76e..fb54b040f44 100644 --- a/cpp/examples/arrow/row-wise-conversion-example.cc +++ b/cpp/examples/arrow/row-wise-conversion-example.cc @@ -78,7 +78,7 @@ arrow::Status VectorToColumnarTable(const std::vector& rows, // Indicate the start of a new list row. This will memorise the current // offset in the values builder. ARROW_RETURN_NOT_OK(components_builder.Append()); - // Store the actual values. The final nullptr argument tells the underyling + // Store the actual values. The final nullptr argument tells the underlying // builder that all added values are valid, i.e. non-null. 
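The Append()/AppendValues() contract described in the comment above looks like this in isolation; a minimal sketch with a double-valued child builder (the helper function is illustrative, not part of the example file):

    // ListBuilder::Append() starts a new list slot by recording the child
    // builder's current offset; the nullptr passed to AppendValues() marks
    // every appended value as valid (non-null).
    #include <iostream>
    #include <memory>

    #include "arrow/array/builder_nested.h"
    #include "arrow/array/builder_primitive.h"
    #include "arrow/memory_pool.h"
    #include "arrow/status.h"

    arrow::Status BuildListOfDoubles(std::shared_ptr<arrow::Array>* out) {
      auto pool = arrow::default_memory_pool();
      arrow::ListBuilder list_builder(pool,
                                      std::make_shared<arrow::DoubleBuilder>(pool));
      auto& value_builder =
          *static_cast<arrow::DoubleBuilder*>(list_builder.value_builder());

      const double row1[] = {1.5, 2.5};
      ARROW_RETURN_NOT_OK(list_builder.Append());  // start row 1
      ARROW_RETURN_NOT_OK(value_builder.AppendValues(row1, 2, nullptr));

      ARROW_RETURN_NOT_OK(list_builder.AppendNull());  // a null row

      return list_builder.Finish(out);
    }

    int main() {
      std::shared_ptr<arrow::Array> array;
      arrow::Status status = BuildListOfDoubles(&array);
      if (!status.ok()) {
        std::cerr << status.ToString() << std::endl;
        return 1;
      }
      std::cout << array->ToString() << std::endl;
      return 0;
    }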
ARROW_RETURN_NOT_OK(cost_components_builder.AppendValues(row.cost_components.data(), row.cost_components.size())); diff --git a/cpp/examples/minimal_build/example.cc b/cpp/examples/minimal_build/example.cc index 7c60093e95f..4b6acd2a0dd 100644 --- a/cpp/examples/minimal_build/example.cc +++ b/cpp/examples/minimal_build/example.cc @@ -52,8 +52,8 @@ Status RunMain(int argc, char** argv) { ARROW_ASSIGN_OR_RAISE(auto output_file, arrow::io::FileOutputStream::Open(arrow_filename)); ARROW_ASSIGN_OR_RAISE(auto batch_writer, - arrow::ipc::NewFileWriter(output_file.get(), - table->schema())); + arrow::ipc::MakeFileWriter(output_file, + table->schema())); ARROW_RETURN_NOT_OK(batch_writer->WriteTable(*table)); ARROW_RETURN_NOT_OK(batch_writer->Close()); diff --git a/cpp/examples/parquet/parquet-stream-api/stream-reader-writer.cc b/cpp/examples/parquet/parquet-stream-api/stream-reader-writer.cc index 5a255bff275..64e44803ccf 100644 --- a/cpp/examples/parquet/parquet-stream-api/stream-reader-writer.cc +++ b/cpp/examples/parquet/parquet-stream-api/stream-reader-writer.cc @@ -48,7 +48,8 @@ class UserTimestamp { bool operator==(const UserTimestamp& x) const { return ts_ == x.ts_; } void dump(std::ostream& os) const { - std::time_t t{std::chrono::duration_cast(ts_).count()}; + const auto t = static_cast( + std::chrono::duration_cast(ts_).count()); os << std::put_time(std::gmtime(&t), "%Y%m%d-%H%M%S"); } diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 328553c93c3..dd17720595a 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -180,6 +180,7 @@ set(ARROW_SRCS util/bitmap.cc util/bitmap_builders.cc util/bitmap_ops.cc + util/bpacking.cc util/compression.cc util/cpu_info.cc util/decimal.cc @@ -213,6 +214,20 @@ set(ARROW_SRCS vendored/double-conversion/diy-fp.cc vendored/double-conversion/strtod.cc) +if(ARROW_HAVE_RUNTIME_AVX2) + list(APPEND ARROW_SRCS util/bpacking_avx2.cc) + set_source_files_properties(util/bpacking_avx2.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON) + set_source_files_properties(util/bpacking_avx2.cc PROPERTIES COMPILE_FLAGS + ${ARROW_AVX2_FLAG}) +endif() +if(ARROW_HAVE_RUNTIME_AVX512) + list(APPEND ARROW_SRCS util/bpacking_avx512.cc) + set_source_files_properties(util/bpacking_avx512.cc PROPERTIES SKIP_PRECOMPILE_HEADERS + ON) + set_source_files_properties(util/bpacking_avx512.cc PROPERTIES COMPILE_FLAGS + ${ARROW_AVX512_FLAG}) +endif() + if(APPLE) list(APPEND ARROW_SRCS vendored/datetime/ios.mm) endif() @@ -350,6 +365,7 @@ if(ARROW_COMPUTE) compute/registry.cc compute/kernels/aggregate_basic.cc compute/kernels/aggregate_mode.cc + compute/kernels/aggregate_var_std.cc compute/kernels/codegen_internal.cc compute/kernels/scalar_arithmetic.cc compute/kernels/scalar_boolean.cc @@ -371,18 +387,18 @@ if(ARROW_COMPUTE) compute/kernels/vector_selection.cc compute/kernels/vector_sort.cc) - if(CXX_SUPPORTS_AVX2) - list(APPEND ARROW_SRCS compute/kernels/aggregate_sum_avx2.cc) - set_source_files_properties(compute/kernels/aggregate_sum_avx2.cc PROPERTIES + if(ARROW_HAVE_RUNTIME_AVX2) + list(APPEND ARROW_SRCS compute/kernels/aggregate_basic_avx2.cc) + set_source_files_properties(compute/kernels/aggregate_basic_avx2.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON) - set_source_files_properties(compute/kernels/aggregate_sum_avx2.cc PROPERTIES + set_source_files_properties(compute/kernels/aggregate_basic_avx2.cc PROPERTIES COMPILE_FLAGS ${ARROW_AVX2_FLAG}) endif() - if(CXX_SUPPORTS_AVX512) - list(APPEND ARROW_SRCS compute/kernels/aggregate_sum_avx512.cc) - 
set_source_files_properties(compute/kernels/aggregate_sum_avx512.cc PROPERTIES + if(ARROW_HAVE_RUNTIME_AVX512) + list(APPEND ARROW_SRCS compute/kernels/aggregate_basic_avx512.cc) + set_source_files_properties(compute/kernels/aggregate_basic_avx512.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON) - set_source_files_properties(compute/kernels/aggregate_sum_avx512.cc PROPERTIES + set_source_files_properties(compute/kernels/aggregate_basic_avx512.cc PROPERTIES COMPILE_FLAGS ${ARROW_AVX512_FLAG}) endif() endif() @@ -391,9 +407,6 @@ if(ARROW_FILESYSTEM) if(ARROW_HDFS) add_definitions(-DARROW_HDFS) endif() - if(ARROW_S3) - add_definitions(-DARROW_S3) - endif() list(APPEND ARROW_SRCS filesystem/filesystem.cc @@ -448,7 +461,7 @@ if(ARROW_ORC) list(APPEND ARROW_SRCS adapters/orc/adapter.cc adapters/orc/adapter_util.cc) endif() -if(NOT APPLE AND NOT MSVC) +if(NOT APPLE AND NOT MSVC_TOOLCHAIN) # Localize thirdparty symbols using a linker version script. This hides them # from the client application. The OS X linker does not support the # version-script option. @@ -514,11 +527,7 @@ if(ARROW_BUILD_STATIC AND ARROW_BUNDLED_STATIC_LIBS) ${_OTHER_LIBS}) endif() -if(ARROW_BUILD_TESTS - OR ARROW_BUILD_BENCHMARKS - OR ARROW_BUILD_INTEGRATION - OR ARROW_FUZZING) - +if(ARROW_TESTING) # that depend on gtest add_arrow_lib(arrow_testing CMAKE_PACKAGE_NAME diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index 0781dd4a2df..b2524afe4f8 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -73,6 +73,10 @@ struct ScalarFromArraySlotImpl { return Finish(Decimal128(a.GetValue(index_))); } + Status Visit(const Decimal256Array& a) { + return Finish(Decimal256(a.GetValue(index_))); + } + template Status Visit(const BaseBinaryArray& a) { return Finish(a.GetString(index_)); @@ -161,7 +165,13 @@ struct ScalarFromArraySlotImpl { } if (array_.IsNull(index_)) { - return MakeNullScalar(array_.type()); + auto null = MakeNullScalar(array_.type()); + if (is_dictionary(array_.type()->id())) { + auto& dict_null = checked_cast(*null); + const auto& dict_array = checked_cast(array_); + dict_null.value.dictionary = dict_array.dictionary(); + } + return null; } RETURN_NOT_OK(VisitArrayInline(array_, this)); diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 9c2cd888692..5c247a6dc66 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -25,8 +25,8 @@ #include #include "arrow/array.h" +#include "arrow/array/builder_binary.h" #include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/testing/gtest_common.h" @@ -570,6 +570,26 @@ class TestStringBuilder : public TestBuilder { ASSERT_EQ(reps * 40, result_->value_data()->size()); } + void TestOverflowCheck() { + auto max_size = builder_->memory_limit(); + + ASSERT_OK(builder_->ValidateOverflow(1)); + ASSERT_OK(builder_->ValidateOverflow(max_size)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_size + 1)); + + ASSERT_OK(builder_->Append("bb")); + ASSERT_OK(builder_->ValidateOverflow(max_size - 2)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_size - 1)); + + ASSERT_OK(builder_->AppendNull()); + ASSERT_OK(builder_->ValidateOverflow(max_size - 2)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_size - 1)); + + ASSERT_OK(builder_->Append("ccc")); + ASSERT_OK(builder_->ValidateOverflow(max_size - 5)); + 
ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_size - 4)); + } + void TestZeroLength() { // All buffers are null Done(); @@ -602,6 +622,8 @@ TYPED_TEST(TestStringBuilder, TestCapacityReserve) { this->TestCapacityReserve() TYPED_TEST(TestStringBuilder, TestZeroLength) { this->TestZeroLength(); } +TYPED_TEST(TestStringBuilder, TestOverflowCheck) { this->TestOverflowCheck(); } + // ---------------------------------------------------------------------- // ChunkedBinaryBuilder tests diff --git a/cpp/src/arrow/array/array_decimal.cc b/cpp/src/arrow/array/array_decimal.cc index 1e813f2e515..d65f6ee5356 100644 --- a/cpp/src/arrow/array/array_decimal.cc +++ b/cpp/src/arrow/array/array_decimal.cc @@ -33,11 +33,11 @@ namespace arrow { using internal::checked_cast; // ---------------------------------------------------------------------- -// Decimal +// Decimal128 Decimal128Array::Decimal128Array(const std::shared_ptr& data) : FixedSizeBinaryArray(data) { - ARROW_CHECK_EQ(data->type->id(), Type::DECIMAL); + ARROW_CHECK_EQ(data->type->id(), Type::DECIMAL128); } std::string Decimal128Array::FormatValue(int64_t i) const { @@ -46,4 +46,18 @@ std::string Decimal128Array::FormatValue(int64_t i) const { return value.ToString(type_.scale()); } +// ---------------------------------------------------------------------- +// Decimal256 + +Decimal256Array::Decimal256Array(const std::shared_ptr& data) + : FixedSizeBinaryArray(data) { + ARROW_CHECK_EQ(data->type->id(), Type::DECIMAL256); +} + +std::string Decimal256Array::FormatValue(int64_t i) const { + const auto& type_ = checked_cast(*type()); + const Decimal256 value(GetValue(i)); + return value.ToString(type_.scale()); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/array_decimal.h b/cpp/src/arrow/array/array_decimal.h index 6d5e884118b..8d7d1c59cd0 100644 --- a/cpp/src/arrow/array/array_decimal.h +++ b/cpp/src/arrow/array/array_decimal.h @@ -47,4 +47,20 @@ class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray { // Backward compatibility using DecimalArray = Decimal128Array; +// ---------------------------------------------------------------------- +// Decimal256Array + +/// Concrete Array class for 256-bit decimal data +class ARROW_EXPORT Decimal256Array : public FixedSizeBinaryArray { + public: + using TypeClass = Decimal256Type; + + using FixedSizeBinaryArray::FixedSizeBinaryArray; + + /// \brief Construct Decimal256Array from ArrayData instance + explicit Decimal256Array(const std::shared_ptr& data); + + std::string FormatValue(int64_t i) const; +}; + } // namespace arrow diff --git a/cpp/src/arrow/array/array_dict.h b/cpp/src/arrow/array/array_dict.h index c87606f7caf..ce1f49ce5fa 100644 --- a/cpp/src/arrow/array/array_dict.h +++ b/cpp/src/arrow/array/array_dict.h @@ -75,6 +75,12 @@ class ARROW_EXPORT DictionaryArray : public Array { const std::shared_ptr& type, const std::shared_ptr& indices, const std::shared_ptr& dictionary); + static Result> FromArrays( + const std::shared_ptr& indices, const std::shared_ptr& dictionary) { + return FromArrays(::arrow::dictionary(indices->type(), dictionary->type()), indices, + dictionary); + } + /// \brief Transpose this DictionaryArray /// /// This method constructs a new dictionary array with the given dictionary diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index 2ad31121d2a..fca442b2567 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -25,7 +25,9 @@ #include #include "arrow/array.h" 
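The new two-argument DictionaryArray::FromArrays() overload above derives the dictionary type from its inputs instead of requiring it up front. A minimal sketch (ArrayFromJSON is the test helper used throughout these tests, so this links against the testing library):

    // The dictionary type dictionary(int8(), utf8()) is inferred from the
    // indices and dictionary arrays rather than passed explicitly.
    #include <iostream>

    #include "arrow/array/array_dict.h"
    #include "arrow/result.h"
    #include "arrow/testing/gtest_util.h"  // ArrayFromJSON test helper
    #include "arrow/type.h"

    int main() {
      auto dictionary = arrow::ArrayFromJSON(arrow::utf8(), R"(["foo", "bar"])");
      auto indices = arrow::ArrayFromJSON(arrow::int8(), "[0, 1, 0, null]");

      auto result = arrow::DictionaryArray::FromArrays(indices, dictionary);
      if (!result.ok()) {
        std::cerr << result.status().ToString() << std::endl;
        return 1;
      }
      std::cout << (*result)->ToString() << std::endl;
      return 0;
    }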
-#include "arrow/builder.h" +#include "arrow/array/builder_decimal.h" +#include "arrow/array/builder_dict.h" +#include "arrow/array/builder_nested.h" #include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/testing/gtest_common.h" @@ -448,13 +450,14 @@ TEST(TestStringDictionaryBuilder, ArrayInit) { AssertArraysEqual(expected, *result); } -TEST(TestStringDictionaryBuilder, MakeBuilder) { - auto dict_array = ArrayFromJSON(utf8(), R"(["test", "test2"])"); - auto dict_type = dictionary(int8(), utf8()); +template +void TestStringDictionaryMakeBuilder(const std::shared_ptr& value_type) { + auto dict_array = ArrayFromJSON(value_type, R"(["test", "test2"])"); + auto dict_type = dictionary(int8(), value_type); auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]"); std::unique_ptr boxed_builder; ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder)); - auto& builder = checked_cast(*boxed_builder); + auto& builder = checked_cast(*boxed_builder); // Build the dictionary Array ASSERT_OK(builder.Append("test")); @@ -470,6 +473,14 @@ TEST(TestStringDictionaryBuilder, MakeBuilder) { AssertArraysEqual(expected, *result); } +TEST(TestStringDictionaryBuilder, MakeBuilder) { + TestStringDictionaryMakeBuilder>(utf8()); +} + +TEST(TestLargeStringDictionaryBuilder, MakeBuilder) { + TestStringDictionaryMakeBuilder>(large_utf8()); +} + // ARROW-4367 TEST(TestStringDictionaryBuilder, OnlyNull) { // Build the dictionary Array @@ -814,27 +825,25 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, DoubleTableSize) { ASSERT_TRUE(expected.Equals(result)); } -TEST(TestFixedSizeBinaryDictionaryBuilder, InvalidTypeAppend) { +#ifndef NDEBUG +TEST(TestFixedSizeBinaryDictionaryBuilder, AppendArrayInvalidType) { // Build the dictionary Array - auto value_type = arrow::fixed_size_binary(4); + auto value_type = fixed_size_binary(4); DictionaryBuilder builder(value_type); // Build an array with different byte width - FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(5)); - std::vector value{100, 1, 1, 1, 1}; - ASSERT_OK(fsb_builder.Append(value.data())); - std::shared_ptr fsb_array; - ASSERT_OK(fsb_builder.Finish(&fsb_array)); + auto fsb_array = ArrayFromJSON(fixed_size_binary(3), R"(["foo", "bar"])"); - ASSERT_RAISES(Invalid, builder.AppendArray(*fsb_array)); + ASSERT_RAISES(TypeError, builder.AppendArray(*fsb_array)); } +#endif -TEST(TestDecimalDictionaryBuilder, Basic) { +template +void TestDecimalDictionaryBuilderBasic(std::shared_ptr decimal_type) { // Build the dictionary Array - auto decimal_type = arrow::decimal(2, 0); DictionaryBuilder builder(decimal_type); // Test data - std::vector test{12, 12, 11, 12}; + std::vector test{12, 12, 11, 12}; for (const auto& value : test) { ASSERT_OK(builder.Append(value.ToBytes().data())); } @@ -850,40 +859,48 @@ TEST(TestDecimalDictionaryBuilder, Basic) { ASSERT_TRUE(expected.Equals(result)); } -TEST(TestDecimalDictionaryBuilder, DoubleTableSize) { - const auto& decimal_type = arrow::decimal(21, 0); +TEST(TestDecimal128DictionaryBuilder, Basic) { + TestDecimalDictionaryBuilderBasic(arrow::decimal128(2, 0)); +} + +TEST(TestDecimal256DictionaryBuilder, Basic) { + TestDecimalDictionaryBuilderBasic(arrow::decimal256(76, 0)); +} +void TestDecimalDictionaryBuilderDoubleTableSize( + std::shared_ptr decimal_type, FixedSizeBinaryBuilder& decimal_builder) { // Build the dictionary Array DictionaryBuilder dict_builder(decimal_type); // Build expected data - Decimal128Builder decimal_builder(decimal_type); Int16Builder int_builder; // Fill with 1024 different 
values for (int64_t i = 0; i < 1024; i++) { - const uint8_t bytes[] = {0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 12, - 12, - static_cast(i / 128), - static_cast(i % 128)}; + // Decimal256Builder takes 32 bytes, while Decimal128Builder takes only the first 16 + // bytes. + const uint8_t bytes[32] = {0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 12, + 12, + static_cast(i / 128), + static_cast(i % 128)}; ASSERT_OK(dict_builder.Append(bytes)); ASSERT_OK(decimal_builder.Append(bytes)); ASSERT_OK(int_builder.Append(static_cast(i))); } // Fill with an already existing value - const uint8_t known_value[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 1}; + const uint8_t known_value[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 1}; for (int64_t i = 0; i < 1024; i++) { ASSERT_OK(dict_builder.Append(known_value)); ASSERT_OK(int_builder.Append(1)); @@ -904,6 +921,90 @@ TEST(TestDecimalDictionaryBuilder, DoubleTableSize) { ASSERT_TRUE(expected.Equals(result)); } +TEST(TestDecimal128DictionaryBuilder, DoubleTableSize) { + const auto& decimal_type = arrow::decimal128(21, 0); + Decimal128Builder decimal_builder(decimal_type); + TestDecimalDictionaryBuilderDoubleTableSize(decimal_type, decimal_builder); +} + +TEST(TestDecimal256DictionaryBuilder, DoubleTableSize) { + const auto& decimal_type = arrow::decimal256(21, 0); + Decimal256Builder decimal_builder(decimal_type); + TestDecimalDictionaryBuilderDoubleTableSize(decimal_type, decimal_builder); +} + +TEST(TestNullDictionaryBuilder, Basic) { + // MakeBuilder + auto dict_type = dictionary(int8(), null()); + std::unique_ptr boxed_builder; + ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder)); + auto& builder = checked_cast&>(*boxed_builder); + + ASSERT_OK(builder.AppendNull()); + ASSERT_OK(builder.AppendNull()); + ASSERT_OK(builder.AppendNull()); + ASSERT_EQ(3, builder.length()); + ASSERT_EQ(3, builder.null_count()); + + ASSERT_OK(builder.AppendNulls(4)); + ASSERT_EQ(7, builder.length()); + ASSERT_EQ(7, builder.null_count()); + + auto null_array = ArrayFromJSON(null(), "[null, null, null, null]"); + ASSERT_OK(builder.AppendArray(*null_array)); + ASSERT_EQ(11, builder.length()); + ASSERT_EQ(11, builder.null_count()); + + std::shared_ptr result; + ASSERT_OK(builder.Finish(&result)); + AssertTypeEqual(*dict_type, *result->type()); + ASSERT_EQ(11, result->length()); + ASSERT_EQ(11, result->null_count()); +} + +#ifndef NDEBUG +TEST(TestNullDictionaryBuilder, AppendArrayInvalidType) { + // MakeBuilder + auto dict_type = dictionary(int8(), null()); + std::unique_ptr boxed_builder; + ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder)); + auto& builder = checked_cast&>(*boxed_builder); + + auto int8_array = ArrayFromJSON(int8(), "[0, 1, 0, null]"); + ASSERT_RAISES(TypeError, builder.AppendArray(*int8_array)); +} +#endif + +// ---------------------------------------------------------------------- +// Index byte width tests + +template +void AssertIndexByteWidth(const std::shared_ptr& value_type = + TypeTraits::type_singleton()) { + auto index_type = TypeTraits::type_singleton(); + auto dict_type = + checked_pointer_cast(dictionary(index_type, value_type)); + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &builder)); + auto builder_dict_type = checked_pointer_cast(builder->type()); + AssertTypeEqual(dict_type->index_type(), builder_dict_type->index_type()); +} + +typedef ::testing::Types IndexTypes; + +template +class 
TestDictionaryBuilderIndexByteWidth : public TestBuilder {}; + +TYPED_TEST_SUITE(TestDictionaryBuilderIndexByteWidth, IndexTypes); + +TYPED_TEST(TestDictionaryBuilderIndexByteWidth, MakeBuilder) { + AssertIndexByteWidth(); + AssertIndexByteWidth(); + AssertIndexByteWidth(); + AssertIndexByteWidth(fixed_size_binary(4)); + AssertIndexByteWidth(); +} + // ---------------------------------------------------------------------- // DictionaryArray tests diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index df0eb522cf4..1696653850b 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -23,8 +23,8 @@ #include #include "arrow/array.h" +#include "arrow/array/builder_nested.h" #include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/status.h" #include "arrow/testing/gtest_common.h" #include "arrow/testing/gtest_util.h" @@ -467,6 +467,32 @@ class TestListArray : public TestBuilder { AssertArraysEqual(*result_, *expected); } + void TestOverflowCheck() { + Int16Builder* vb = checked_cast(builder_->value_builder()); + auto max_elements = builder_->maximum_elements(); + + ASSERT_OK(builder_->ValidateOverflow(1)); + ASSERT_OK(builder_->ValidateOverflow(max_elements)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements + 1)); + + ASSERT_OK(builder_->Append()); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(builder_->ValidateOverflow(max_elements - 2)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements - 1)); + + ASSERT_OK(builder_->AppendNull()); + ASSERT_OK(builder_->ValidateOverflow(max_elements - 2)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements - 1)); + + ASSERT_OK(builder_->Append()); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(vb->Append(3)); + ASSERT_OK(builder_->ValidateOverflow(max_elements - 5)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements - 4)); + } + protected: std::shared_ptr value_type_; @@ -508,6 +534,12 @@ TYPED_TEST(TestListArray, ValidateOffsets) { this->TestValidateOffsets(); } TYPED_TEST(TestListArray, CornerCases) { this->TestCornerCases(); } +#ifndef ARROW_LARGE_MEMORY_TESTS +TYPED_TEST(TestListArray, DISABLED_TestOverflowCheck) { this->TestOverflowCheck(); } +#else +TYPED_TEST(TestListArray, TestOverflowCheck) { this->TestOverflowCheck(); } +#endif + // ---------------------------------------------------------------------- // Map tests diff --git a/cpp/src/arrow/array/array_struct_test.cc b/cpp/src/arrow/array/array_struct_test.cc index 0afadcf9285..f54b43465e9 100644 --- a/cpp/src/arrow/array/array_struct_test.cc +++ b/cpp/src/arrow/array/array_struct_test.cc @@ -15,15 +15,15 @@ // specific language governing permissions and limitations // under the License. 
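Outside the test harness, the ValidateOverflow()/memory_limit() contract exercised by the overflow tests above can be probed directly. A sketch assuming only the behavior those tests assert (that the checks account for value bytes already appended):

    // StringBuilder tracks appended value bytes against memory_limit()
    // (the 32-bit offset limit) and reports CapacityError before an
    // append would overflow it.
    #include <cstdint>
    #include <iostream>

    #include "arrow/array/builder_binary.h"
    #include "arrow/status.h"

    int main() {
      arrow::StringBuilder builder;
      const int64_t limit = builder.memory_limit();

      std::cout << builder.ValidateOverflow(limit).ToString() << std::endl;      // OK
      std::cout << builder.ValidateOverflow(limit + 1).ToString() << std::endl;  // CapacityError

      // After appending two value bytes, two fewer bytes remain available.
      if (builder.Append("bb").ok()) {
        std::cout << builder.ValidateOverflow(limit - 2).ToString() << std::endl;  // OK
        std::cout << builder.ValidateOverflow(limit - 1).ToString() << std::endl;  // CapacityError
      }
      return 0;
    }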
+#include + #include #include #include #include -#include - #include "arrow/array.h" -#include "arrow/builder.h" +#include "arrow/array/builder_nested.h" #include "arrow/status.h" #include "arrow/testing/gtest_common.h" #include "arrow/testing/gtest_util.h" @@ -266,10 +266,8 @@ TEST_F(TestStructBuilder, TestAppendNull) { ASSERT_EQ(2, result_->field(1)->length()); ASSERT_TRUE(result_->IsNull(0)); ASSERT_TRUE(result_->IsNull(1)); - ASSERT_TRUE(result_->field(0)->IsNull(0)); - ASSERT_TRUE(result_->field(0)->IsNull(1)); - ASSERT_TRUE(result_->field(1)->IsNull(0)); - ASSERT_TRUE(result_->field(1)->IsNull(1)); + ASSERT_EQ(0, result_->field(0)->null_count()); + ASSERT_EQ(0, result_->field(1)->null_count()); ASSERT_EQ(Type::LIST, result_->field(0)->type_id()); ASSERT_EQ(Type::INT32, result_->field(1)->type_id()); diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 2702c355c01..89087ee318c 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -16,11 +16,9 @@ // under the License. #include -#include #include #include #include -#include #include #include #include @@ -43,9 +41,7 @@ #include "arrow/array/util.h" #include "arrow/buffer.h" #include "arrow/buffer_builder.h" -#include "arrow/builder.h" #include "arrow/compare.h" -#include "arrow/memory_pool.h" #include "arrow/result.h" #include "arrow/scalar.h" #include "arrow/status.h" @@ -70,6 +66,7 @@ namespace arrow { using internal::checked_cast; +using internal::checked_pointer_cast; class TestArray : public ::testing::Test { public: @@ -234,6 +231,9 @@ TEST_F(TestArray, SliceRecomputeNullCount) { slice = array->Slice(4); ASSERT_EQ(4, slice->null_count()); + auto slice2 = slice->Slice(0); + ASSERT_EQ(4, slice2->null_count()); + slice = array->Slice(0); ASSERT_EQ(5, slice->null_count()); @@ -425,6 +425,7 @@ TEST_F(TestArray, TestMakeArrayFromScalar) { std::make_shared( hello, fixed_size_binary(static_cast(hello->size()))), std::make_shared(Decimal128(10), decimal(16, 4)), + std::make_shared(Decimal256(10), decimal(76, 38)), std::make_shared(hello), std::make_shared(hello), std::make_shared(ArrayFromJSON(int8(), "[1, 2, 3]")), @@ -638,7 +639,7 @@ class TestPrimitiveBuilder : public TestBuilder { std::shared_ptr out; FinishAndCheckPadding(builder.get(), &out); - std::shared_ptr result = std::dynamic_pointer_cast(out); + std::shared_ptr result = checked_pointer_cast(out); // Builder is now reset ASSERT_EQ(0, builder->length()); @@ -763,7 +764,7 @@ void TestPrimitiveBuilder::Check(const std::unique_ptr std::shared_ptr out; FinishAndCheckPadding(builder.get(), &out); - std::shared_ptr result = std::dynamic_pointer_cast(out); + std::shared_ptr result = checked_pointer_cast(out); ASSERT_EQ(ex_null_count, result->null_count()); ASSERT_EQ(size, result->length()); @@ -880,7 +881,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) { std::shared_ptr out; FinishAndCheckPadding(this->builder_.get(), &out); - auto result = std::dynamic_pointer_cast(out); + auto result = checked_pointer_cast(out); for (int64_t i = 0; i < size; ++i) { ASSERT_TRUE(result->IsNull(i)) << i; @@ -914,6 +915,33 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNulls) { } } +TYPED_TEST(TestPrimitiveBuilder, TestAppendEmptyValue) { + ASSERT_OK(this->builder_->AppendNull()); + ASSERT_OK(this->builder_->AppendEmptyValue()); + ASSERT_OK(this->builder_->AppendNulls(2)); + ASSERT_OK(this->builder_->AppendEmptyValues(2)); + + std::shared_ptr out; + FinishAndCheckPadding(this->builder_.get(), &out); + 
ASSERT_OK(out->ValidateFull()); + + auto result = checked_pointer_cast(out); + ASSERT_EQ(result->length(), 6); + ASSERT_EQ(result->null_count(), 3); + + ASSERT_TRUE(result->IsNull(0)); + ASSERT_FALSE(result->IsNull(1)); + ASSERT_TRUE(result->IsNull(2)); + ASSERT_TRUE(result->IsNull(3)); + ASSERT_FALSE(result->IsNull(4)); + ASSERT_FALSE(result->IsNull(5)); + + // implementation detail: the value slots are 0-initialized + for (int64_t i = 0; i < result->length(); ++i) { + ASSERT_EQ(result->Value(i), 0); + } +} + TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) { DECL_T(); @@ -1501,7 +1529,7 @@ void CheckFloatingNanEquality() { // NaN != non-NaN ArrayFromVector(type, {false, true}, {0.5, nan_value}, &a); - ArrayFromVector(type, {false, true}, {0.5, 0.0}, &a); + ArrayFromVector(type, {false, true}, {0.5, 0.0}, &b); ASSERT_FALSE(a->Equals(b)); ASSERT_FALSE(b->Equals(a)); ASSERT_FALSE(a->Equals(b, EqualOptions().nans_equal(true))); @@ -1520,6 +1548,73 @@ void CheckFloatingNanEquality() { ASSERT_TRUE(b->RangeEquals(a, 0, 1, 0)); } +template +void CheckFloatingInfinityEquality() { + std::shared_ptr a, b; + std::shared_ptr type = TypeTraits::type_singleton(); + + const auto infinity = std::numeric_limits::infinity(); + + for (auto nans_equal : {false, true}) { + // Infinity in a null entry + ArrayFromVector(type, {true, false}, {0.5, infinity}, &a); + ArrayFromVector(type, {true, false}, {0.5, -infinity}, &b); + ASSERT_TRUE(a->Equals(b)); + ASSERT_TRUE(b->Equals(a)); + ASSERT_TRUE(a->ApproxEquals(b, EqualOptions().atol(1e-5).nans_equal(nans_equal))); + ASSERT_TRUE(b->ApproxEquals(a, EqualOptions().atol(1e-5).nans_equal(nans_equal))); + ASSERT_TRUE(a->RangeEquals(b, 0, 2, 0)); + ASSERT_TRUE(b->RangeEquals(a, 0, 2, 0)); + ASSERT_TRUE(a->RangeEquals(b, 1, 2, 1)); + ASSERT_TRUE(b->RangeEquals(a, 1, 2, 1)); + + // Infinity in a valid entry + ArrayFromVector(type, {false, true}, {0.5, infinity}, &a); + ArrayFromVector(type, {false, true}, {0.5, infinity}, &b); + ASSERT_TRUE(a->Equals(b)); + ASSERT_TRUE(b->Equals(a)); + ASSERT_TRUE(a->ApproxEquals(b, EqualOptions().atol(1e-5).nans_equal(nans_equal))); + ASSERT_TRUE(b->ApproxEquals(a, EqualOptions().atol(1e-5).nans_equal(nans_equal))); + ASSERT_TRUE(a->ApproxEquals(b, EqualOptions().atol(1e-5).nans_equal(nans_equal))); + ASSERT_TRUE(b->ApproxEquals(a, EqualOptions().atol(1e-5).nans_equal(nans_equal))); + // Infinity in tested range + ASSERT_TRUE(a->RangeEquals(b, 0, 2, 0)); + ASSERT_TRUE(b->RangeEquals(a, 0, 2, 0)); + ASSERT_TRUE(a->RangeEquals(b, 1, 2, 1)); + ASSERT_TRUE(b->RangeEquals(a, 1, 2, 1)); + // Infinity not in tested range + ASSERT_TRUE(a->RangeEquals(b, 0, 1, 0)); + ASSERT_TRUE(b->RangeEquals(a, 0, 1, 0)); + + // Infinity != non-infinity + ArrayFromVector(type, {false, true}, {0.5, -infinity}, &a); + ArrayFromVector(type, {false, true}, {0.5, 0.0}, &b); + ASSERT_FALSE(a->Equals(b)); + ASSERT_FALSE(b->Equals(a)); + ASSERT_FALSE(a->ApproxEquals(b, EqualOptions().atol(1e-5).nans_equal(nans_equal))); + ASSERT_FALSE(b->ApproxEquals(a)); + ASSERT_FALSE(a->ApproxEquals(b, EqualOptions().atol(1e-5).nans_equal(nans_equal))); + ASSERT_FALSE(b->ApproxEquals(a, EqualOptions().atol(1e-5).nans_equal(nans_equal))); + // Infinity != Negative infinity + ArrayFromVector(type, {true, true}, {0.5, -infinity}, &a); + ArrayFromVector(type, {true, true}, {0.5, infinity}, &b); + ASSERT_FALSE(a->Equals(b)); + ASSERT_FALSE(b->Equals(a)); + ASSERT_FALSE(a->ApproxEquals(b)); + ASSERT_FALSE(b->ApproxEquals(a)); + ASSERT_FALSE(a->ApproxEquals(b, 
EqualOptions().atol(1e-5).nans_equal(nans_equal))); + ASSERT_FALSE(b->ApproxEquals(a, EqualOptions().atol(1e-5).nans_equal(nans_equal))); + // Infinity in tested range + ASSERT_FALSE(a->RangeEquals(b, 0, 2, 0)); + ASSERT_FALSE(b->RangeEquals(a, 0, 2, 0)); + ASSERT_FALSE(a->RangeEquals(b, 1, 2, 1)); + ASSERT_FALSE(b->RangeEquals(a, 1, 2, 1)); + // Infinity not in tested range + ASSERT_TRUE(a->RangeEquals(b, 0, 1, 0)); + ASSERT_TRUE(b->RangeEquals(a, 0, 1, 0)); + } +} + TEST(TestPrimitiveAdHoc, FloatingApproxEquals) { CheckApproxEquals(); CheckApproxEquals(); @@ -1535,6 +1630,11 @@ TEST(TestPrimitiveAdHoc, FloatingNanEquality) { CheckFloatingNanEquality(); } +TEST(TestPrimitiveAdHoc, FloatingInfinityEquality) { + CheckFloatingInfinityEquality(); + CheckFloatingInfinityEquality(); +} + // ---------------------------------------------------------------------- // FixedSizeBinary tests @@ -2029,6 +2129,31 @@ TEST_F(TestAdaptiveIntBuilder, TestAppendNulls) { } } +TEST_F(TestAdaptiveIntBuilder, TestAppendEmptyValue) { + ASSERT_OK(builder_->AppendNulls(2)); + ASSERT_OK(builder_->AppendEmptyValue()); + ASSERT_OK(builder_->Append(42)); + ASSERT_OK(builder_->AppendEmptyValues(2)); + Done(); + + ASSERT_OK(result_->ValidateFull()); + // NOTE: The fact that we get 0 is really an implementation detail + AssertArraysEqual(*result_, *ArrayFromJSON(int8(), "[null, null, 0, 42, 0, 0]")); +} + +TEST(TestAdaptiveIntBuilderWithStartIntSize, TestReset) { + auto builder = std::make_shared( + static_cast(sizeof(int16_t)), default_memory_pool()); + AssertTypeEqual(*int16(), *builder->type()); + + ASSERT_OK( + builder->Append(static_cast(std::numeric_limits::max()) + 1)); + AssertTypeEqual(*int32(), *builder->type()); + + builder->Reset(); + AssertTypeEqual(*int16(), *builder->type()); +} + class TestAdaptiveUIntBuilder : public TestBuilder { public: void SetUp() { @@ -2234,13 +2359,42 @@ TEST_F(TestAdaptiveUIntBuilder, TestAppendNulls) { } } +TEST_F(TestAdaptiveUIntBuilder, TestAppendEmptyValue) { + ASSERT_OK(builder_->AppendNulls(2)); + ASSERT_OK(builder_->AppendEmptyValue()); + ASSERT_OK(builder_->Append(42)); + ASSERT_OK(builder_->AppendEmptyValues(2)); + Done(); + + ASSERT_OK(result_->ValidateFull()); + // NOTE: The fact that we get 0 is really an implementation detail + AssertArraysEqual(*result_, *ArrayFromJSON(uint8(), "[null, null, 0, 42, 0, 0]")); +} + +TEST(TestAdaptiveUIntBuilderWithStartIntSize, TestReset) { + auto builder = std::make_shared( + static_cast(sizeof(uint16_t)), default_memory_pool()); + AssertTypeEqual(uint16(), builder->type()); + + ASSERT_OK( + builder->Append(static_cast(std::numeric_limits::max()) + 1)); + AssertTypeEqual(uint32(), builder->type()); + + builder->Reset(); + AssertTypeEqual(uint16(), builder->type()); +} + // ---------------------------------------------------------------------- // Test Decimal arrays -using DecimalVector = std::vector; - +template class DecimalTest : public ::testing::TestWithParam { public: + using DecimalBuilder = typename TypeTraits::BuilderType; + using DecimalValue = typename TypeTraits::ScalarType::ValueType; + using DecimalArray = typename TypeTraits::ArrayType; + using DecimalVector = std::vector; + DecimalTest() {} template @@ -2256,8 +2410,8 @@ class DecimalTest : public ::testing::TestWithParam { template void TestCreate(int32_t precision, const DecimalVector& draw, const std::vector& valid_bytes, int64_t offset) const { - auto type = std::make_shared(precision, 4); - auto builder = std::make_shared(type); + auto type = 
std::make_shared(precision, 4); + auto builder = std::make_shared(type); size_t null_count = 0; @@ -2288,7 +2442,7 @@ class DecimalTest : public ::testing::TestWithParam { ASSERT_OK_AND_ASSIGN(expected_null_bitmap, internal::BytesToBits(valid_bytes)); int64_t expected_null_count = CountNulls(valid_bytes); - auto expected = std::make_shared( + auto expected = std::make_shared( type, size, expected_data, expected_null_bitmap, expected_null_count); std::shared_ptr lhs = out->Slice(offset); @@ -2297,7 +2451,9 @@ class DecimalTest : public ::testing::TestWithParam { } }; -TEST_P(DecimalTest, NoNulls) { +using Decimal128Test = DecimalTest; + +TEST_P(Decimal128Test, NoNulls) { int32_t precision = GetParam(); std::vector draw = {Decimal128(1), Decimal128(-2), Decimal128(2389), Decimal128(4), Decimal128(-12348)}; @@ -2306,7 +2462,7 @@ TEST_P(DecimalTest, NoNulls) { this->TestCreate(precision, draw, valid_bytes, 2); } -TEST_P(DecimalTest, WithNulls) { +TEST_P(Decimal128Test, WithNulls) { int32_t precision = GetParam(); std::vector draw = {Decimal128(1), Decimal128(2), Decimal128(-1), Decimal128(4), Decimal128(-1), Decimal128(1), @@ -2325,7 +2481,44 @@ TEST_P(DecimalTest, WithNulls) { this->TestCreate(precision, draw, valid_bytes, 2); } -INSTANTIATE_TEST_SUITE_P(DecimalTest, DecimalTest, ::testing::Range(1, 38)); +INSTANTIATE_TEST_SUITE_P(Decimal128Test, Decimal128Test, ::testing::Range(1, 38)); + +using Decimal256Test = DecimalTest; + +TEST_P(Decimal256Test, NoNulls) { + int32_t precision = GetParam(); + std::vector draw = {Decimal256(1), Decimal256(-2), Decimal256(2389), + Decimal256(4), Decimal256(-12348)}; + std::vector valid_bytes = {true, true, true, true, true}; + this->TestCreate(precision, draw, valid_bytes, 0); + this->TestCreate(precision, draw, valid_bytes, 2); +} + +TEST_P(Decimal256Test, WithNulls) { + int32_t precision = GetParam(); + std::vector draw = {Decimal256(1), Decimal256(2), Decimal256(-1), + Decimal256(4), Decimal256(-1), Decimal256(1), + Decimal256(2)}; + Decimal256 big; // (pow(2, 255) - 1) / pow(10, 38) + ASSERT_OK_AND_ASSIGN(big, + Decimal256::FromString("578960446186580977117854925043439539266." + "34992332820282019728792003956564819967")); + draw.push_back(big); + + Decimal256 big_negative; // -pow(2, 255) / pow(10, 38) + ASSERT_OK_AND_ASSIGN(big_negative, + Decimal256::FromString("-578960446186580977117854925043439539266." 
+ "34992332820282019728792003956564819968")); + draw.push_back(big_negative); + + std::vector valid_bytes = {true, true, false, true, false, + true, true, true, true}; + this->TestCreate(precision, draw, valid_bytes, 0); + this->TestCreate(precision, draw, valid_bytes, 2); +} + +INSTANTIATE_TEST_SUITE_P(Decimal256Test, Decimal256Test, + ::testing::Values(1, 2, 5, 10, 38, 39, 40, 75, 76)); // ---------------------------------------------------------------------- // Test rechunking diff --git a/cpp/src/arrow/array/array_union_test.cc b/cpp/src/arrow/array/array_union_test.cc index a32b8b868de..1eb722b13c5 100644 --- a/cpp/src/arrow/array/array_union_test.cc +++ b/cpp/src/arrow/array/array_union_test.cc @@ -20,6 +20,8 @@ #include #include "arrow/array.h" +#include "arrow/array/builder_nested.h" +#include "arrow/array/builder_union.h" // TODO ipc shouldn't be included here #include "arrow/ipc/test_common.h" #include "arrow/testing/gtest_util.h" @@ -307,7 +309,24 @@ class UnionBuilderTest : public ::testing::Test { AppendString("def"); AppendInt(-10); AppendDouble(0.5); + + ASSERT_OK(union_builder->Finish(&actual)); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector(expected_types_vector, &expected_types); + } + + void AppendNullsAndEmptyValues() { + AppendString("abc"); + ASSERT_OK(union_builder->AppendNull()); + ASSERT_OK(union_builder->AppendEmptyValue()); + expected_types_vector.insert(expected_types_vector.end(), 3, I8); + AppendInt(42); + ASSERT_OK(union_builder->AppendNulls(2)); + ASSERT_OK(union_builder->AppendEmptyValues(2)); + expected_types_vector.insert(expected_types_vector.end(), 3, I8); + ASSERT_OK(union_builder->Finish(&actual)); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector(expected_types_vector, &expected_types); } @@ -329,7 +348,9 @@ class UnionBuilderTest : public ::testing::Test { AppendDouble(1.0); AppendDouble(-1.0); AppendDouble(0.5); + ASSERT_OK(union_builder->Finish(&actual)); + ASSERT_OK(actual->ValidateFull()); ArrayFromVector(expected_types_vector, &expected_types); ASSERT_EQ(I8, 0); @@ -357,6 +378,7 @@ class UnionBuilderTest : public ::testing::Test { AppendDouble(0.5); ASSERT_OK(list_builder.Finish(actual)); + ASSERT_OK((*actual)->ValidateFull()); ArrayFromVector(expected_types_vector, &expected_types); } @@ -376,20 +398,20 @@ class SparseUnionBuilderTest : public UnionBuilderTest { void AppendInt(int8_t i) override { Base::AppendInt(i); - ASSERT_OK(str_builder->AppendNull()); - ASSERT_OK(dbl_builder->AppendNull()); + ASSERT_OK(str_builder->AppendEmptyValue()); + ASSERT_OK(dbl_builder->AppendEmptyValue()); } void AppendString(const std::string& str) override { Base::AppendString(str); - ASSERT_OK(i8_builder->AppendNull()); - ASSERT_OK(dbl_builder->AppendNull()); + ASSERT_OK(i8_builder->AppendEmptyValue()); + ASSERT_OK(dbl_builder->AppendEmptyValue()); } void AppendDouble(double dbl) override { Base::AppendDouble(dbl); - ASSERT_OK(i8_builder->AppendNull()); - ASSERT_OK(str_builder->AppendNull()); + ASSERT_OK(i8_builder->AppendEmptyValue()); + ASSERT_OK(str_builder->AppendEmptyValue()); } }; @@ -415,6 +437,34 @@ TEST_F(DenseUnionBuilderTest, Basics) { ASSERT_ARRAYS_EQUAL(*expected, *actual); } +TEST_F(DenseUnionBuilderTest, NullsAndEmptyValues) { + union_builder.reset(new DenseUnionBuilder( + default_memory_pool(), {i8_builder, str_builder, dbl_builder}, + dense_union({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, + {I8, STR, DBL}))); + AppendNullsAndEmptyValues(); + + // Four null / empty values (the latter implementation-defined) 
were appended to I8 + auto expected_i8 = ArrayFromJSON(int8(), "[null, 0, 42, null, 0]"); + auto expected_str = ArrayFromJSON(utf8(), R"(["abc"])"); + auto expected_dbl = ArrayFromJSON(float64(), "[]"); + + // "abc", null, 0, 42, null, null, 0, 0 + auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 1, 2, 3, 3, 4, 4]"); + + ASSERT_OK_AND_ASSIGN(auto expected, + DenseUnionArray::Make(*expected_types, *expected_offsets, + {expected_i8, expected_str, expected_dbl}, + {"i8", "str", "dbl"}, {I8, STR, DBL})); + + ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString()); + ASSERT_ARRAYS_EQUAL(*expected, *actual); + // Physical arrays must be as expected + ASSERT_ARRAYS_EQUAL(*expected_i8, *actual->field(0)); + ASSERT_ARRAYS_EQUAL(*expected_str, *actual->field(1)); + ASSERT_ARRAYS_EQUAL(*expected_dbl, *actual->field(2)); +} + TEST_F(DenseUnionBuilderTest, InferredType) { AppendInferred(); @@ -467,6 +517,32 @@ TEST_F(SparseUnionBuilderTest, Basics) { ASSERT_ARRAYS_EQUAL(*expected, *actual); } +TEST_F(SparseUnionBuilderTest, NullsAndEmptyValues) { + union_builder.reset(new SparseUnionBuilder( + default_memory_pool(), {i8_builder, str_builder, dbl_builder}, + sparse_union({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, + {I8, STR, DBL}))); + AppendNullsAndEmptyValues(); + + // "abc", null, 0, 42, null, null, 0, 0 + // (note that getting 0 for empty values is implementation-defined) + auto expected_i8 = ArrayFromJSON(int8(), "[0, null, 0, 42, null, null, 0, 0]"); + auto expected_str = ArrayFromJSON(utf8(), R"(["abc", "", "", "", "", "", "", ""])"); + auto expected_dbl = ArrayFromJSON(float64(), "[0, 0, 0, 0, 0, 0, 0, 0]"); + + ASSERT_OK_AND_ASSIGN( + auto expected, + SparseUnionArray::Make(*expected_types, {expected_i8, expected_str, expected_dbl}, + {"i8", "str", "dbl"}, {I8, STR, DBL})); + + ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString()); + ASSERT_ARRAYS_EQUAL(*expected, *actual); + // Physical arrays must be as expected + ASSERT_ARRAYS_EQUAL(*expected_i8, *actual->field(0)); + ASSERT_ARRAYS_EQUAL(*expected_str, *actual->field(1)); + ASSERT_ARRAYS_EQUAL(*expected_dbl, *actual->field(2)); +} + TEST_F(SparseUnionBuilderTest, InferredType) { AppendInferred(); diff --git a/cpp/src/arrow/array/builder_adaptive.cc b/cpp/src/arrow/array/builder_adaptive.cc index 47880e91663..36e5546a749 100644 --- a/cpp/src/arrow/array/builder_adaptive.cc +++ b/cpp/src/arrow/array/builder_adaptive.cc @@ -33,7 +33,8 @@ namespace arrow { using internal::AdaptiveIntBuilderBase; -AdaptiveIntBuilderBase::AdaptiveIntBuilderBase(MemoryPool* pool) : ArrayBuilder(pool) {} +AdaptiveIntBuilderBase::AdaptiveIntBuilderBase(uint8_t start_int_size, MemoryPool* pool) + : ArrayBuilder(pool), start_int_size_(start_int_size), int_size_(start_int_size) {} void AdaptiveIntBuilderBase::Reset() { ArrayBuilder::Reset(); @@ -41,7 +42,7 @@ void AdaptiveIntBuilderBase::Reset() { raw_data_ = nullptr; pending_pos_ = 0; pending_has_nulls_ = false; - int_size_ = sizeof(uint8_t); + int_size_ = start_int_size_; } Status AdaptiveIntBuilderBase::Resize(int64_t capacity) { @@ -124,7 +125,8 @@ std::shared_ptr AdaptiveIntBuilder::type() const { return nullptr; } -AdaptiveIntBuilder::AdaptiveIntBuilder(MemoryPool* pool) : AdaptiveIntBuilderBase(pool) {} +AdaptiveIntBuilder::AdaptiveIntBuilder(uint8_t start_int_size, MemoryPool* pool) + : AdaptiveIntBuilderBase(start_int_size, pool) {} Status AdaptiveIntBuilder::FinishInternal(std::shared_ptr* out) { RETURN_NOT_OK(CommitPendingData()); @@ -264,8 +266,8 @@ 
Status AdaptiveIntBuilder::ExpandIntSize(uint8_t new_int_size) { return Status::OK(); } -AdaptiveUIntBuilder::AdaptiveUIntBuilder(MemoryPool* pool) - : AdaptiveIntBuilderBase(pool) {} +AdaptiveUIntBuilder::AdaptiveUIntBuilder(uint8_t start_int_size, MemoryPool* pool) + : AdaptiveIntBuilderBase(start_int_size, pool) {} Status AdaptiveUIntBuilder::FinishInternal(std::shared_ptr* out) { RETURN_NOT_OK(CommitPendingData()); diff --git a/cpp/src/arrow/array/builder_adaptive.h b/cpp/src/arrow/array/builder_adaptive.h index 1bce339433d..c0df797256d 100644 --- a/cpp/src/arrow/array/builder_adaptive.h +++ b/cpp/src/arrow/array/builder_adaptive.h @@ -35,7 +35,10 @@ namespace internal { class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { public: - explicit AdaptiveIntBuilderBase(MemoryPool* pool); + AdaptiveIntBuilderBase(uint8_t start_int_size, MemoryPool* pool); + + explicit AdaptiveIntBuilderBase(MemoryPool* pool) + : AdaptiveIntBuilderBase(sizeof(uint8_t), pool) {} /// \brief Append multiple nulls /// \param[in] length the number of nulls to append @@ -61,6 +64,26 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { return Status::OK(); } + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(CommitPendingData()); + ARROW_RETURN_NOT_OK(Reserve(length)); + memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); + UnsafeSetNotNull(length); + return Status::OK(); + } + + Status AppendEmptyValue() final { + pending_data_[pending_pos_] = 0; + pending_valid_[pending_pos_] = 1; + ++pending_pos_; + ++length_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + void Reset() override; Status Resize(int64_t capacity) override; @@ -88,7 +111,9 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { std::shared_ptr data_; uint8_t* raw_data_ = NULLPTR; - uint8_t int_size_ = sizeof(uint8_t); + + const uint8_t start_int_size_; + uint8_t int_size_; static constexpr int32_t pending_size_ = 1024; uint8_t pending_valid_[pending_size_]; @@ -101,7 +126,11 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase { public: - explicit AdaptiveUIntBuilder(MemoryPool* pool = default_memory_pool()); + explicit AdaptiveUIntBuilder(uint8_t start_int_size, + MemoryPool* pool = default_memory_pool()); + + explicit AdaptiveUIntBuilder(MemoryPool* pool = default_memory_pool()) + : AdaptiveUIntBuilder(sizeof(uint8_t), pool) {} using ArrayBuilder::Advance; using internal::AdaptiveIntBuilderBase::Reset; @@ -135,7 +164,11 @@ class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase { public: - explicit AdaptiveIntBuilder(MemoryPool* pool = default_memory_pool()); + explicit AdaptiveIntBuilder(uint8_t start_int_size, + MemoryPool* pool = default_memory_pool()); + + explicit AdaptiveIntBuilder(MemoryPool* pool = default_memory_pool()) + : AdaptiveIntBuilder(sizeof(uint8_t), pool) {} using ArrayBuilder::Advance; using internal::AdaptiveIntBuilderBase::Reset; diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 4c21859fae3..b92cc285894 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -29,6 +29,22 @@ namespace arrow { +Status ArrayBuilder::CheckArrayType(const std::shared_ptr& expected_type, + const Array& array, 
const char* message) { + if (!expected_type->Equals(*array.type())) { + return Status::TypeError(message); + } + return Status::OK(); +} + +Status ArrayBuilder::CheckArrayType(Type::type expected_type, const Array& array, + const char* message) { + if (array.type_id() != expected_type) { + return Status::TypeError(message); + } + return Status::OK(); +} + Status ArrayBuilder::TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer) { if (buffer) { if (bytes_filled < buffer->size()) { @@ -83,6 +99,12 @@ Status ArrayBuilder::Finish(std::shared_ptr* out) { return Status::OK(); } +Result> ArrayBuilder::Finish() { + std::shared_ptr out; + RETURN_NOT_OK(Finish(&out)); + return out; +} + void ArrayBuilder::Reset() { capacity_ = length_ = null_count_ = 0; null_bitmap_builder_.Reset(); diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 8d327b713b6..15c726241b5 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -29,7 +29,7 @@ #include "arrow/buffer.h" #include "arrow/buffer_builder.h" #include "arrow/status.h" -#include "arrow/type.h" +#include "arrow/type_fwd.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" @@ -56,6 +56,8 @@ class ARROW_EXPORT ArrayBuilder { /// skip shared pointers and just return a raw pointer ArrayBuilder* child(int i) { return children_[i].get(); } + const std::shared_ptr& child_builder(int i) const { return children_[i]; } + int num_children() const { return static_cast(children_.size()); } virtual int64_t length() const { return length_; } @@ -95,9 +97,25 @@ class ARROW_EXPORT ArrayBuilder { /// Reset the builder. virtual void Reset(); + /// \brief Append a null value to builder virtual Status AppendNull() = 0; + /// \brief Append a number of null values to builder virtual Status AppendNulls(int64_t length) = 0; + /// \brief Append a non-null value to builder + /// + /// The appended value is an implementation detail, but the corresponding + /// memory slot is guaranteed to be initialized. + /// This method is useful when appending a null value to a parent nested type. + virtual Status AppendEmptyValue() = 0; + + /// \brief Append a number of non-null values to builder + /// + /// The appended values are an implementation detail, but the corresponding + /// memory slots are guaranteed to be initialized. + /// This method is useful when appending null values to a parent nested type. + virtual Status AppendEmptyValues(int64_t length) = 0; + /// For cases where raw data was memcpy'd into the internal buffers, allows us /// to advance the length of the builder. It is your responsibility to use /// this function responsibly. @@ -118,6 +136,13 @@ class ARROW_EXPORT ArrayBuilder { /// \return Status Status Finish(std::shared_ptr* out); + /// \brief Return result of builder as an Array object. + /// + /// The builder is reset except for DictionaryBuilder.
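A hedged illustration of the Result-returning overload declared here (assumed caller code): it composes with ARROW_ASSIGN_OR_RAISE instead of requiring an out-parameter.

#include <memory>

#include "arrow/array/builder_primitive.h"
#include "arrow/result.h"

arrow::Status FinishAsResult() {
  arrow::Int64Builder builder;
  ARROW_RETURN_NOT_OK(builder.Append(42));
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> array, builder.Finish());
  // The builder is now reset and can be reused for the next array.
  return array ? arrow::Status::OK() : arrow::Status::UnknownError("no array");
}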
+ /// + /// \return The finalized Array object + Result> Finish(); + /// \brief Return the type of the built Array virtual std::shared_ptr type() const = 0; @@ -200,6 +225,12 @@ class ARROW_EXPORT ArrayBuilder { return Status::OK(); } + // Check for array type + Status CheckArrayType(const std::shared_ptr& expected_type, + const Array& array, const char* message); + Status CheckArrayType(Type::type expected_type, const Array& array, + const char* message); + MemoryPool* pool_; TypedBufferBuilder null_bitmap_builder_; @@ -216,4 +247,24 @@ class ARROW_EXPORT ArrayBuilder { ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); }; +/// \brief Construct an empty ArrayBuilder corresponding to the data +/// type +/// \param[in] pool the MemoryPool to use for allocations +/// \param[in] type the data type to create the builder for +/// \param[out] out the created ArrayBuilder +ARROW_EXPORT +Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, + std::unique_ptr* out); + +/// \brief Construct an empty DictionaryBuilder initialized optionally +/// with a pre-existing dictionary +/// \param[in] pool the MemoryPool to use for allocations +/// \param[in] type the dictionary type to create the builder for +/// \param[in] dictionary the initial dictionary, if any. May be nullptr +/// \param[out] out the created ArrayBuilder +ARROW_EXPORT +Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr& type, + const std::shared_ptr& dictionary, + std::unique_ptr* out); + } // namespace arrow diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index ecb0e95fb44..6822dc89903 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -73,6 +73,20 @@ Status FixedSizeBinaryBuilder::AppendNulls(int64_t length) { return Status::OK(); } +Status FixedSizeBinaryBuilder::AppendEmptyValue() { + RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(true); + byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0); + return Status::OK(); +} + +Status FixedSizeBinaryBuilder::AppendEmptyValues(int64_t length) { + RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(length, true); + byte_builder_.UnsafeAppend(/*num_copies=*/length * byte_width_, 0); + return Status::OK(); +} + void FixedSizeBinaryBuilder::Reset() { ArrayBuilder::Reset(); byte_builder_.Reset(); diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 593b533a19c..bc49c7d6787 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -61,6 +61,7 @@ class BaseBinaryBuilder : public ArrayBuilder { ARROW_RETURN_NOT_OK(AppendNextOffset()); // Safety check for UBSAN. 
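A minimal sketch of the relocated factories in use (assumed caller code; only the declaration site changed, from builder.h to builder_base.h):

#include <memory>

#include "arrow/array/builder_base.h"
#include "arrow/memory_pool.h"
#include "arrow/type.h"

arrow::Status MakeTwoBuilders() {
  std::unique_ptr<arrow::ArrayBuilder> plain, dict;
  ARROW_RETURN_NOT_OK(
      arrow::MakeBuilder(arrow::default_memory_pool(), arrow::int64(), &plain));
  // Passing a null initial dictionary: the memo table starts empty.
  ARROW_RETURN_NOT_OK(arrow::MakeDictionaryBuilder(
      arrow::default_memory_pool(), arrow::dictionary(arrow::int16(), arrow::utf8()),
      /*dictionary=*/nullptr, &dict));
  return arrow::Status::OK();
}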
if (ARROW_PREDICT_TRUE(length > 0)) { + ARROW_RETURN_NOT_OK(ValidateOverflow(length)); ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length)); } @@ -78,9 +79,6 @@ class BaseBinaryBuilder : public ArrayBuilder { Status AppendNulls(int64_t length) final { const int64_t num_bytes = value_data_builder_.length(); - if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) { - return AppendOverflow(num_bytes); - } ARROW_RETURN_NOT_OK(Reserve(length)); for (int64_t i = 0; i < length; ++i) { offsets_builder_.UnsafeAppend(static_cast(num_bytes)); @@ -96,6 +94,23 @@ class BaseBinaryBuilder : public ArrayBuilder { return Status::OK(); } + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(AppendNextOffset()); + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + Status AppendEmptyValues(int64_t length) final { + const int64_t num_bytes = value_data_builder_.length(); + ARROW_RETURN_NOT_OK(Reserve(length)); + for (int64_t i = 0; i < length; ++i) { + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + } + UnsafeAppendToBitmap(length, true); + return Status::OK(); + } + /// \brief Append without checking capacity /// /// Offsets and data should have been presized using Reserve() and @@ -232,6 +247,16 @@ class BaseBinaryBuilder : public ArrayBuilder { value_data_builder_.Reset(); } + Status ValidateOverflow(int64_t new_bytes) { + auto new_size = value_data_builder_.length() + new_bytes; + if (ARROW_PREDICT_FALSE(new_size > memory_limit())) { + return Status::CapacityError("array cannot contain more than ", memory_limit(), + " bytes, have ", new_size); + } else { + return Status::OK(); + } + } + Status Resize(int64_t capacity) override { // XXX Why is this check necessary? There is no reason to disallow, say, // binary arrays with more than 2**31 empty or null values. @@ -249,12 +274,8 @@ class BaseBinaryBuilder : public ArrayBuilder { /// \brief Ensures there is enough allocated capacity to append the indicated /// number of bytes to the value data buffer without additional allocations Status ReserveData(int64_t elements) { - const int64_t size = value_data_length() + elements; - ARROW_RETURN_IF(size > memory_limit(), - Status::CapacityError("Cannot reserve capacity larger than ", - memory_limit(), " bytes")); - return (size > value_data_capacity()) ? 
value_data_builder_.Reserve(elements) - : Status::OK(); + ARROW_RETURN_NOT_OK(ValidateOverflow(elements)); + return value_data_builder_.Reserve(elements); } Status FinishInternal(std::shared_ptr* out) override { @@ -317,16 +338,8 @@ class BaseBinaryBuilder : public ArrayBuilder { TypedBufferBuilder offsets_builder_; TypedBufferBuilder value_data_builder_; - Status AppendOverflow(int64_t num_bytes) { - return Status::CapacityError("array cannot contain more than ", memory_limit(), - " bytes, have ", num_bytes); - } - Status AppendNextOffset() { const int64_t num_bytes = value_data_builder_.length(); - if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) { - return AppendOverflow(num_bytes); - } return offsets_builder_.Append(static_cast(num_bytes)); } @@ -440,9 +453,11 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { const uint8_t* valid_bytes = NULLPTR); Status AppendNull() final; - Status AppendNulls(int64_t length) final; + Status AppendEmptyValue() final; + Status AppendEmptyValues(int64_t length) final; + void UnsafeAppend(const uint8_t* value) { UnsafeAppendToBitmap(true); if (ARROW_PREDICT_TRUE(byte_width_ > 0)) { @@ -450,6 +465,10 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { } } + void UnsafeAppend(const char* value) { + UnsafeAppend(reinterpret_cast(value)); + } + void UnsafeAppend(util::string_view value) { #ifndef NDEBUG CheckValueSize(static_cast(value.size())); @@ -462,6 +481,23 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0); } + Status ValidateOverflow(int64_t new_bytes) const { + auto new_size = byte_builder_.length() + new_bytes; + if (ARROW_PREDICT_FALSE(new_size > memory_limit())) { + return Status::CapacityError("array cannot contain more than ", memory_limit(), + " bytes, have ", new_size); + } else { + return Status::OK(); + } + } + + /// \brief Ensures there is enough allocated capacity to append the indicated + /// number of bytes to the value data buffer without additional allocations + Status ReserveData(int64_t elements) { + ARROW_RETURN_NOT_OK(ValidateOverflow(elements)); + return byte_builder_.Reserve(elements); + } + void Reset() override; Status Resize(int64_t capacity) override; Status FinishInternal(std::shared_ptr* out) override; diff --git a/cpp/src/arrow/array/builder_decimal.cc b/cpp/src/arrow/array/builder_decimal.cc index ea5c9ebd0c3..bd7615a7309 100644 --- a/cpp/src/arrow/array/builder_decimal.cc +++ b/cpp/src/arrow/array/builder_decimal.cc @@ -67,4 +67,39 @@ Status Decimal128Builder::FinishInternal(std::shared_ptr* out) { return Status::OK(); } +// ---------------------------------------------------------------------- +// Decimal256Builder + +Decimal256Builder::Decimal256Builder(const std::shared_ptr& type, + MemoryPool* pool) + : FixedSizeBinaryBuilder(type, pool), + decimal_type_(internal::checked_pointer_cast(type)) {} + +Status Decimal256Builder::Append(const Decimal256& value) { + RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1)); + UnsafeAppend(value); + return Status::OK(); +} + +void Decimal256Builder::UnsafeAppend(const Decimal256& value) { + value.ToBytes(GetMutableValue(length())); + byte_builder_.UnsafeAdvance(32); + UnsafeAppendToBitmap(true); +} + +void Decimal256Builder::UnsafeAppend(util::string_view value) { + FixedSizeBinaryBuilder::UnsafeAppend(value); +} + +Status Decimal256Builder::FinishInternal(std::shared_ptr* out) { + std::shared_ptr data; + RETURN_NOT_OK(byte_builder_.Finish(&data)); + std::shared_ptr 
null_bitmap; + RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); + + *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_); + capacity_ = length_ = null_count_ = 0; + return Status::OK(); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/builder_decimal.h b/cpp/src/arrow/array/builder_decimal.h index 8f0ff83288c..8c75e7dd674 100644 --- a/cpp/src/arrow/array/builder_decimal.h +++ b/cpp/src/arrow/array/builder_decimal.h @@ -58,6 +58,35 @@ class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder { std::shared_ptr decimal_type_; }; +class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder { + public: + using TypeClass = Decimal256Type; + + explicit Decimal256Builder(const std::shared_ptr& type, + MemoryPool* pool = default_memory_pool()); + + using FixedSizeBinaryBuilder::Append; + using FixedSizeBinaryBuilder::AppendValues; + using FixedSizeBinaryBuilder::Reset; + + Status Append(const Decimal256& val); + void UnsafeAppend(const Decimal256& val); + void UnsafeAppend(util::string_view val); + + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { return decimal_type_; } + + protected: + std::shared_ptr decimal_type_; +}; + using DecimalBuilder = Decimal128Builder; } // namespace arrow diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index 54fd94856ea..b13f6a2db34 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -18,6 +18,7 @@ #include "arrow/array/builder_dict.h" #include +#include #include "arrow/array/dict_internal.h" #include "arrow/status.h" @@ -44,7 +45,7 @@ class DictionaryMemoTable::DictionaryMemoTableImpl { template enable_if_no_memoize Visit(const T&) { - return Status::NotImplemented("Initialization of ", value_type_, + return Status::NotImplemented("Initialization of ", value_type_->ToString(), " memo table is not implemented"); } @@ -68,21 +69,20 @@ class DictionaryMemoTable::DictionaryMemoTableImpl { } private: - template - enable_if_no_memoize InsertValues(const DType& type, - const ArrayType&) { + template + enable_if_no_memoize InsertValues(const T& type, const ArrayType&) { return Status::NotImplemented("Inserting array values of ", type, " is not implemented"); } - template - enable_if_memoize InsertValues(const DType&, const ArrayType& array) { + template + enable_if_memoize InsertValues(const T&, const ArrayType& array) { if (array.null_count() > 0) { return Status::Invalid("Cannot insert dictionary values containing nulls"); } for (int64_t i = 0; i < array.length(); ++i) { int32_t unused_memo_index; - RETURN_NOT_OK(impl_->GetOrInsert(array.GetView(i), &unused_memo_index)); + RETURN_NOT_OK(impl_->GetOrInsert(array.GetView(i), &unused_memo_index)); } return Status::OK(); } @@ -112,8 +112,8 @@ class DictionaryMemoTable::DictionaryMemoTableImpl { }; public: - DictionaryMemoTableImpl(MemoryPool* pool, const std::shared_ptr& type) - : pool_(pool), type_(type), memo_table_(nullptr) { + DictionaryMemoTableImpl(MemoryPool* pool, std::shared_ptr type) + : pool_(pool), type_(std::move(type)), memo_table_(nullptr) { MemoTableInitializer visitor{type_, pool_, &memo_table_}; ARROW_CHECK_OK(VisitTypeInline(*type_, &visitor)); } @@ -127,9 +127,10 @@ class DictionaryMemoTable::DictionaryMemoTableImpl { return VisitTypeInline(*array.type(), &visitor); } - template - Status 
GetOrInsert(const T& value, int32_t* out) { - using ConcreteMemoTable = typename DictionaryCTraits::MemoTableType; + template ::type> + Status GetOrInsert(CType value, int32_t* out) { + using ConcreteMemoTable = typename DictionaryTraits::MemoTableType; return checked_cast(memo_table_.get())->GetOrInsert(value, out); } @@ -158,9 +159,10 @@ DictionaryMemoTable::DictionaryMemoTable(MemoryPool* pool, DictionaryMemoTable::~DictionaryMemoTable() = default; -#define GET_OR_INSERT(C_TYPE) \ - Status DictionaryMemoTable::GetOrInsert(C_TYPE value, int32_t* out) { \ - return impl_->GetOrInsert(value, out); \ +#define GET_OR_INSERT(C_TYPE) \ + Status DictionaryMemoTable::GetOrInsert( \ + const typename CTypeTraits::ArrowType*, C_TYPE value, int32_t* out) { \ + return impl_->GetOrInsert::ArrowType>(value, out); \ } GET_OR_INSERT(bool) @@ -174,10 +176,19 @@ GET_OR_INSERT(uint32_t) GET_OR_INSERT(uint64_t) GET_OR_INSERT(float) GET_OR_INSERT(double) -GET_OR_INSERT(util::string_view) #undef GET_OR_INSERT +Status DictionaryMemoTable::GetOrInsert(const BinaryType*, util::string_view value, + int32_t* out) { + return impl_->GetOrInsert(value, out); +} + +Status DictionaryMemoTable::GetOrInsert(const LargeBinaryType*, util::string_view value, + int32_t* out) { + return impl_->GetOrInsert(value, out); +} + Status DictionaryMemoTable::GetArrayData(int64_t start_offset, std::shared_ptr* out) { return impl_->GetArrayData(start_offset, out); diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index 9a0f268c4d0..40d6ce1ba9a 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -32,6 +32,8 @@ #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" @@ -42,24 +44,24 @@ namespace arrow { namespace internal { -template -struct DictionaryScalar { +template +struct DictionaryValue { using type = typename T::c_type; + using PhysicalType = T; }; -template <> -struct DictionaryScalar { - using type = util::string_view; -}; - -template <> -struct DictionaryScalar { +template +struct DictionaryValue> { using type = util::string_view; + using PhysicalType = + typename std::conditional::value, + BinaryType, LargeBinaryType>::type; }; -template <> -struct DictionaryScalar { +template +struct DictionaryValue> { using type = util::string_view; + using PhysicalType = BinaryType; }; class ARROW_EXPORT DictionaryMemoTable { @@ -68,19 +70,6 @@ class ARROW_EXPORT DictionaryMemoTable { DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr& dictionary); ~DictionaryMemoTable(); - Status GetOrInsert(bool value, int32_t* out); - Status GetOrInsert(int8_t value, int32_t* out); - Status GetOrInsert(int16_t value, int32_t* out); - Status GetOrInsert(int32_t value, int32_t* out); - Status GetOrInsert(int64_t value, int32_t* out); - Status GetOrInsert(uint8_t value, int32_t* out); - Status GetOrInsert(uint16_t value, int32_t* out); - Status GetOrInsert(uint32_t value, int32_t* out); - Status GetOrInsert(uint64_t value, int32_t* out); - Status GetOrInsert(float value, int32_t* out); - Status GetOrInsert(double value, int32_t* out); - Status GetOrInsert(util::string_view value, int32_t* out); - Status GetArrayData(int64_t start_offset, std::shared_ptr* out); /// \brief Insert new memo values @@ -88,7 +77,31 @@ class ARROW_EXPORT DictionaryMemoTable { int32_t size() const; + template + Status GetOrInsert(typename 
DictionaryValue::type value, int32_t* out) { + // We want to keep the DictionaryMemoTable implementation private; also, we can't + // use extern template classes because of compiler issues (MinGW?). Instead, + // we expose explicit function overloads for each supported physical type. + const typename DictionaryValue::PhysicalType* physical_type = NULLPTR; + return GetOrInsert(physical_type, value, out); + } + private: + Status GetOrInsert(const BooleanType*, bool value, int32_t* out); + Status GetOrInsert(const Int8Type*, int8_t value, int32_t* out); + Status GetOrInsert(const Int16Type*, int16_t value, int32_t* out); + Status GetOrInsert(const Int32Type*, int32_t value, int32_t* out); + Status GetOrInsert(const Int64Type*, int64_t value, int32_t* out); + Status GetOrInsert(const UInt8Type*, uint8_t value, int32_t* out); + Status GetOrInsert(const UInt16Type*, uint16_t value, int32_t* out); + Status GetOrInsert(const UInt32Type*, uint32_t value, int32_t* out); + Status GetOrInsert(const UInt64Type*, uint64_t value, int32_t* out); + Status GetOrInsert(const FloatType*, float value, int32_t* out); + Status GetOrInsert(const DoubleType*, double value, int32_t* out); + + Status GetOrInsert(const BinaryType*, util::string_view value, int32_t* out); + Status GetOrInsert(const LargeBinaryType*, util::string_view value, int32_t* out); + class DictionaryMemoTableImpl; std::unique_ptr impl_; }; @@ -101,15 +114,30 @@ class ARROW_EXPORT DictionaryMemoTable { template class DictionaryBuilderBase : public ArrayBuilder { public: - using Scalar = typename DictionaryScalar::type; + using TypeClass = DictionaryType; + using Value = typename DictionaryValue::type; // WARNING: the type given below is the value type, not the DictionaryType. // The DictionaryType is instantiated on the Finish() call. - template - DictionaryBuilderBase(enable_if_t::value, + template + DictionaryBuilderBase(uint8_t start_int_size, + enable_if_t::value && + !is_fixed_size_binary_type::value, const std::shared_ptr&> value_type, MemoryPool* pool = default_memory_pool()) + : ArrayBuilder(pool), + memo_table_(new internal::DictionaryMemoTable(pool, value_type)), + delta_offset_(0), + byte_width_(-1), + indices_builder_(start_int_size, pool), + value_type_(value_type) {} + + template + explicit DictionaryBuilderBase( + enable_if_t::value, const std::shared_ptr&> + value_type, + MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool), memo_table_(new internal::DictionaryMemoTable(pool, value_type)), delta_offset_(0), @@ -117,6 +145,20 @@ class DictionaryBuilderBase : public ArrayBuilder { indices_builder_(pool), value_type_(value_type) {} + template + DictionaryBuilderBase(uint8_t start_int_size, + enable_if_t::value && + is_fixed_size_binary_type::value, + const std::shared_ptr&> + value_type, + MemoryPool* pool = default_memory_pool()) + : ArrayBuilder(pool), + memo_table_(new internal::DictionaryMemoTable(pool, value_type)), + delta_offset_(0), + byte_width_(static_cast(*value_type).byte_width()), + indices_builder_(start_int_size, pool), + value_type_(value_type) {} + template explicit DictionaryBuilderBase( enable_if_fixed_size_binary&> value_type, @@ -134,8 +176,8 @@ class DictionaryBuilderBase : public ArrayBuilder { : DictionaryBuilderBase(TypeTraits::type_singleton(), pool) {} // This constructor doesn't check for errors. Use InsertMemoValues instead.
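An illustrative sketch (assumed usage; BinaryDictionaryBuilder is the DictionaryBuilder<BinaryType> alias): scalar appends are dispatched through the physical-type GetOrInsert overloads above, so repeated values are memoized to the same index.

#include <memory>

#include "arrow/array/builder_dict.h"

arrow::Status BuildBinaryDictionary() {
  arrow::BinaryDictionaryBuilder builder(arrow::default_memory_pool());
  ARROW_RETURN_NOT_OK(builder.Append("foo"));  // inserts index 0
  ARROW_RETURN_NOT_OK(builder.Append("bar"));  // inserts index 1
  ARROW_RETURN_NOT_OK(builder.Append("foo"));  // memo hit: index 0 again
  std::shared_ptr<arrow::Array> out;
  ARROW_RETURN_NOT_OK(builder.Finish(&out));  // dictionary<values=binary, indices=int8>
  return arrow::Status::OK();
}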
- DictionaryBuilderBase(const std::shared_ptr& dictionary, - MemoryPool* pool = default_memory_pool()) + explicit DictionaryBuilderBase(const std::shared_ptr& dictionary, + MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool), memo_table_(new internal::DictionaryMemoTable(pool, dictionary)), delta_offset_(0), @@ -148,12 +190,18 @@ class DictionaryBuilderBase : public ArrayBuilder { /// \brief The current number of entries in the dictionary int64_t dictionary_length() const { return memo_table_->size(); } + /// \brief The value byte width (for FixedSizeBinaryType) + template + enable_if_fixed_size_binary byte_width() const { + return byte_width_; + } + /// \brief Append a scalar value - Status Append(const Scalar& value) { + Status Append(Value value) { ARROW_RETURN_NOT_OK(Reserve(1)); int32_t memo_index; - ARROW_RETURN_NOT_OK(memo_table_->GetOrInsert(value, &memo_index)); + ARROW_RETURN_NOT_OK(memo_table_->GetOrInsert(value, &memo_index)); ARROW_RETURN_NOT_OK(indices_builder_.Append(memo_index)); length_ += 1; @@ -190,6 +238,22 @@ class DictionaryBuilderBase : public ArrayBuilder { return Append(util::string_view(value, length)); } + /// \brief Append a decimal (only for Decimal128Type) + template + enable_if_decimal128 Append(const Decimal128& value) { + uint8_t data[16]; + value.ToBytes(data); + return Append(data, 16); + } + + /// \brief Append a decimal (only for Decimal256Type) + template + enable_if_decimal256 Append(const Decimal256& value) { + uint8_t data[32]; + value.ToBytes(data); + return Append(data, 32); + } + /// \brief Append a scalar null value Status AppendNull() final { length_ += 1; @@ -205,6 +269,18 @@ class DictionaryBuilderBase : public ArrayBuilder { return indices_builder_.AppendNulls(length); } + Status AppendEmptyValue() final { + length_ += 1; + + return indices_builder_.AppendEmptyValue(); + } + + Status AppendEmptyValues(int64_t length) final { + length_ += length; + + return indices_builder_.AppendEmptyValues(length); + } + /// \brief Insert values into the dictionary's memo, but do not append any /// indices.
Can be used to initialize a new builder with known dictionary /// values @@ -220,6 +296,11 @@ class DictionaryBuilderBase : public ArrayBuilder { const Array& array) { using ArrayType = typename TypeTraits::ArrayType; +#ifndef NDEBUG + ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType( + value_type_, array, "Wrong value type of array to be appended")); +#endif + const auto& concrete_array = static_cast(array); for (int64_t i = 0; i < array.length(); i++) { if (array.IsNull(i)) { @@ -233,10 +314,10 @@ class DictionaryBuilderBase : public ArrayBuilder { template enable_if_fixed_size_binary AppendArray(const Array& array) { - if (!value_type_->Equals(*array.type())) { - return Status::Invalid( - "Cannot append FixedSizeBinary array with non-matching type"); - } +#ifndef NDEBUG + ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType( + value_type_, array, "Wrong value type of array to be appended")); +#endif const auto& concrete_array = static_cast(array); for (int64_t i = 0; i < array.length(); i++) { @@ -335,15 +416,30 @@ class DictionaryBuilderBase : public ArrayBuilder { template class DictionaryBuilderBase : public ArrayBuilder { public: - DictionaryBuilderBase(const std::shared_ptr& value_type, - MemoryPool* pool = default_memory_pool()) + template + DictionaryBuilderBase( + enable_if_t::value, uint8_t> + start_int_size, + const std::shared_ptr& value_type, + MemoryPool* pool = default_memory_pool()) + : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {} + + explicit DictionaryBuilderBase(const std::shared_ptr& value_type, + MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool), indices_builder_(pool) {} + template + explicit DictionaryBuilderBase( + enable_if_t::value, uint8_t> + start_int_size, + MemoryPool* pool = default_memory_pool()) + : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {} + explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool), indices_builder_(pool) {} - DictionaryBuilderBase(const std::shared_ptr& dictionary, - MemoryPool* pool = default_memory_pool()) + explicit DictionaryBuilderBase(const std::shared_ptr& dictionary, + MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool), indices_builder_(pool) {} /// \brief Append a scalar null value @@ -361,8 +457,24 @@ class DictionaryBuilderBase : public ArrayBuilder { return indices_builder_.AppendNulls(length); } + Status AppendEmptyValue() final { + length_ += 1; + + return indices_builder_.AppendEmptyValue(); + } + + Status AppendEmptyValues(int64_t length) final { + length_ += length; + + return indices_builder_.AppendEmptyValues(length); + } + /// \brief Append a whole dense array to the builder Status AppendArray(const Array& array) { +#ifndef NDEBUG + ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType( + Type::NA, array, "Wrong value type of array to be appended")); +#endif for (int64_t i = 0; i < array.length(); i++) { ARROW_RETURN_NOT_OK(AppendNull()); } diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index b8af62fab14..a3bcde0381a 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -54,6 +54,18 @@ MapBuilder::MapBuilder(MemoryPool* pool, const std::shared_ptr& ke : MapBuilder(pool, key_builder, item_builder, map(key_builder->type(), item_builder->type(), keys_sorted)) {} +MapBuilder::MapBuilder(MemoryPool* pool, + const std::shared_ptr& struct_builder, + const std::shared_ptr& type) + : ArrayBuilder(pool) { + auto map_type = 
internal::checked_cast(type.get()); + keys_sorted_ = map_type->keys_sorted(); + key_builder_ = struct_builder->child_builder(0); + item_builder_ = struct_builder->child_builder(1); + list_builder_ = + std::make_shared(pool, struct_builder, struct_builder->type()); +} + Status MapBuilder::Resize(int64_t capacity) { RETURN_NOT_OK(list_builder_->Resize(capacity)); capacity_ = list_builder_->capacity(); @@ -111,6 +123,24 @@ Status MapBuilder::AppendNulls(int64_t length) { return Status::OK(); } +Status MapBuilder::AppendEmptyValue() { + DCHECK_EQ(item_builder_->length(), key_builder_->length()); + RETURN_NOT_OK(AdjustStructBuilderLength()); + RETURN_NOT_OK(list_builder_->AppendEmptyValue()); + length_ = list_builder_->length(); + null_count_ = list_builder_->null_count(); + return Status::OK(); +} + +Status MapBuilder::AppendEmptyValues(int64_t length) { + DCHECK_EQ(item_builder_->length(), key_builder_->length()); + RETURN_NOT_OK(AdjustStructBuilderLength()); + RETURN_NOT_OK(list_builder_->AppendEmptyValues(length)); + length_ = list_builder_->length(); + null_count_ = list_builder_->null_count(); + return Status::OK(); +} + Status MapBuilder::AdjustStructBuilderLength() { // If key/item builders have been appended, adjust struct builder length // to match. Struct and key are non-nullable, append all valid values. @@ -170,6 +200,31 @@ Status FixedSizeListBuilder::AppendNulls(int64_t length) { return value_builder_->AppendNulls(list_size_ * length); } +Status FixedSizeListBuilder::ValidateOverflow(int64_t new_elements) { + auto new_length = value_builder_->length() + new_elements; + if (new_elements != list_size_) { + return Status::Invalid("Length of item not correct: expected ", list_size_, + " but got array of size ", new_elements); + } + if (new_length > maximum_elements()) { + return Status::CapacityError("array cannot contain more than ", maximum_elements(), + " elements, have ", new_elements); + } + return Status::OK(); +} + +Status FixedSizeListBuilder::AppendEmptyValue() { + RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(true); + return value_builder_->AppendEmptyValues(list_size_); +} + +Status FixedSizeListBuilder::AppendEmptyValues(int64_t length) { + RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(length, true); + return value_builder_->AppendEmptyValues(list_size_ * length); +} + Status FixedSizeListBuilder::Resize(int64_t capacity) { RETURN_NOT_OK(CheckCapacity(capacity)); return ArrayBuilder::Resize(capacity); @@ -207,15 +262,6 @@ void StructBuilder::Reset() { } } -Status StructBuilder::AppendNulls(int64_t length) { - for (const auto& field : children_) { - RETURN_NOT_OK(field->AppendNulls(length)); - } - ARROW_RETURN_NOT_OK(Reserve(length)); - UnsafeAppendToBitmap(length, false); - return Status::OK(); -} - Status StructBuilder::FinishInternal(std::shared_ptr* out) { std::shared_ptr null_bitmap; RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index cd6fadfcc2f..12b999b786e 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -100,7 +100,7 @@ class BaseListBuilder : public ArrayBuilder { Status AppendNulls(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); - ARROW_RETURN_NOT_OK(CheckNextOffset()); + ARROW_RETURN_NOT_OK(ValidateOverflow(0)); UnsafeAppendToBitmap(length, false); const int64_t num_values = value_builder_->length(); for (int64_t i = 0; i < length; ++i) { @@ -109,6 +109,19 @@ class BaseListBuilder : public 
ArrayBuilder { return Status::OK(); } + Status AppendEmptyValue() final { return Append(true); } + + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + ARROW_RETURN_NOT_OK(ValidateOverflow(0)); + UnsafeAppendToBitmap(length, true); + const int64_t num_values = value_builder_->length(); + for (int64_t i = 0; i < length; ++i) { + offsets_builder_.UnsafeAppend(static_cast(num_values)); + } + return Status::OK(); + } + Status FinishInternal(std::shared_ptr* out) override { ARROW_RETURN_NOT_OK(AppendNextOffset()); @@ -131,6 +144,16 @@ class BaseListBuilder : public ArrayBuilder { return Status::OK(); } + Status ValidateOverflow(int64_t new_elements) const { + auto new_length = value_builder_->length() + new_elements; + if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) { + return Status::CapacityError("List array cannot contain more than ", + maximum_elements(), " elements, have ", new_elements); + } else { + return Status::OK(); + } + } + ArrayBuilder* value_builder() const { return value_builder_.get(); } // Cannot make this a static attribute because of linking issues @@ -147,17 +170,8 @@ class BaseListBuilder : public ArrayBuilder { std::shared_ptr value_builder_; std::shared_ptr value_field_; - Status CheckNextOffset() const { - const int64_t num_values = value_builder_->length(); - ARROW_RETURN_IF( - num_values > maximum_elements(), - Status::CapacityError("List array cannot contain more than ", maximum_elements(), - " child elements,", " have ", num_values)); - return Status::OK(); - } - Status AppendNextOffset() { - ARROW_RETURN_NOT_OK(CheckNextOffset()); + ARROW_RETURN_NOT_OK(ValidateOverflow(0)); const int64_t num_values = value_builder_->length(); return offsets_builder_.Append(static_cast(num_values)); } @@ -227,6 +241,9 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { MapBuilder(MemoryPool* pool, const std::shared_ptr& key_builder, const std::shared_ptr& item_builder, bool keys_sorted = false); + MapBuilder(MemoryPool* pool, const std::shared_ptr& item_builder, + const std::shared_ptr& type); + Status Resize(int64_t capacity) override; void Reset() override; Status FinishInternal(std::shared_ptr* out) override; @@ -254,6 +271,10 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { Status AppendNulls(int64_t length) final; + Status AppendEmptyValue() final; + + Status AppendEmptyValues(int64_t length) final; + /// \brief Get builder to append keys. /// /// Append a key with this builder should be followed by appending @@ -276,6 +297,10 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { return map(key_builder_->type(), item_builder_->type(), keys_sorted_); } + Status ValidateOverflow(int64_t new_elements) { + return list_builder_->ValidateOverflow(new_elements); + } + protected: inline Status AdjustStructBuilderLength(); @@ -343,12 +368,23 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { /// automatically. 
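To make the null/empty distinction concrete for nested types, a small sketch (assumed usage): AppendNull produces a null slot, while AppendEmptyValue produces a valid, zero-length list.

#include <memory>

#include "arrow/array/builder_nested.h"
#include "arrow/array/builder_primitive.h"

arrow::Status NullVersusEmptyList() {
  auto pool = arrow::default_memory_pool();
  auto values = std::make_shared<arrow::Int32Builder>(pool);
  arrow::ListBuilder builder(pool, values);
  ARROW_RETURN_NOT_OK(builder.AppendNull());        // slot 0: null
  ARROW_RETURN_NOT_OK(builder.AppendEmptyValue());  // slot 1: [] (valid, empty)
  std::shared_ptr<arrow::Array> out;
  ARROW_RETURN_NOT_OK(builder.Finish(&out));        // -> [null, []]
  return arrow::Status::OK();
}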
Status AppendNulls(int64_t length) final; + Status ValidateOverflow(int64_t new_elements); + + Status AppendEmptyValue() final; + + Status AppendEmptyValues(int64_t length) final; + ArrayBuilder* value_builder() const { return value_builder_.get(); } std::shared_ptr type() const override { return fixed_size_list(value_field_->WithType(value_builder_->type()), list_size_); } + // Cannot make this a static attribute because of linking issues + static constexpr int64_t maximum_elements() { + return std::numeric_limits::max() - 1; + } + protected: std::shared_ptr value_field_; const int32_t list_size_; @@ -395,18 +431,41 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { return Status::OK(); } - /// \brief Append a null value. Automatically appends a null to each child + /// \brief Append a null value. Automatically appends an empty value to each child /// builder. Status AppendNull() final { for (const auto& field : children_) { - ARROW_RETURN_NOT_OK(field->AppendNull()); + ARROW_RETURN_NOT_OK(field->AppendEmptyValue()); } return Append(false); } - /// \brief Append multiple null values. Automatically appends nulls to each + /// \brief Append multiple null values. Automatically appends empty values to each /// child builder. - Status AppendNulls(int64_t length) final; + Status AppendNulls(int64_t length) final { + for (const auto& field : children_) { + ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length)); + } + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(length, false); + return Status::OK(); + } + + Status AppendEmptyValue() final { + for (const auto& field : children_) { + ARROW_RETURN_NOT_OK(field->AppendEmptyValue()); + } + return Append(true); + } + + Status AppendEmptyValues(int64_t length) final { + for (const auto& field : children_) { + ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length)); + } + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(length, true); + return Status::OK(); + } void Reset() override; diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index e6b1baa5879..e10f11fdd6c 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -31,6 +31,9 @@ namespace arrow { class ARROW_EXPORT NullBuilder : public ArrayBuilder { public: explicit NullBuilder(MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool) {} + explicit NullBuilder(const std::shared_ptr& type, + MemoryPool* pool = default_memory_pool()) + : NullBuilder(pool) {} /// \brief Append the specified number of null elements Status AppendNulls(int64_t length) final { @@ -43,6 +46,10 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { /// \brief Append a single null element Status AppendNull() final { return AppendNulls(1); } + Status AppendEmptyValues(int64_t length) final { return AppendNulls(length); } + + Status AppendEmptyValue() final { return AppendEmptyValues(1); } + Status Append(std::nullptr_t) { return AppendNull(); } Status FinishInternal(std::shared_ptr* out) override; @@ -97,6 +104,22 @@ class NumericBuilder : public ArrayBuilder { return Status::OK(); } + /// \brief Append an empty element + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(value_type{}); // zero + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + /// \brief Append several empty elements + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, value_type{}); // zero +
UnsafeSetNotNull(length); + return Status::OK(); + } + value_type GetValue(int64_t index) const { return data_builder_.data()[index]; } void Reset() override { data_builder_.Reset(); } @@ -294,6 +317,20 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { return Status::OK(); } + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(false); + UnsafeSetNotNull(1); + return Status::OK(); + } + + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, false); + UnsafeSetNotNull(length); + return Status::OK(); + } + /// Scalar append Status Append(const bool val) { ARROW_RETURN_NOT_OK(Reserve(1)); diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index 1ccc7ef159f..060be474fb8 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -117,6 +117,26 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { return child_builder->AppendNull(); } + Status AppendEmptyValue() final { + const int8_t first_child_code = type_codes_[0]; + ArrayBuilder* child_builder = type_id_to_children_[first_child_code]; + ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code)); + ARROW_RETURN_NOT_OK( + offsets_builder_.Append(static_cast(child_builder->length()))); + // Append an empty value arbitrarily to the first child + return child_builder->AppendEmptyValue(); + } + + Status AppendEmptyValues(int64_t length) final { + const int8_t first_child_code = type_codes_[0]; + ArrayBuilder* child_builder = type_id_to_children_[first_child_code]; + ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code)); + ARROW_RETURN_NOT_OK( + offsets_builder_.Append(length, static_cast(child_builder->length()))); + // Append just a single empty value to the first child + return child_builder->AppendEmptyValue(); + } + /// \brief Append an element to the UnionArray. This must be followed /// by an append to the appropriate child builder. /// @@ -159,23 +179,45 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { const std::shared_ptr& type) : BasicUnionBuilder(pool, children, type) {} - /// \brief Append a null value. A null is added automatically to all the - /// children but the type id in the slot will be 0 + /// \brief Append a null value. + /// + /// A null is appended to the first child, empty values to the other children. Status AppendNull() final { + const auto first_child_code = type_codes_[0]; + ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code)); + ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNull()); + for (int i = 1; i < static_cast(type_codes_.size()); ++i) { + ARROW_RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendEmptyValue()); + } + return Status::OK(); + } + + /// \brief Append multiple null values. + /// + /// Nulls are appended to the first child, empty values to the other children. 
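For example (a hedged sketch mirroring the NullsAndEmptyValues tests earlier in this diff): after a single AppendNull on a two-child sparse union, every child holds one physical slot, but only the first child's slot is actually null.

#include <memory>
#include <vector>

#include "arrow/array/builder_binary.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/array/builder_union.h"
#include "arrow/type.h"

arrow::Status SparseUnionNull() {
  auto pool = arrow::default_memory_pool();
  std::vector<std::shared_ptr<arrow::ArrayBuilder>> children = {
      std::make_shared<arrow::Int8Builder>(pool),
      std::make_shared<arrow::StringBuilder>(pool)};
  auto type = arrow::sparse_union(
      {arrow::field("i8", arrow::int8()), arrow::field("str", arrow::utf8())},
      /*type_codes=*/{0, 1});
  arrow::SparseUnionBuilder builder(pool, children, type);
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  // Children now hold: i8 = [null], str = [""]; type_ids = [0].
  return arrow::Status::OK();
}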
+ Status AppendNulls(int64_t length) final { + const auto first_child_code = type_codes_[0]; + ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code)); + ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNulls(length)); + for (int i = 1; i < static_cast(type_codes_.size()); ++i) { + ARROW_RETURN_NOT_OK( + type_id_to_children_[type_codes_[i]]->AppendEmptyValues(length)); + } + return Status::OK(); + } + + Status AppendEmptyValue() final { ARROW_RETURN_NOT_OK(types_builder_.Append(type_codes_[0])); for (int8_t code : type_codes_) { - ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendNull()); + ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValue()); } return Status::OK(); } - /// \brief Append multiple null values. Nulls will be automatically appended - /// to all the children but the type ids will be all 0. - Status AppendNulls(int64_t length) final { + Status AppendEmptyValues(int64_t length) final { ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0])); - // Append nulls to children for (int8_t code : type_codes_) { - ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendNulls(length)); + ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValues(length)); } return Status::OK(); } @@ -186,7 +228,7 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { /// \param[in] next_type type_id of the child to which the next value will be appended. /// /// The corresponding child builder must be appended to independently after this method - /// is called, and all other child builders must have null appended + /// is called, and all other child builders must have null or empty value appended. Status Append(int8_t next_type) { return types_builder_.Append(next_type); } }; diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 6989d7a1a37..30eeeee2a2d 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -70,7 +70,9 @@ static Status ConcatenateBitmaps(const std::vector& bitmaps, MemoryPool* std::shared_ptr* out) { int64_t out_length = 0; for (const auto& bitmap : bitmaps) { - out_length += bitmap.range.length; + if (internal::AddWithOverflow(out_length, bitmap.range.length, &out_length)) { + return Status::Invalid("Length overflow when concatenating arrays"); + } } ARROW_ASSIGN_OR_RAISE(*out, AllocateBitmap(out_length, pool)); uint8_t* dst = (*out)->mutable_data(); @@ -86,10 +88,6 @@ static Status ConcatenateBitmaps(const std::vector& bitmaps, MemoryPool* bitmap_offset += bitmap.range.length; } - // finally (if applicable) zero out any trailing bits - if (auto preceding_bits = BitUtil::kPrecedingBitmask[out_length % 8]) { - dst[out_length / 8] &= preceding_bits; - } return Status::OK(); } @@ -203,7 +201,7 @@ class ConcatenateImpl { } Status Visit(const FixedWidthType& fixed) { - // Handles numbers, decimal128, fixed_size_binary + // Handles numbers, decimal128, decimal256, fixed_size_binary ARROW_ASSIGN_OR_RAISE(auto buffers, Buffers(1, fixed)); return ConcatenateBuffers(buffers, pool_).Value(&out_->buffers[1]); } diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index fb35f43ec6d..f85a730815b 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -31,9 +31,9 @@ #include #include "arrow/array.h" +#include "arrow/array/builder_binary.h" #include "arrow/array/concatenate.h" #include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/status.h" #include 
"arrow/testing/gtest_common.h" #include "arrow/testing/random.h" diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 7bfb39532dd..9c5e630bb2b 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -100,6 +100,8 @@ std::shared_ptr ArrayData::Slice(int64_t off, int64_t len) const { copy->offset = off; if (null_count == length) { copy->null_count = len; + } else if (off == offset && len == length) { // A copy of current. + copy->null_count = null_count.load(); } else { copy->null_count = null_count != 0 ? kUnknownNullCount : 0; } diff --git a/cpp/src/arrow/array/dict_internal.h b/cpp/src/arrow/array/dict_internal.h index 5bf584c8216..aa027ac22de 100644 --- a/cpp/src/arrow/array/dict_internal.h +++ b/cpp/src/arrow/array/dict_internal.h @@ -189,16 +189,5 @@ struct DictionaryTraits> { } }; -template -struct DictionaryCTraits { - using ArrowType = typename CTypeTraits::ArrowType; - using MemoTableType = typename DictionaryTraits::MemoTableType; -}; - -template <> -struct DictionaryCTraits { - using MemoTableType = DictionaryTraits::MemoTableType; -}; - } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/array/diff_test.cc b/cpp/src/arrow/array/diff_test.cc index bfe46d4762c..b80ed2fd955 100644 --- a/cpp/src/arrow/array/diff_test.cc +++ b/cpp/src/arrow/array/diff_test.cc @@ -16,10 +16,8 @@ // under the License. #include -#include #include #include -#include #include #include #include @@ -31,8 +29,6 @@ #include "arrow/array.h" #include "arrow/array/diff.h" -#include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/compute/api.h" #include "arrow/status.h" #include "arrow/testing/gtest_common.h" diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 3063f5580cd..5bc0bf31d07 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -64,6 +64,13 @@ struct ValidateArrayVisitor { return Status::OK(); } + Status Visit(const Decimal256Array& array) { + if (array.length() > 0 && array.values() == nullptr) { + return Status::Invalid("values is null"); + } + return Status::OK(); + } + Status Visit(const StringArray& array) { return ValidateBinaryArray(array); } Status Visit(const BinaryArray& array) { return ValidateBinaryArray(array); } diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index 6aac70eebfd..9215d9ab544 100644 --- a/cpp/src/arrow/buffer.cc +++ b/cpp/src/arrow/buffer.cc @@ -206,7 +206,7 @@ class PoolBuffer : public ResizableBuffer { } Status Resize(const int64_t new_size, bool shrink_to_fit = true) override { - if (new_size < 0) { + if (ARROW_PREDICT_FALSE(new_size < 0)) { return Status::Invalid("Negative buffer resize: ", new_size); } if (mutable_data_ && shrink_to_fit && new_size <= size_) { @@ -277,13 +277,18 @@ Result> AllocateResizableBuffer(const int64_t s } Result> AllocateBitmap(int64_t length, MemoryPool* pool) { - return AllocateBuffer(BitUtil::BytesForBits(length), pool); + ARROW_ASSIGN_OR_RAISE(auto buf, AllocateBuffer(BitUtil::BytesForBits(length), pool)); + // Zero out any trailing bits + if (buf->size() > 0) { + buf->mutable_data()[buf->size() - 1] = 0; + } + return std::move(buf); } Result> AllocateEmptyBitmap(int64_t length, MemoryPool* pool) { - ARROW_ASSIGN_OR_RAISE(auto buf, AllocateBitmap(length, pool)); + ARROW_ASSIGN_OR_RAISE(auto buf, AllocateBuffer(BitUtil::BytesForBits(length), pool)); memset(buf->mutable_data(), 0, static_cast(buf->size())); - return buf; + return std::move(buf); } Status AllocateEmptyBitmap(int64_t 
length, std::shared_ptr* out) { diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 6be07d6ca75..f22228a4588 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -40,9 +40,18 @@ struct DictionaryBuilderCase { return CreateFor(); } + Status Visit(const NullType&) { return CreateFor(); } Status Visit(const BinaryType&) { return Create(); } Status Visit(const StringType&) { return Create(); } + Status Visit(const LargeBinaryType&) { + return Create>(); + } + Status Visit(const LargeStringType&) { + return Create>(); + } Status Visit(const FixedSizeBinaryType&) { return CreateFor(); } + Status Visit(const Decimal128Type&) { return CreateFor(); } + Status Visit(const Decimal256Type&) { return CreateFor(); } Status Visit(const DataType& value_type) { return NotImplemented(value_type); } Status Visit(const HalfFloatType& value_type) { return NotImplemented(value_type); } @@ -59,17 +68,21 @@ struct DictionaryBuilderCase { template Status Create() { + BuilderType* builder; if (dictionary != nullptr) { - out->reset(new BuilderType(dictionary, pool)); + builder = new BuilderType(dictionary, pool); } else { - out->reset(new BuilderType(value_type, pool)); + auto start_int_size = internal::GetByteWidth(*index_type); + builder = new BuilderType(start_int_size, value_type, pool); } + out->reset(builder); return Status::OK(); } Status Make() { return VisitTypeInline(*value_type, this); } MemoryPool* pool; + const std::shared_ptr& index_type; const std::shared_ptr& value_type; const std::shared_ptr& dictionary; std::unique_ptr* out; @@ -126,10 +139,12 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, BUILDER_CASE(LargeBinary); BUILDER_CASE(FixedSizeBinary); BUILDER_CASE(Decimal128); + BUILDER_CASE(Decimal256); case Type::DICTIONARY: { const auto& dict_type = static_cast(*type); - DictionaryBuilderCase visitor = {pool, dict_type.value_type(), nullptr, out}; + DictionaryBuilderCase visitor = {pool, dict_type.index_type(), + dict_type.value_type(), nullptr, out}; return visitor.Make(); } @@ -199,7 +214,8 @@ Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr& const std::shared_ptr& dictionary, std::unique_ptr* out) { const auto& dict_type = static_cast(*type); - DictionaryBuilderCase visitor = {pool, dict_type.value_type(), dictionary, out}; + DictionaryBuilderCase visitor = {pool, dict_type.index_type(), dict_type.value_type(), + dictionary, out}; return visitor.Make(); } diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 3202312c47e..4b80e558004 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -30,30 +30,3 @@ #include "arrow/array/builder_union.h" // IWYU pragma: keep #include "arrow/status.h" #include "arrow/util/visibility.h" - -namespace arrow { - -class DataType; -class MemoryPool; - -/// \brief Construct an empty ArrayBuilder corresponding to the data -/// type -/// \param[in] pool the MemoryPool to use for allocations -/// \param[in] type an instance of DictionaryType -/// \param[out] out the created ArrayBuilder -ARROW_EXPORT -Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, - std::unique_ptr* out); - -/// \brief Construct an empty DictionaryBuilder initialized optionally -/// with a pre-existing dictionary -/// \param[in] pool the MemoryPool to use for allocations -/// \param[in] type an instance of DictionaryType -/// \param[in] dictionary the initial dictionary, if any. 
May be nullptr -/// \param[out] out the created ArrayBuilder -ARROW_EXPORT -Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr& type, - const std::shared_ptr& dictionary, - std::unique_ptr* out); - -} // namespace arrow diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h index 821bc961281..a78170dbdbc 100644 --- a/cpp/src/arrow/c/abi.h +++ b/cpp/src/arrow/c/abi.h @@ -60,6 +60,44 @@ struct ArrowArray { void* private_data; }; +// EXPERIMENTAL: C stream interface + +struct ArrowArrayStream { + // Callback to get the stream type + // (will be the same for all arrays in the stream). + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowSchema must be released independently from the stream. + int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); + + // Callback to get the next array + // (if no error and the array is released, the stream has ended) + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowArray must be released independently from the stream. + int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); + + // Callback to get optional detailed error information. + // This must only be called if the last stream operation failed + // with a non-0 return code. + // + // Return value: pointer to a null-terminated character array describing + // the last error, or NULL if no description is available. + // + // The returned pointer is only valid until the next operation on this stream + // (including release). + const char* (*get_last_error)(struct ArrowArrayStream*); + + // Release callback: release the stream's own resources. + // Note that arrays returned by `get_next` must be individually released. + void (*release)(struct ArrowArrayStream*); + + // Opaque producer-specific data + void* private_data; +}; + #ifdef __cplusplus } #endif diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 1e602a6a310..5cb3e577235 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -18,6 +18,7 @@ #include "arrow/c/bridge.h" #include +#include #include #include #include @@ -303,9 +304,16 @@ struct SchemaExporter { return SetFormat("w:" + std::to_string(type.byte_width())); } - Status Visit(const Decimal128Type& type) { - return SetFormat("d:" + std::to_string(type.precision()) + "," + - std::to_string(type.scale())); + Status Visit(const DecimalType& type) { + if (type.bit_width() == 128) { + // 128 is the default bit-width + return SetFormat("d:" + std::to_string(type.precision()) + "," + + std::to_string(type.scale())); + } else { + return SetFormat("d:" + std::to_string(type.precision()) + "," + + std::to_string(type.scale()) + "," + + std::to_string(type.bit_width())); + } } Status Visit(const BinaryType& type) { return SetFormat("z"); } @@ -972,13 +980,20 @@ struct SchemaImporter { Status ProcessDecimal() { RETURN_NOT_OK(f_parser_.CheckNext(':')); ARROW_ASSIGN_OR_RAISE(auto prec_scale, f_parser_.ParseInts(f_parser_.Rest())); - if (prec_scale.size() != 2) { + // 3 elements indicate that the bit width was communicated as well.
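Spelled out, the format-string mapping implemented below is (a sketch, not part of the patch):

// "d:19,10"      -> decimal(19, 10)     (128-bit default)
// "d:19,10,128"  -> decimal(19, 10)
// "d:19,10,256"  -> decimal256(19, 10)
// any other bit width is rejected via f_parser_.Invalid().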
+ if (prec_scale.size() != 2 && prec_scale.size() != 3) { return f_parser_.Invalid(); } if (prec_scale[0] <= 0 || prec_scale[1] <= 0) { return f_parser_.Invalid(); } - type_ = decimal(prec_scale[0], prec_scale[1]); + if (prec_scale.size() == 2 || prec_scale[2] == 128) { + type_ = decimal(prec_scale[0], prec_scale[1]); + } else if (prec_scale[2] == 256) { + type_ = decimal256(prec_scale[0], prec_scale[1]); + } else { + return f_parser_.Invalid(); + } return Status::OK(); } @@ -1501,4 +1516,197 @@ Result> ImportRecordBatch(struct ArrowArray* array, return ImportRecordBatch(array, *maybe_schema); } +////////////////////////////////////////////////////////////////////////// +// C stream export + +namespace { + +class ExportedArrayStream { + public: + struct PrivateData { + explicit PrivateData(std::shared_ptr reader) + : reader_(std::move(reader)) {} + + std::shared_ptr reader_; + std::string last_error_; + + PrivateData() = default; + ARROW_DISALLOW_COPY_AND_ASSIGN(PrivateData); + }; + + explicit ExportedArrayStream(struct ArrowArrayStream* stream) : stream_(stream) {} + + Status GetSchema(struct ArrowSchema* out_schema) { + return ExportSchema(*reader()->schema(), out_schema); + } + + Status GetNext(struct ArrowArray* out_array) { + std::shared_ptr batch; + RETURN_NOT_OK(reader()->ReadNext(&batch)); + if (batch == nullptr) { + // End of stream + ArrowArrayMarkReleased(out_array); + return Status::OK(); + } else { + return ExportRecordBatch(*batch, out_array); + } + } + + const char* GetLastError() { + const auto& last_error = private_data()->last_error_; + return last_error.empty() ? nullptr : last_error.c_str(); + } + + void Release() { + if (ArrowArrayStreamIsReleased(stream_)) { + return; + } + DCHECK_NE(private_data(), nullptr); + delete private_data(); + + ArrowArrayStreamMarkReleased(stream_); + } + + // C-compatible callbacks + + static int StaticGetSchema(struct ArrowArrayStream* stream, + struct ArrowSchema* out_schema) { + ExportedArrayStream self{stream}; + return self.ToCError(self.GetSchema(out_schema)); + } + + static int StaticGetNext(struct ArrowArrayStream* stream, + struct ArrowArray* out_array) { + ExportedArrayStream self{stream}; + return self.ToCError(self.GetNext(out_array)); + } + + static void StaticRelease(struct ArrowArrayStream* stream) { + ExportedArrayStream{stream}.Release(); + } + + static const char* StaticGetLastError(struct ArrowArrayStream* stream) { + return ExportedArrayStream{stream}.GetLastError(); + } + + private: + int ToCError(const Status& status) { + if (ARROW_PREDICT_TRUE(status.ok())) { + private_data()->last_error_.clear(); + return 0; + } + private_data()->last_error_ = status.ToString(); + switch (status.code()) { + case StatusCode::IOError: + return EIO; + case StatusCode::NotImplemented: + return ENOSYS; + case StatusCode::OutOfMemory: + return ENOMEM; + default: + return EINVAL; // Fallback for Invalid, TypeError, etc. 
+ } + } + + PrivateData* private_data() { + return reinterpret_cast(stream_->private_data); + } + + const std::shared_ptr& reader() { return private_data()->reader_; } + + struct ArrowArrayStream* stream_; +}; + +} // namespace + +Status ExportRecordBatchReader(std::shared_ptr reader, + struct ArrowArrayStream* out) { + out->get_schema = ExportedArrayStream::StaticGetSchema; + out->get_next = ExportedArrayStream::StaticGetNext; + out->get_last_error = ExportedArrayStream::StaticGetLastError; + out->release = ExportedArrayStream::StaticRelease; + out->private_data = new ExportedArrayStream::PrivateData{std::move(reader)}; + return Status::OK(); +} + +////////////////////////////////////////////////////////////////////////// +// C stream import + +namespace { + +class ArrayStreamBatchReader : public RecordBatchReader { + public: + explicit ArrayStreamBatchReader(struct ArrowArrayStream* stream) { + ArrowArrayStreamMove(stream, &stream_); + DCHECK(!ArrowArrayStreamIsReleased(&stream_)); + } + + ~ArrayStreamBatchReader() { + ArrowArrayStreamRelease(&stream_); + DCHECK(ArrowArrayStreamIsReleased(&stream_)); + } + + std::shared_ptr schema() const override { return CacheSchema(); } + + Status ReadNext(std::shared_ptr* batch) override { + struct ArrowArray c_array; + RETURN_NOT_OK(StatusFromCError(stream_.get_next(&stream_, &c_array))); + if (ArrowArrayIsReleased(&c_array)) { + // End of stream + batch->reset(); + return Status::OK(); + } else { + return ImportRecordBatch(&c_array, CacheSchema()).Value(batch); + } + } + + private: + std::shared_ptr CacheSchema() const { + if (!schema_) { + struct ArrowSchema c_schema; + ARROW_CHECK_OK(StatusFromCError(stream_.get_schema(&stream_, &c_schema))); + schema_ = ImportSchema(&c_schema).ValueOrDie(); + } + return schema_; + } + + Status StatusFromCError(int errno_like) const { + if (ARROW_PREDICT_TRUE(errno_like == 0)) { + return Status::OK(); + } + StatusCode code; + switch (errno_like) { + case EDOM: + case EINVAL: + case ERANGE: + code = StatusCode::Invalid; + break; + case ENOMEM: + code = StatusCode::OutOfMemory; + break; + case ENOSYS: + code = StatusCode::NotImplemented; + default: + code = StatusCode::IOError; + break; + } + const char* last_error = stream_.get_last_error(&stream_); + return Status(code, last_error ? std::string(last_error) : ""); + } + + mutable struct ArrowArrayStream stream_; + mutable std::shared_ptr schema_; +}; + +} // namespace + +Result> ImportRecordBatchReader( + struct ArrowArrayStream* stream) { + if (ArrowArrayStreamIsReleased(stream)) { + return Status::Invalid("Cannot import released ArrowArrayStream"); + } + // XXX should we call get_schema() here to avoid crashing on error? + return std::make_shared(stream); +} + } // namespace arrow diff --git a/cpp/src/arrow/c/bridge.h b/cpp/src/arrow/c/bridge.h index 8efb5d98bed..294f53e49fb 100644 --- a/cpp/src/arrow/c/bridge.h +++ b/cpp/src/arrow/c/bridge.h @@ -29,6 +29,10 @@ namespace arrow { +/// \defgroup c-data-interface Functions for working with the C data interface. +/// +/// @{ + /// \brief Export C++ DataType using the C data interface format. /// /// The root type is considered to have empty name and metadata. @@ -160,4 +164,34 @@ ARROW_EXPORT Result> ImportRecordBatch(struct ArrowArray* array, struct ArrowSchema* schema); +/// @} + +/// \defgroup c-stream-interface Functions for working with the C data interface. +/// +/// @{ + +/// \brief EXPERIMENTAL: Export C++ RecordBatchReader using the C stream interface. 
+/// +/// The resulting ArrowArrayStream struct keeps the record batch reader alive +/// until its release callback is called by the consumer. +/// +/// \param[in] reader RecordBatchReader object to export +/// \param[out] out C struct where to export the stream +ARROW_EXPORT +Status ExportRecordBatchReader(std::shared_ptr reader, + struct ArrowArrayStream* out); + +/// \brief EXPERIMENTAL: Import C++ RecordBatchReader from the C stream interface. +/// +/// The ArrowArrayStream struct has its contents moved to a private object +/// held alive by the resulting record batch reader. +/// +/// \param[in,out] stream C stream interface struct +/// \return Imported RecordBatchReader object +ARROW_EXPORT +Result> ImportRecordBatchReader( + struct ArrowArrayStream* stream); + +/// @} + } // namespace arrow diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 6695d6ed5db..fc11f126e72 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +#include #include #include #include @@ -22,6 +23,7 @@ #include #include +#include #include #include "arrow/c/bridge.h" @@ -40,6 +42,8 @@ namespace arrow { using internal::ArrayExportGuard; using internal::ArrayExportTraits; +using internal::ArrayStreamExportGuard; +using internal::ArrayStreamExportTraits; using internal::SchemaExportGuard; using internal::SchemaExportTraits; @@ -78,11 +82,11 @@ class ReleaseCallback { explicit ReleaseCallback(CType* c_struct) : called_(false) { orig_release_ = c_struct->release; orig_private_data_ = c_struct->private_data; - c_struct->release = ReleaseUnbound; + c_struct->release = StaticRelease; c_struct->private_data = this; } - static void ReleaseUnbound(CType* c_struct) { + static void StaticRelease(CType* c_struct) { reinterpret_cast(c_struct->private_data)->Release(c_struct); } @@ -277,6 +281,7 @@ TEST_F(TestSchemaExport, Primitive) { TestPrimitive(large_utf8(), "U"); TestPrimitive(decimal(16, 4), "d:16,4"); + TestPrimitive(decimal256(16, 4), "d:16,4,256"); } TEST_F(TestSchemaExport, Temporal) { @@ -736,6 +741,7 @@ TEST_F(TestArrayExport, Primitive) { TestPrimitive(large_utf8(), R"(["foo", "bar", null])"); TestPrimitive(decimal(16, 4), R"(["1234.5670", null])"); + TestPrimitive(decimal256(16, 4), R"(["1234.5670", null])"); } TEST_F(TestArrayExport, PrimitiveSliced) { @@ -1182,6 +1188,13 @@ TEST_F(TestSchemaImport, Primitive) { CheckImport(field("", float32())); FillPrimitive("g"); CheckImport(field("", float64())); + + FillPrimitive("d:16,4"); + CheckImport(field("", decimal128(16, 4))); + FillPrimitive("d:16,4,128"); + CheckImport(field("", decimal128(16, 4))); + FillPrimitive("d:16,4,256"); + CheckImport(field("", decimal256(16, 4))); } TEST_F(TestSchemaImport, Temporal) { @@ -2369,6 +2382,8 @@ TEST_F(TestSchemaRoundtrip, Primitive) { TestWithTypeFactory(float16); TestWithTypeFactory(std::bind(decimal, 19, 4)); + TestWithTypeFactory(std::bind(decimal128, 19, 4)); + TestWithTypeFactory(std::bind(decimal256, 19, 4)); TestWithTypeFactory(std::bind(fixed_size_binary, 3)); TestWithTypeFactory(binary); TestWithTypeFactory(large_utf8); @@ -2426,7 +2441,7 @@ TEST_F(TestSchemaRoundtrip, Map) { TEST_F(TestSchemaRoundtrip, Schema) { auto f1 = field("f1", utf8(), /*nullable=*/false); - auto f2 = field("f2", list(decimal(19, 4))); + auto f2 = field("f2", list(decimal256(19, 4))); auto md1 = key_value_metadata(kMetadataKeys1, kMetadataValues1); auto md2 = 
key_value_metadata(kMetadataKeys2, kMetadataValues2); @@ -2570,8 +2585,13 @@ TEST_F(TestArrayRoundtrip, Primitive) { TestWithJSON(int32(), "[]"); TestWithJSON(int32(), "[4, 5, null]"); + TestWithJSON(decimal128(16, 4), R"(["0.4759", "1234.5670", null])"); + TestWithJSON(decimal256(16, 4), R"(["0.4759", "1234.5670", null])"); + TestWithJSONSliced(int32(), "[4, 5]"); TestWithJSONSliced(int32(), "[4, 5, 6, null]"); + TestWithJSONSliced(decimal128(16, 4), R"(["0.4759", "1234.5670", null])"); + TestWithJSONSliced(decimal256(16, 4), R"(["0.4759", "1234.5670", null])"); } TEST_F(TestArrayRoundtrip, UnknownNullCount) { @@ -2678,4 +2698,248 @@ TEST_F(TestArrayRoundtrip, RecordBatch) { // TODO C -> C++ -> C roundtripping tests? +//////////////////////////////////////////////////////////////////////////// +// Array stream export tests + +class FailingRecordBatchReader : public RecordBatchReader { + public: + explicit FailingRecordBatchReader(Status error) : error_(std::move(error)) {} + + static std::shared_ptr expected_schema() { return arrow::schema({}); } + + std::shared_ptr schema() const override { return expected_schema(); } + + Status ReadNext(std::shared_ptr* batch) override { return error_; } + + protected: + Status error_; +}; + +class BaseArrayStreamTest : public ::testing::Test { + public: + void SetUp() override { + pool_ = default_memory_pool(); + orig_allocated_ = pool_->bytes_allocated(); + } + + void TearDown() override { ASSERT_EQ(pool_->bytes_allocated(), orig_allocated_); } + + RecordBatchVector MakeBatches(std::shared_ptr schema, ArrayVector arrays) { + DCHECK_EQ(schema->num_fields(), 1); + RecordBatchVector batches; + for (const auto& array : arrays) { + batches.push_back(RecordBatch::Make(schema, array->length(), {array})); + } + return batches; + } + + protected: + MemoryPool* pool_; + int64_t orig_allocated_; +}; + +class TestArrayStreamExport : public BaseArrayStreamTest { + public: + void AssertStreamSchema(struct ArrowArrayStream* c_stream, const Schema& expected) { + struct ArrowSchema c_schema; + ASSERT_EQ(0, c_stream->get_schema(c_stream, &c_schema)); + + SchemaExportGuard schema_guard(&c_schema); + ASSERT_FALSE(ArrowSchemaIsReleased(&c_schema)); + ASSERT_OK_AND_ASSIGN(auto schema, ImportSchema(&c_schema)); + AssertSchemaEqual(expected, *schema); + } + + void AssertStreamEnd(struct ArrowArrayStream* c_stream) { + struct ArrowArray c_array; + ASSERT_EQ(0, c_stream->get_next(c_stream, &c_array)); + + ArrayExportGuard guard(&c_array); + ASSERT_TRUE(ArrowArrayIsReleased(&c_array)); + } + + void AssertStreamNext(struct ArrowArrayStream* c_stream, const RecordBatch& expected) { + struct ArrowArray c_array; + ASSERT_EQ(0, c_stream->get_next(c_stream, &c_array)); + + ArrayExportGuard guard(&c_array); + ASSERT_FALSE(ArrowArrayIsReleased(&c_array)); + + ASSERT_OK_AND_ASSIGN(auto batch, ImportRecordBatch(&c_array, expected.schema())); + AssertBatchesEqual(expected, *batch); + } +}; + +TEST_F(TestArrayStreamExport, Empty) { + auto schema = arrow::schema({field("ints", int32())}); + auto batches = MakeBatches(schema, {}); + ASSERT_OK_AND_ASSIGN(auto reader, RecordBatchReader::Make(batches, schema)); + + struct ArrowArrayStream c_stream; + + ASSERT_OK(ExportRecordBatchReader(reader, &c_stream)); + ArrayStreamExportGuard guard(&c_stream); + + ASSERT_FALSE(ArrowArrayStreamIsReleased(&c_stream)); + AssertStreamSchema(&c_stream, *schema); + AssertStreamEnd(&c_stream); + AssertStreamEnd(&c_stream); +} + +TEST_F(TestArrayStreamExport, Simple) { + auto schema = arrow::schema({field("ints", 
int32())}); + auto batches = MakeBatches( + schema, {ArrayFromJSON(int32(), "[1, 2]"), ArrayFromJSON(int32(), "[4, 5, null]")}); + ASSERT_OK_AND_ASSIGN(auto reader, RecordBatchReader::Make(batches, schema)); + + struct ArrowArrayStream c_stream; + + ASSERT_OK(ExportRecordBatchReader(reader, &c_stream)); + ArrayStreamExportGuard guard(&c_stream); + + ASSERT_FALSE(ArrowArrayStreamIsReleased(&c_stream)); + AssertStreamSchema(&c_stream, *schema); + AssertStreamNext(&c_stream, *batches[0]); + AssertStreamNext(&c_stream, *batches[1]); + AssertStreamEnd(&c_stream); + AssertStreamEnd(&c_stream); +} + +TEST_F(TestArrayStreamExport, ArrayLifetime) { + auto schema = arrow::schema({field("ints", int32())}); + auto batches = MakeBatches( + schema, {ArrayFromJSON(int32(), "[1, 2]"), ArrayFromJSON(int32(), "[4, 5, null]")}); + ASSERT_OK_AND_ASSIGN(auto reader, RecordBatchReader::Make(batches, schema)); + + struct ArrowArrayStream c_stream; + struct ArrowSchema c_schema; + struct ArrowArray c_array0, c_array1; + + ASSERT_OK(ExportRecordBatchReader(reader, &c_stream)); + { + ArrayStreamExportGuard guard(&c_stream); + ASSERT_FALSE(ArrowArrayStreamIsReleased(&c_stream)); + + ASSERT_EQ(0, c_stream.get_schema(&c_stream, &c_schema)); + ASSERT_EQ(0, c_stream.get_next(&c_stream, &c_array0)); + ASSERT_EQ(0, c_stream.get_next(&c_stream, &c_array1)); + AssertStreamEnd(&c_stream); + } + + ArrayExportGuard guard0(&c_array0), guard1(&c_array1); + + { + SchemaExportGuard schema_guard(&c_schema); + ASSERT_OK_AND_ASSIGN(auto got_schema, ImportSchema(&c_schema)); + AssertSchemaEqual(*schema, *got_schema); + } + + ASSERT_GT(pool_->bytes_allocated(), orig_allocated_); + ASSERT_OK_AND_ASSIGN(auto batch, ImportRecordBatch(&c_array1, schema)); + AssertBatchesEqual(*batches[1], *batch); + ASSERT_OK_AND_ASSIGN(batch, ImportRecordBatch(&c_array0, schema)); + AssertBatchesEqual(*batches[0], *batch); +} + +TEST_F(TestArrayStreamExport, Errors) { + auto reader = + std::make_shared(Status::Invalid("some example error")); + + struct ArrowArrayStream c_stream; + + ASSERT_OK(ExportRecordBatchReader(reader, &c_stream)); + ArrayStreamExportGuard guard(&c_stream); + + struct ArrowSchema c_schema; + ASSERT_EQ(0, c_stream.get_schema(&c_stream, &c_schema)); + ASSERT_FALSE(ArrowSchemaIsReleased(&c_schema)); + { + SchemaExportGuard schema_guard(&c_schema); + ASSERT_OK_AND_ASSIGN(auto schema, ImportSchema(&c_schema)); + AssertSchemaEqual(schema, arrow::schema({})); + } + + struct ArrowArray c_array; + ASSERT_EQ(EINVAL, c_stream.get_next(&c_stream, &c_array)); +} + +//////////////////////////////////////////////////////////////////////////// +// Array stream roundtrip tests + +class TestArrayStreamRoundtrip : public BaseArrayStreamTest { + public: + void Roundtrip(std::shared_ptr* reader, + struct ArrowArrayStream* c_stream) { + ASSERT_OK(ExportRecordBatchReader(*reader, c_stream)); + ASSERT_FALSE(ArrowArrayStreamIsReleased(c_stream)); + + ASSERT_OK_AND_ASSIGN(auto got_reader, ImportRecordBatchReader(c_stream)); + *reader = std::move(got_reader); + } + + void Roundtrip( + std::shared_ptr reader, + std::function&)> check_func) { + ArrowArrayStream c_stream; + + // NOTE: ReleaseCallback<> is not immediately usable with ArrowArrayStream, + // because get_next and get_schema need the original private_data. 
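(As an aside, a consumer written against the raw C ABI, rather than through
ImportRecordBatchReader, drives the three stream callbacks directly. A minimal
sketch, assuming `stream` was filled in by ExportRecordBatchReader and that
batch processing happens where indicated:)

static void ConsumeStream(struct ArrowArrayStream* stream) {
  struct ArrowSchema schema;
  if (stream->get_schema(stream, &schema) == 0) {
    schema.release(&schema);  // the schema is released independently of the stream
    for (;;) {
      struct ArrowArray array;
      if (stream->get_next(stream, &array) != 0) {
        // errno-compatible failure; get_last_error() may describe it
        break;
      }
      if (array.release == NULL) break;  // a released array marks end-of-stream
      /* ... import or process the batch here ... */
      array.release(&array);  // arrays are released individually
    }
  }
  stream->release(stream);
}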
+ std::weak_ptr weak_reader(reader); + ASSERT_EQ(weak_reader.use_count(), 1); // Expiration check will fail otherwise + + ASSERT_OK(ExportRecordBatchReader(std::move(reader), &c_stream)); + ASSERT_FALSE(ArrowArrayStreamIsReleased(&c_stream)); + + { + ASSERT_OK_AND_ASSIGN(auto new_reader, ImportRecordBatchReader(&c_stream)); + // Stream was moved + ASSERT_TRUE(ArrowArrayStreamIsReleased(&c_stream)); + ASSERT_FALSE(weak_reader.expired()); + + check_func(new_reader); + } + // Stream was released when `new_reader` was destroyed + ASSERT_TRUE(weak_reader.expired()); + } + + void AssertReaderNext(const std::shared_ptr& reader, + const RecordBatch& expected) { + ASSERT_OK_AND_ASSIGN(auto batch, reader->Next()); + ASSERT_NE(batch, nullptr); + AssertBatchesEqual(expected, *batch); + } + + void AssertReaderEnd(const std::shared_ptr& reader) { + ASSERT_OK_AND_ASSIGN(auto batch, reader->Next()); + ASSERT_EQ(batch, nullptr); + } +}; + +TEST_F(TestArrayStreamRoundtrip, Simple) { + auto orig_schema = arrow::schema({field("ints", int32())}); + auto batches = MakeBatches(orig_schema, {ArrayFromJSON(int32(), "[1, 2]"), + ArrayFromJSON(int32(), "[4, 5, null]")}); + + ASSERT_OK_AND_ASSIGN(auto reader, RecordBatchReader::Make(batches, orig_schema)); + + Roundtrip(std::move(reader), [&](const std::shared_ptr& reader) { + AssertSchemaEqual(*orig_schema, *reader->schema()); + AssertReaderNext(reader, *batches[0]); + AssertReaderNext(reader, *batches[1]); + AssertReaderEnd(reader); + AssertReaderEnd(reader); + }); +} + +TEST_F(TestArrayStreamRoundtrip, Errors) { + auto reader = std::make_shared( + Status::Invalid("roundtrip error example")); + + Roundtrip(std::move(reader), [&](const std::shared_ptr& reader) { + auto status = reader->Next().status(); + ASSERT_RAISES(Invalid, status); + ASSERT_THAT(status.message(), ::testing::HasSubstr("roundtrip error example")); + }); +} + } // namespace arrow diff --git a/cpp/src/arrow/c/helpers.h b/cpp/src/arrow/c/helpers.h index a1a1240dd75..a5c1f6fe4ba 100644 --- a/cpp/src/arrow/c/helpers.h +++ b/cpp/src/arrow/c/helpers.h @@ -82,6 +82,36 @@ inline void ArrowArrayRelease(struct ArrowArray* array) { } } +/// Query whether the C array stream is released +inline int ArrowArrayStreamIsReleased(const struct ArrowArrayStream* stream) { + return stream->release == NULL; +} + +/// Mark the C array stream released (for use in release callbacks) +inline void ArrowArrayStreamMarkReleased(struct ArrowArrayStream* stream) { + stream->release = NULL; +} + +/// Move the C array stream from `src` to `dest` +/// +/// Note `dest` must *not* point to a valid stream already, otherwise there +/// will be a memory leak. 
+inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, + struct ArrowArrayStream* dest) { + assert(dest != src); + assert(!ArrowArrayStreamIsReleased(src)); + memcpy(dest, src, sizeof(struct ArrowArrayStream)); + ArrowArrayStreamMarkReleased(src); +} + +/// Release the C array stream, if necessary, by calling its release callback +inline void ArrowArrayStreamRelease(struct ArrowArrayStream* stream) { + if (!ArrowArrayStreamIsReleased(stream)) { + stream->release(stream); + assert(ArrowArrayStreamIsReleased(stream)); + } +} + #ifdef __cplusplus } #endif diff --git a/cpp/src/arrow/c/util_internal.h b/cpp/src/arrow/c/util_internal.h index 3ece5245205..6a33be9b0da 100644 --- a/cpp/src/arrow/c/util_internal.h +++ b/cpp/src/arrow/c/util_internal.h @@ -34,6 +34,12 @@ struct ArrayExportTraits { static constexpr auto ReleaseFunc = &ArrowArrayRelease; }; +struct ArrayStreamExportTraits { + typedef struct ArrowArrayStream CType; + static constexpr auto IsReleasedFunc = &ArrowArrayStreamIsReleased; + static constexpr auto ReleaseFunc = &ArrowArrayStreamRelease; +}; + // A RAII-style object to release a C Array / Schema struct at block scope exit. template class ExportGuard { @@ -73,6 +79,7 @@ class ExportGuard { using SchemaExportGuard = ExportGuard; using ArrayExportGuard = ExportGuard; +using ArrayStreamExportGuard = ExportGuard; } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index a4cea6f4da4..622f5cb5c5f 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -110,11 +110,12 @@ inline bool FloatingApproxEquals(const NumericArray& left, if (opts.nans_equal()) { return BaseFloatingEquals(left, right, [epsilon](T x, T y) -> bool { - return (fabs(x - y) <= epsilon) || (std::isnan(x) && std::isnan(y)); + return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y)); }); } else { - return BaseFloatingEquals( - left, right, [epsilon](T x, T y) -> bool { return fabs(x - y) <= epsilon; }); + return BaseFloatingEquals(left, right, [epsilon](T x, T y) -> bool { + return (fabs(x - y) <= epsilon) || (x == y); + }); } } @@ -352,6 +353,10 @@ class RangeEqualsVisitor { return Visit(checked_cast(left)); } + Status Visit(const Decimal256Array& left) { + return Visit(checked_cast(left)); + } + Status Visit(const NullArray& left) { ARROW_UNUSED(left); result_ = true; @@ -805,6 +810,12 @@ class TypeEqualsVisitor { return Status::OK(); } + Status Visit(const Decimal256Type& left) { + const auto& right = checked_cast(right_); + result_ = left.precision() == right.precision() && left.scale() == right.scale(); + return Status::OK(); + } + template enable_if_t::value || is_struct_type::value, Status> Visit( const T& left) { @@ -861,7 +872,9 @@ class TypeEqualsVisitor { class ScalarEqualsVisitor { public: - explicit ScalarEqualsVisitor(const Scalar& right) : right_(right), result_(false) {} + explicit ScalarEqualsVisitor(const Scalar& right, + const EqualOptions& opts = EqualOptions::Defaults()) + : right_(right), result_(false), options_(opts) {} Status Visit(const NullScalar& left) { result_ = true; @@ -874,9 +887,26 @@ class ScalarEqualsVisitor { return Status::OK(); } + template + typename std::enable_if::value || + std::is_base_of::value, + Status>::type + Visit(const T& left_) { + const auto& right = checked_cast(right_); + if (options_.nans_equal()) { + result_ = right.value == left_.value || + (std::isnan(right.value) && std::isnan(left_.value)); + } else { + result_ = right.value == left_.value; + } + return 
Status::OK(); + } + template typename std::enable_if< - std::is_base_of, T>::value || + (std::is_base_of, T>::value && + !std::is_base_of::value && + !std::is_base_of::value) || std::is_base_of, T>::value, Status>::type Visit(const T& left_) { @@ -899,6 +929,12 @@ class ScalarEqualsVisitor { return Status::OK(); } + Status Visit(const Decimal256Scalar& left) { + const auto& right = checked_cast(right_); + result_ = left.value == right.value; + return Status::OK(); + } + Status Visit(const ListScalar& left) { const auto& right = checked_cast(right_); result_ = internal::SharedPtrEquals(left.value, right.value); @@ -967,6 +1003,7 @@ class ScalarEqualsVisitor { protected: const Scalar& right_; bool result_; + const EqualOptions options_; }; Status PrintDiff(const Array& left, const Array& right, std::ostream* os) { @@ -1385,7 +1422,7 @@ bool TypeEquals(const DataType& left, const DataType& right, bool check_metadata } } -bool ScalarEquals(const Scalar& left, const Scalar& right) { +bool ScalarEquals(const Scalar& left, const Scalar& right, const EqualOptions& options) { bool are_equal = false; if (&left == &right) { are_equal = true; @@ -1394,7 +1431,7 @@ bool ScalarEquals(const Scalar& left, const Scalar& right) { } else if (left.is_valid != right.is_valid) { are_equal = false; } else { - ScalarEqualsVisitor visitor(right); + ScalarEqualsVisitor visitor(right, options); auto error = VisitScalarInline(left, &visitor); DCHECK_OK(error); are_equal = visitor.result(); diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index abcf39a62e5..f7899b7c5c6 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -111,6 +111,8 @@ bool ARROW_EXPORT TypeEquals(const DataType& left, const DataType& right, /// Returns true if scalars are equal /// \param[in] left a Scalar /// \param[in] right a Scalar -bool ARROW_EXPORT ScalarEquals(const Scalar& left, const Scalar& right); +/// \param[in] options comparison options +bool ARROW_EXPORT ScalarEquals(const Scalar& left, const Scalar& right, + const EqualOptions& options = EqualOptions::Defaults()); } // namespace arrow diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index 97fbd17f07d..e781dff90e2 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -65,4 +65,6 @@ add_arrow_compute_test(internals_test kernel_test.cc registry_test.cc) +add_arrow_benchmark(function_benchmark PREFIX "arrow-compute") + add_subdirectory(kernels) diff --git a/cpp/src/arrow/compute/api_aggregate.cc b/cpp/src/arrow/compute/api_aggregate.cc index 2802b02105d..53ee5b9a2b2 100644 --- a/cpp/src/arrow/compute/api_aggregate.cc +++ b/cpp/src/arrow/compute/api_aggregate.cc @@ -45,5 +45,15 @@ Result Mode(const Datum& value, ExecContext* ctx) { return CallFunction("mode", {value}, ctx); } +Result Stddev(const Datum& value, const VarianceOptions& options, + ExecContext* ctx) { + return CallFunction("stddev", {value}, &options, ctx); +} + +Result Variance(const Datum& value, const VarianceOptions& options, + ExecContext* ctx) { + return CallFunction("variance", {value}, &options, ctx); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 5ae3cf9b5fe..710153740fc 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -51,11 +51,11 @@ struct ARROW_EXPORT CountOptions : public FunctionOptions { COUNT_NULL, }; - explicit CountOptions(enum Mode count_mode) : 
count_mode(count_mode) {} + explicit CountOptions(enum Mode count_mode = COUNT_NON_NULL) : count_mode(count_mode) {} static CountOptions Defaults() { return CountOptions(COUNT_NON_NULL); } - enum Mode count_mode = COUNT_NON_NULL; + enum Mode count_mode; }; /// \brief Control MinMax kernel behavior @@ -66,14 +66,26 @@ struct ARROW_EXPORT MinMaxOptions : public FunctionOptions { /// Skip null values SKIP = 0, /// Any nulls will result in null output - OUTPUT_NULL + EMIT_NULL }; explicit MinMaxOptions(enum Mode null_handling = SKIP) : null_handling(null_handling) {} static MinMaxOptions Defaults() { return MinMaxOptions{}; } - enum Mode null_handling = SKIP; + enum Mode null_handling; +}; + +/// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel +/// +/// The divisor used in calculations is N - ddof, where N is the number of elements. +/// By default, ddof is zero, and population variance or stddev is returned. +struct ARROW_EXPORT VarianceOptions : public FunctionOptions { + explicit VarianceOptions(int ddof = 0) : ddof(ddof) {} + + static VarianceOptions Defaults() { return VarianceOptions{}; } + + int ddof = 0; }; /// @} @@ -130,37 +142,48 @@ Result MinMax(const Datum& value, const MinMaxOptions& options = MinMaxOptions::Defaults(), ExecContext* ctx = NULLPTR); -/// \brief Calculate the min / max of a numeric array. +/// \brief Calculate the modal (most common) value of a numeric array /// -/// This function returns both the min and max as a collection. The resulting -/// datum thus consists of two scalar datums: {Datum(min), Datum(max)} +/// This function returns both mode and count as a struct scalar, with type +/// struct, where T is the input type. +/// If there is more than one such value, the smallest one is returned. /// -/// \param[in] array input array -/// \param[in] options see MinMaxOptions for more information +/// \param[in] value input datum, expecting Array or ChunkedArray /// \param[in] ctx the function execution context, optional -/// \return resulting datum containing a {min, max} collection +/// \return resulting datum as a struct scalar /// -/// \since 1.0.0 +/// \since 2.0.0 /// \note API not yet finalized ARROW_EXPORT -Result MinMax(const Array& array, - const MinMaxOptions& options = MinMaxOptions::Defaults(), - ExecContext* ctx = NULLPTR); +Result Mode(const Datum& value, ExecContext* ctx = NULLPTR); -/// \brief Calculate the modal (most common) value of a numeric array +/// \brief Calculate the standard deviation of a numeric array /// -/// This function returns both mode and count as a struct scalar, with type -/// struct, where T is the input type. -/// If there is more than one such value, the smallest one is returned. 
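(A worked example of the ddof behavior described above: for the values
{1, 2, 3, 4} the mean is 2.5 and the summed squared deviations are 5.0, so
ddof=0 yields 5/4 = 1.25, the population variance, while ddof=1 yields
5/3 ≈ 1.667, the sample variance; Stddev returns the square root of each.
A minimal usage sketch:)

arrow::Result<arrow::Datum> SampleVariance(const arrow::Datum& values) {
  // ddof=1 selects the sample (unbiased) estimator; the default 0 gives
  // the population variance.
  return arrow::compute::Variance(values, arrow::compute::VarianceOptions(/*ddof=*/1));
}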
+/// \param[in] value input datum, expecting Array or ChunkedArray +/// \param[in] options see VarianceOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return datum of the computed standard deviation as a DoubleScalar +/// +/// \since 2.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Stddev(const Datum& value, + const VarianceOptions& options = VarianceOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Calculate the variance of a numeric array /// -/// \param[in] value input datum, expecting Array +/// \param[in] value input datum, expecting Array or ChunkedArray +/// \param[in] options see VarianceOptions for more information /// \param[in] ctx the function execution context, optional -/// \return resulting datum as a struct scalar +/// \return datum of the computed variance as a DoubleScalar /// /// \since 2.0.0 /// \note API not yet finalized ARROW_EXPORT -Result Mode(const Datum& value, ExecContext* ctx = NULLPTR); +Result Variance(const Datum& value, + const VarianceOptions& options = VarianceOptions::Defaults(), + ExecContext* ctx = NULLPTR); } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 9a911030999..353151eade2 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -51,6 +51,7 @@ namespace compute { SCALAR_ARITHMETIC_BINARY(Add, "add", "add_checked") SCALAR_ARITHMETIC_BINARY(Subtract, "subtract", "subtract_checked") SCALAR_ARITHMETIC_BINARY(Multiply, "multiply", "multiply_checked") +SCALAR_ARITHMETIC_BINARY(Divide, "divide", "divide_checked") // ---------------------------------------------------------------------- // Set-related operations diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 80e3ebb98b3..62d52d245fb 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -49,6 +49,25 @@ struct ARROW_EXPORT MatchSubstringOptions : public FunctionOptions { std::string pattern; }; +struct ARROW_EXPORT SplitOptions : public FunctionOptions { + explicit SplitOptions(int64_t max_splits = -1, bool reverse = false) + : max_splits(max_splits), reverse(reverse) {} + + /// Maximum number of splits allowed, or unlimited when -1 + int64_t max_splits; + /// Start splitting from the end of the string (only relevant when max_splits != -1) + bool reverse; +}; + +struct ARROW_EXPORT SplitPatternOptions : public SplitOptions { + explicit SplitPatternOptions(std::string pattern, int64_t max_splits = -1, + bool reverse = false) + : SplitOptions(max_splits, reverse), pattern(std::move(pattern)) {} + + /// The exact substring to look for inside input values. + std::string pattern; +}; + /// Options for IsIn and IndexIn functions struct ARROW_EXPORT SetLookupOptions : public FunctionOptions { explicit SetLookupOptions(Datum value_set, bool skip_nulls) @@ -129,6 +148,20 @@ Result Multiply(const Datum& left, const Datum& right, ArithmeticOptions options = ArithmeticOptions(), ExecContext* ctx = NULLPTR); +/// \brief Divide two values. Array values must be the same length. If either +/// argument is null the result will be null. For integer types, if there is +/// a zero divisor, an error will be raised. 
+/// +/// \param[in] left the dividend +/// \param[in] right the divisor +/// \param[in] options arithmetic options (enable/disable overflow checking), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise quotient +ARROW_EXPORT +Result Divide(const Datum& left, const Datum& right, + ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + /// \brief Compare a numeric array with a scalar. /// /// \param[in] left datum to compare, must be an Array diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index de36202f019..2c77e8ee155 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -59,7 +59,7 @@ struct ARROW_EXPORT TakeOptions : public FunctionOptions { }; /// \brief Partitioning options for NthToIndices -struct PartitionNthOptions : public FunctionOptions { +struct ARROW_EXPORT PartitionNthOptions : public FunctionOptions { explicit PartitionNthOptions(int64_t pivot) : pivot(pivot) {} /// The index into the equivalent sorted array of the partition pivot element. diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/cast.cc index a9700f3159d..29a80f73241 100644 --- a/cpp/src/arrow/compute/cast.cc +++ b/cpp/src/arrow/compute/cast.cc @@ -38,8 +38,10 @@ using internal::ToTypeName; namespace compute { namespace internal { +namespace { + std::unordered_map> g_cast_table; -static std::once_flag cast_table_initialized; +std::once_flag cast_table_initialized; void AddCastFunctions(const std::vector>& funcs) { for (const auto& func : funcs) { @@ -57,8 +59,6 @@ void InitCastTable() { void EnsureInitCastTable() { std::call_once(cast_table_initialized, InitCastTable); } -namespace { - // Private version of GetCastFunction with better error reporting // if the input type is known. Result> GetCastFunctionInternal( @@ -78,13 +78,17 @@ Result> GetCastFunctionInternal( return it->second; } -} // namespace +const FunctionDoc cast_doc{"Cast values to another data type", + ("Behavior when values wouldn't fit in the target type\n" + "can be controlled through CastOptions."), + {"input"}, + "CastOptions"}; -// Metafunction for dispatching to appropraite CastFunction. This corresponds +// Metafunction for dispatching to appropriate CastFunction. 
This corresponds // to the standard SQL CAST(expr AS target_type) class CastMetaFunction : public MetaFunction { public: - CastMetaFunction() : MetaFunction("cast", Arity::Unary()) {} + CastMetaFunction() : MetaFunction("cast", Arity::Unary(), &cast_doc) {} Result ValidateOptions(const FunctionOptions* options) const { auto cast_options = static_cast(options); @@ -112,6 +116,8 @@ class CastMetaFunction : public MetaFunction { } }; +} // namespace + void RegisterScalarCast(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::make_shared())); } @@ -124,7 +130,7 @@ struct CastFunction::CastFunctionImpl { }; CastFunction::CastFunction(std::string name, Type::type out_type) - : ScalarFunction(std::move(name), Arity::Unary()) { + : ScalarFunction(std::move(name), Arity::Unary(), /*doc=*/nullptr) { impl_.reset(new CastFunctionImpl()); impl_->out_type = out_type; } @@ -157,7 +163,7 @@ bool CastFunction::CanCastTo(const DataType& out_type) const { return impl_->in_types.find(static_cast(out_type.id())) != impl_->in_types.end(); } -Result CastFunction::DispatchExact( +Result CastFunction::DispatchExact( const std::vector& values) const { const int passed_num_args = static_cast(values.size()); diff --git a/cpp/src/arrow/compute/cast.h b/cpp/src/arrow/compute/cast.h index 82dd357e9dd..43392ce99bf 100644 --- a/cpp/src/arrow/compute/cast.h +++ b/cpp/src/arrow/compute/cast.h @@ -98,7 +98,7 @@ class CastFunction : public ScalarFunction { bool CanCastTo(const DataType& out_type) const; - Result DispatchExact( + Result DispatchExact( const std::vector& values) const override; private: diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index fbc3693e5be..dd97119151e 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -45,6 +45,7 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/cpu_info.h" #include "arrow/util/logging.h" +#include "arrow/util/make_unique.h" namespace arrow { @@ -62,32 +63,58 @@ Result> AllocateDataBuffer(KernelContext* ctx, int64_t l if (bit_width == 1) { return ctx->AllocateBitmap(length); } else { - ARROW_CHECK_EQ(bit_width % 8, 0) - << "Only bit widths with multiple of 8 are currently supported"; - int64_t buffer_size = length * bit_width / 8; + int64_t buffer_size = BitUtil::BytesForBits(length * bit_width); return ctx->Allocate(buffer_size); } return Status::OK(); } -bool CanPreallocate(const DataType& type) { - // There are currently cases where NullType is the output type, so we disable - // any preallocation logic when this occurs - return is_fixed_width(type.id()) && type.id() != Type::NA; -} +struct BufferPreallocation { + explicit BufferPreallocation(int bit_width = -1, int added_length = 0) + : bit_width(bit_width), added_length(added_length) {} -Status GetValueDescriptors(const std::vector& args, - std::vector* descrs) { - for (const auto& arg : args) { - descrs->emplace_back(arg.descr()); + int bit_width; + int added_length; +}; + +void ComputeDataPreallocate(const DataType& type, + std::vector* widths) { + if (is_fixed_width(type.id()) && type.id() != Type::NA) { + widths->emplace_back(checked_cast(type).bit_width()); + return; + } + // Preallocate binary and list offsets + switch (type.id()) { + case Type::BINARY: + case Type::STRING: + case Type::LIST: + case Type::MAP: + widths->emplace_back(32, /*added_length=*/1); + return; + case Type::LARGE_BINARY: + case Type::LARGE_STRING: + case Type::LARGE_LIST: + widths->emplace_back(64, /*added_length=*/1); + return; + default: + break; } - return 
Status::OK(); } } // namespace namespace detail { +Status CheckAllValues(const std::vector& values) { + for (const auto& value : values) { + if (!value.is_value()) { + return Status::Invalid("Tried executing function with non-value type: ", + value.ToString()); + } + } + return Status::OK(); +} + ExecBatchIterator::ExecBatchIterator(std::vector args, int64_t length, int64_t max_chunksize) : args_(std::move(args)), @@ -184,6 +211,8 @@ bool ExecBatchIterator::Next(ExecBatch* batch) { return true; } +namespace { + bool ArrayHasNulls(const ArrayData& data) { // As discovered in ARROW-8863 (and not only for that reason) // ArrayData::null_count can -1 even when buffers[0] is nullptr. So we check @@ -393,40 +422,18 @@ class NullPropagator { bool bitmap_preallocated_ = false; }; -Status PropagateNulls(KernelContext* ctx, const ExecBatch& batch, ArrayData* output) { - DCHECK_NE(nullptr, output); - DCHECK_GT(output->buffers.size(), 0); - - if (output->type->id() == Type::NA) { - // Null output type is a no-op (rare when this would happen but we at least - // will test for it) - return Status::OK(); - } - - // This function is ONLY able to write into output with non-zero offset - // when the bitmap is preallocated. This could be a DCHECK but returning - // error Status for now for emphasis - if (output->offset != 0 && output->buffers[0] == nullptr) { - return Status::Invalid( - "Can only propagate nulls into pre-allocated memory " - "when the output offset is non-zero"); - } - NullPropagator propagator(ctx, batch, output); - return propagator.Execute(); -} - std::shared_ptr ToChunkedArray(const std::vector& values, const std::shared_ptr& type) { std::vector> arrays; - for (const auto& val : values) { - auto boxed = val.make_array(); - if (boxed->length() == 0) { + arrays.reserve(values.size()); + for (const Datum& val : values) { + if (val.length() == 0) { // Skip empty chunks continue; } - arrays.emplace_back(std::move(boxed)); + arrays.emplace_back(val.make_array()); } - return std::make_shared(arrays, type); + return std::make_shared(std::move(arrays), type); } bool HaveChunkedArray(const std::vector& values) { @@ -438,106 +445,70 @@ bool HaveChunkedArray(const std::vector& values) { return false; } -Status CheckAllValues(const std::vector& values) { - for (const auto& value : values) { - if (!value.is_value()) { - return Status::Invalid("Tried executing function with non-value type: ", - value.ToString()); - } - } - return Status::OK(); -} - -template -class FunctionExecutorImpl : public FunctionExecutor { +template +class KernelExecutorImpl : public KernelExecutor { public: - FunctionExecutorImpl(ExecContext* exec_ctx, const FunctionType* func, - const FunctionOptions* options) - : exec_ctx_(exec_ctx), kernel_ctx_(exec_ctx), func_(func), options_(options) {} - - protected: - using KernelType = typename FunctionType::KernelType; + Status Init(KernelContext* kernel_ctx, KernelInitArgs args) override { + kernel_ctx_ = kernel_ctx; + kernel_ = static_cast(args.kernel); - void Reset() {} + // Resolve the output descriptor for this kernel + ARROW_ASSIGN_OR_RAISE( + output_descr_, kernel_->signature->out_type().Resolve(kernel_ctx_, args.inputs)); - Status InitState() { - // Some kernels require initialization of an opaque state object - if (kernel_->init) { - KernelInitArgs init_args{kernel_, input_descrs_, options_}; - state_ = kernel_->init(&kernel_ctx_, init_args); - ARROW_CTX_RETURN_IF_ERROR(&kernel_ctx_); - kernel_ctx_.SetState(state_.get()); - } return Status::OK(); } + protected: // This is 
overridden by the VectorExecutor virtual Status SetupArgIteration(const std::vector& args) { - ARROW_ASSIGN_OR_RAISE(batch_iterator_, - ExecBatchIterator::Make(args, exec_ctx_->exec_chunksize())); + ARROW_ASSIGN_OR_RAISE( + batch_iterator_, ExecBatchIterator::Make(args, exec_context()->exec_chunksize())); return Status::OK(); } - Status BindArgs(const std::vector& args) { - RETURN_NOT_OK(GetValueDescriptors(args, &input_descrs_)); - ARROW_ASSIGN_OR_RAISE(kernel_, func_->DispatchExact(input_descrs_)); - - // Initialize kernel state, since type resolution may depend on this state - RETURN_NOT_OK(this->InitState()); - - // Resolve the output descriptor for this kernel - ARROW_ASSIGN_OR_RAISE(output_descr_, kernel_->signature->out_type().Resolve( - &kernel_ctx_, input_descrs_)); - - return SetupArgIteration(args); - } - Result> PrepareOutput(int64_t length) { auto out = std::make_shared(output_descr_.type, length); out->buffers.resize(output_num_buffers_); if (validity_preallocated_) { - ARROW_ASSIGN_OR_RAISE(out->buffers[0], kernel_ctx_.AllocateBitmap(length)); - } - if (data_preallocated_) { - const auto& fw_type = checked_cast(*out->type); - ARROW_ASSIGN_OR_RAISE( - out->buffers[1], AllocateDataBuffer(&kernel_ctx_, length, fw_type.bit_width())); + ARROW_ASSIGN_OR_RAISE(out->buffers[0], kernel_ctx_->AllocateBitmap(length)); + } + for (size_t i = 0; i < data_preallocated_.size(); ++i) { + const auto& prealloc = data_preallocated_[i]; + if (prealloc.bit_width >= 0) { + ARROW_ASSIGN_OR_RAISE( + out->buffers[i + 1], + AllocateDataBuffer(kernel_ctx_, length + prealloc.added_length, + prealloc.bit_width)); + } } return out; } - ValueDescr output_descr() const override { return output_descr_; } + ExecContext* exec_context() { return kernel_ctx_->exec_context(); } + KernelState* state() { return kernel_ctx_->state(); } // Not all of these members are used for every executor type - ExecContext* exec_ctx_; - KernelContext kernel_ctx_; - const FunctionType* func_; + KernelContext* kernel_ctx_; const KernelType* kernel_; std::unique_ptr batch_iterator_; - std::unique_ptr state_; - std::vector input_descrs_; ValueDescr output_descr_; - const FunctionOptions* options_; int output_num_buffers_; - // If true, then the kernel writes into a preallocated data buffer - bool data_preallocated_ = false; - // If true, then memory is preallocated for the validity bitmap with the same // strategy as the data buffer(s). bool validity_preallocated_ = false; + + // The kernel writes into data buffers preallocated for these bit widths + // (0 indicates no preallocation); + std::vector data_preallocated_; }; -class ScalarExecutor : public FunctionExecutorImpl { +class ScalarExecutor : public KernelExecutorImpl { public: - using FunctionType = ScalarFunction; - static constexpr Function::Kind function_kind = Function::SCALAR; - using BASE = FunctionExecutorImpl; - using BASE::BASE; - Status Execute(const std::vector& args, ExecListener* listener) override { RETURN_NOT_OK(PrepareExecute(args)); ExecBatch batch; @@ -574,7 +545,9 @@ class ScalarExecutor : public FunctionExecutorImpl { } else { // XXX: In the case where no outputs are omitted, is returning a 0-length // array always the correct move? 
- return MakeArrayOfNull(output_descr_.type, /*length=*/0).ValueOrDie(); + return MakeArrayOfNull(output_descr_.type, /*length=*/0, + exec_context()->memory_pool()) + .ValueOrDie(); } } } @@ -587,7 +560,7 @@ class ScalarExecutor : public FunctionExecutorImpl { if (output_descr_.shape == ValueDescr::ARRAY) { ArrayData* out_arr = out.mutable_array(); if (kernel_->null_handling == NullHandling::INTERSECTION) { - RETURN_NOT_OK(PropagateNulls(&kernel_ctx_, batch, out_arr)); + RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out_arr)); } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) { out_arr->null_count = 0; } @@ -602,8 +575,8 @@ class ScalarExecutor : public FunctionExecutorImpl { } } - kernel_->exec(&kernel_ctx_, batch, &out); - ARROW_CTX_RETURN_IF_ERROR(&kernel_ctx_); + kernel_->exec(kernel_ctx_, batch, &out); + ARROW_CTX_RETURN_IF_ERROR(kernel_ctx_); if (!preallocate_contiguous_) { // If we are producing chunked output rather than one big array, then // emit each chunk as soon as it's available @@ -613,8 +586,7 @@ class ScalarExecutor : public FunctionExecutorImpl { } Status PrepareExecute(const std::vector& args) { - this->Reset(); - RETURN_NOT_OK(this->BindArgs(args)); + RETURN_NOT_OK(this->SetupArgIteration(args)); if (output_descr_.shape == ValueDescr::ARRAY) { // If the executor is configured to produce a single large Array output for @@ -675,24 +647,27 @@ class ScalarExecutor : public FunctionExecutorImpl { output_num_buffers_ = static_cast(output_descr_.type->layout().buffers.size()); // Decide if we need to preallocate memory for this kernel - data_preallocated_ = ((kernel_->mem_allocation == MemAllocation::PREALLOCATE) && - CanPreallocate(*output_descr_.type)); validity_preallocated_ = (kernel_->null_handling != NullHandling::COMPUTED_NO_PREALLOCATE && kernel_->null_handling != NullHandling::OUTPUT_NOT_NULL); + if (kernel_->mem_allocation == MemAllocation::PREALLOCATE) { + ComputeDataPreallocate(*output_descr_.type, &data_preallocated_); + } - // Contiguous preallocation only possible if both the VALIDITY and DATA can - // be preallocated. Otherwise, we must go chunk-by-chunk. Note that when - // the DATA cannot be preallocated, the VALIDITY may still be preallocated - // depending on the NullHandling of the kernel + // Contiguous preallocation only possible on non-nested types if all + // buffers are preallocated. Otherwise, we must go chunk-by-chunk. // - // Some kernels are unable to write into sliced outputs, so we respect the - // kernel's attributes + // Some kernels are also unable to write into sliced outputs, so we respect the + // kernel's attributes. 
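+ // (Worked example of the condition below, as a sketch: an int32 output has
+ // two buffers, validity + data, and ComputeDataPreallocate records a single
+ // width-32 entry, so one contiguous output can be preallocated. A utf8
+ // output has three buffers, validity + offsets + data, but only the offsets
+ // are recorded (width 32, added_length 1, i.e. length + 1 slots), so
+ // data_preallocated_.size() == 1 != output_num_buffers_ - 1 == 2 and
+ // execution falls back to chunked output.)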
preallocate_contiguous_ = - (exec_ctx_->preallocate_contiguous() && kernel_->can_write_into_slices && - data_preallocated_ && validity_preallocated_); + (exec_context()->preallocate_contiguous() && kernel_->can_write_into_slices && + validity_preallocated_ && !is_nested(output_descr_.type->id()) && + data_preallocated_.size() == static_cast(output_num_buffers_ - 1) && + std::all_of(data_preallocated_.begin(), data_preallocated_.end(), + [](const BufferPreallocation& prealloc) { + return prealloc.bit_width >= 0; + })); if (preallocate_contiguous_) { - DCHECK_EQ(2, output_num_buffers_); ARROW_ASSIGN_OR_RAISE(preallocated_, PrepareOutput(total_length)); } return Status::OK(); @@ -727,13 +702,8 @@ Status PackBatchNoChunks(const std::vector& args, ExecBatch* out) { return Status::OK(); } -class VectorExecutor : public FunctionExecutorImpl { +class VectorExecutor : public KernelExecutorImpl { public: - using FunctionType = VectorFunction; - static constexpr Function::Kind function_kind = Function::VECTOR; - using BASE = FunctionExecutorImpl; - using BASE::BASE; - Status Execute(const std::vector& args, ExecListener* listener) override { RETURN_NOT_OK(PrepareExecute(args)); ExecBatch batch; @@ -784,10 +754,10 @@ class VectorExecutor : public FunctionExecutorImpl { if (kernel_->null_handling == NullHandling::INTERSECTION && output_descr_.shape == ValueDescr::ARRAY) { - RETURN_NOT_OK(PropagateNulls(&kernel_ctx_, batch, out.mutable_array())); + RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out.mutable_array())); } - kernel_->exec(&kernel_ctx_, batch, &out); - ARROW_CTX_RETURN_IF_ERROR(&kernel_ctx_); + kernel_->exec(kernel_ctx_, batch, &out); + ARROW_CTX_RETURN_IF_ERROR(kernel_ctx_); if (!kernel_->finalize) { // If there is no result finalizer (e.g. for hash-based functions, we can // emit the processed batch right away rather than waiting @@ -802,8 +772,8 @@ class VectorExecutor : public FunctionExecutorImpl { if (kernel_->finalize) { // Intermediate results require post-processing after the execution is // completed (possibly involving some accumulated state) - kernel_->finalize(&kernel_ctx_, &results_); - ARROW_CTX_RETURN_IF_ERROR(&kernel_ctx_); + kernel_->finalize(kernel_ctx_, &results_); + ARROW_CTX_RETURN_IF_ERROR(kernel_ctx_); for (const auto& result : results_) { RETURN_NOT_OK(listener->OnResult(result)); } @@ -813,38 +783,39 @@ class VectorExecutor : public FunctionExecutorImpl { Status SetupArgIteration(const std::vector& args) override { if (kernel_->can_execute_chunkwise) { - ARROW_ASSIGN_OR_RAISE(batch_iterator_, - ExecBatchIterator::Make(args, exec_ctx_->exec_chunksize())); + ARROW_ASSIGN_OR_RAISE(batch_iterator_, ExecBatchIterator::Make( + args, exec_context()->exec_chunksize())); } return Status::OK(); } Status PrepareExecute(const std::vector& args) { - this->Reset(); - RETURN_NOT_OK(this->BindArgs(args)); + RETURN_NOT_OK(this->SetupArgIteration(args)); output_num_buffers_ = static_cast(output_descr_.type->layout().buffers.size()); // Decide if we need to preallocate memory for this kernel - data_preallocated_ = ((kernel_->mem_allocation == MemAllocation::PREALLOCATE) && - CanPreallocate(*output_descr_.type)); validity_preallocated_ = (kernel_->null_handling != NullHandling::COMPUTED_NO_PREALLOCATE && kernel_->null_handling != NullHandling::OUTPUT_NOT_NULL); + if (kernel_->mem_allocation == MemAllocation::PREALLOCATE) { + ComputeDataPreallocate(*output_descr_.type, &data_preallocated_); + } return Status::OK(); } std::vector results_; }; -class ScalarAggExecutor : public 
FunctionExecutorImpl { +class ScalarAggExecutor : public KernelExecutorImpl { public: - using FunctionType = ScalarAggregateFunction; - static constexpr Function::Kind function_kind = Function::SCALAR_AGGREGATE; - using BASE = FunctionExecutorImpl; - using BASE::BASE; + Status Init(KernelContext* ctx, KernelInitArgs args) override { + input_descrs_ = &args.inputs; + options_ = args.options; + return KernelExecutorImpl::Init(ctx, args); + } Status Execute(const std::vector& args, ExecListener* listener) override { - RETURN_NOT_OK(BindArgs(args)); + RETURN_NOT_OK(this->SetupArgIteration(args)); ExecBatch batch; while (batch_iterator_->Next(&batch)) { @@ -855,8 +826,8 @@ class ScalarAggExecutor : public FunctionExecutorImpl { } Datum out; - kernel_->finalize(&kernel_ctx_, &out); - ARROW_CTX_RETURN_IF_ERROR(&kernel_ctx_); + kernel_->finalize(kernel_ctx_, &out); + ARROW_CTX_RETURN_IF_ERROR(kernel_ctx_); RETURN_NOT_OK(listener->OnResult(std::move(out))); return Status::OK(); } @@ -869,51 +840,74 @@ class ScalarAggExecutor : public FunctionExecutorImpl { private: Status Consume(const ExecBatch& batch) { - KernelInitArgs init_args{kernel_, input_descrs_, options_}; - auto batch_state = kernel_->init(&kernel_ctx_, init_args); - ARROW_CTX_RETURN_IF_ERROR(&kernel_ctx_); + auto batch_state = kernel_->init(kernel_ctx_, {kernel_, *input_descrs_, options_}); + ARROW_CTX_RETURN_IF_ERROR(kernel_ctx_); if (batch_state == nullptr) { - kernel_ctx_.SetStatus( + kernel_ctx_->SetStatus( Status::Invalid("ScalarAggregation requires non-null kernel state")); - return kernel_ctx_.status(); + return kernel_ctx_->status(); } - KernelContext batch_ctx(exec_ctx_); + KernelContext batch_ctx(exec_context()); batch_ctx.SetState(batch_state.get()); kernel_->consume(&batch_ctx, batch); ARROW_CTX_RETURN_IF_ERROR(&batch_ctx); - kernel_->merge(&kernel_ctx_, *batch_state, state_.get()); - ARROW_CTX_RETURN_IF_ERROR(&kernel_ctx_); + kernel_->merge(kernel_ctx_, std::move(*batch_state), state()); + ARROW_CTX_RETURN_IF_ERROR(kernel_ctx_); return Status::OK(); } + + const std::vector* input_descrs_; + const FunctionOptions* options_; }; template -Result> MakeExecutor(ExecContext* ctx, - const Function* func, - const FunctionOptions* options) { +Result> MakeExecutor(ExecContext* ctx, + const Function* func, + const FunctionOptions* options) { DCHECK_EQ(ExecutorType::function_kind, func->kind()); auto typed_func = checked_cast(func); - return std::unique_ptr(new ExecutorType(ctx, typed_func, options)); + return std::unique_ptr(new ExecutorType(ctx, typed_func, options)); } -Result> FunctionExecutor::Make( - ExecContext* ctx, const Function* func, const FunctionOptions* options) { - switch (func->kind()) { - case Function::SCALAR: - return MakeExecutor(ctx, func, options); - case Function::VECTOR: - return MakeExecutor(ctx, func, options); - case Function::SCALAR_AGGREGATE: - return MakeExecutor(ctx, func, options); - default: - DCHECK(false); - return nullptr; +} // namespace + +Status PropagateNulls(KernelContext* ctx, const ExecBatch& batch, ArrayData* output) { + DCHECK_NE(nullptr, output); + DCHECK_GT(output->buffers.size(), 0); + + if (output->type->id() == Type::NA) { + // Null output type is a no-op (rare when this would happen but we at least + // will test for it) + return Status::OK(); + } + + // This function is ONLY able to write into output with non-zero offset + // when the bitmap is preallocated. 
This could be a DCHECK but returning + // error Status for now for emphasis + if (output->offset != 0 && output->buffers[0] == nullptr) { + return Status::Invalid( + "Can only propagate nulls into pre-allocated memory " + "when the output offset is non-zero"); } + NullPropagator propagator(ctx, batch, output); + return propagator.Execute(); +} + +std::unique_ptr KernelExecutor::MakeScalar() { + return ::arrow::internal::make_unique(); +} + +std::unique_ptr KernelExecutor::MakeVector() { + return ::arrow::internal::make_unique(); +} + +std::unique_ptr KernelExecutor::MakeScalarAggregate() { + return ::arrow::internal::make_unique(); } } // namespace detail @@ -952,9 +946,6 @@ Result CallFunction(const std::string& func_name, const std::vector func, ctx->func_registry()->GetFunction(func_name)); - if (options == nullptr) { - options = func->default_options(); - } return func->Execute(args, options, ctx); } diff --git a/cpp/src/arrow/compute/exec_internal.h b/cpp/src/arrow/compute/exec_internal.h index 507cd1703a8..8bad135e40d 100644 --- a/cpp/src/arrow/compute/exec_internal.h +++ b/cpp/src/arrow/compute/exec_internal.h @@ -102,22 +102,22 @@ class DatumAccumulator : public ExecListener { /// inputs will be split into non-chunked ExecBatch values for execution Status CheckAllValues(const std::vector& values); -class ARROW_EXPORT FunctionExecutor { +class ARROW_EXPORT KernelExecutor { public: - virtual ~FunctionExecutor() = default; + virtual ~KernelExecutor() = default; + + virtual Status Init(KernelContext*, KernelInitArgs) = 0; /// XXX: Better configurability for listener /// Not thread-safe virtual Status Execute(const std::vector& args, ExecListener* listener) = 0; - virtual ValueDescr output_descr() const = 0; - virtual Datum WrapResults(const std::vector& args, const std::vector& outputs) = 0; - static Result> Make(ExecContext* ctx, - const Function* func, - const FunctionOptions* options); + static std::unique_ptr MakeScalar(); + static std::unique_ptr MakeVector(); + static std::unique_ptr MakeScalarAggregate(); }; /// \brief Populate validity bitmap with the intersection of the nullity of the diff --git a/cpp/src/arrow/compute/exec_test.cc b/cpp/src/arrow/compute/exec_test.cc index 75a2089b3dd..e9bd57596b5 100644 --- a/cpp/src/arrow/compute/exec_test.cc +++ b/cpp/src/arrow/compute/exec_test.cc @@ -648,7 +648,8 @@ class TestCallScalarFunction : public TestComputeInternals { // This function simply copies memory from the input argument into the // (preallocated) output - auto func = std::make_shared("test_copy", Arity::Unary()); + auto func = + std::make_shared("test_copy", Arity::Unary(), /*doc=*/nullptr); // Add a few kernels. Our implementation only accepts arrays ASSERT_OK(func->AddKernel({InputType::Array(uint8())}, uint8(), ExecCopy)); @@ -657,8 +658,8 @@ class TestCallScalarFunction : public TestComputeInternals { ASSERT_OK(registry->AddFunction(func)); // A version which doesn't want the executor to call PropagateNulls - auto func2 = - std::make_shared("test_copy_computed_bitmap", Arity::Unary()); + auto func2 = std::make_shared("test_copy_computed_bitmap", + Arity::Unary(), /*doc=*/nullptr); ScalarKernel kernel({InputType::Array(uint8())}, uint8(), ExecComputedBitmap); kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE; ASSERT_OK(func2->AddKernel(kernel)); @@ -670,9 +671,10 @@ class TestCallScalarFunction : public TestComputeInternals { // A function that allocates its own output memory. 
We have cases for both // non-preallocated data and non-preallocated validity bitmap - auto f1 = std::make_shared("test_nopre_data", Arity::Unary()); - auto f2 = - std::make_shared("test_nopre_validity_or_data", Arity::Unary()); + auto f1 = std::make_shared("test_nopre_data", Arity::Unary(), + /*doc=*/nullptr); + auto f2 = std::make_shared("test_nopre_validity_or_data", + Arity::Unary(), /*doc=*/nullptr); ScalarKernel kernel({InputType::Array(uint8())}, uint8(), ExecNoPreallocatedData); kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; @@ -691,7 +693,8 @@ class TestCallScalarFunction : public TestComputeInternals { // This function's behavior depends on a static parameter that is made // available to the kernel's execution function through its Options object - auto func = std::make_shared("test_stateful", Arity::Unary()); + auto func = std::make_shared("test_stateful", Arity::Unary(), + /*doc=*/nullptr); ScalarKernel kernel({InputType::Array(int32())}, int32(), ExecStateful, InitStateful); ASSERT_OK(func->AddKernel(kernel)); @@ -701,8 +704,8 @@ class TestCallScalarFunction : public TestComputeInternals { void AddScalarFunction() { auto registry = GetFunctionRegistry(); - auto func = - std::make_shared("test_scalar_add_int32", Arity::Binary()); + auto func = std::make_shared("test_scalar_add_int32", Arity::Binary(), + /*doc=*/nullptr); ASSERT_OK(func->AddKernel({InputType::Scalar(int32()), InputType::Scalar(int32())}, int32(), ExecAddInt32)); ASSERT_OK(registry->AddFunction(func)); diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc index 41c3e360a07..2d3e06e2fb2 100644 --- a/cpp/src/arrow/compute/function.cc +++ b/cpp/src/arrow/compute/function.cc @@ -29,6 +29,10 @@ namespace arrow { namespace compute { +static const FunctionDoc kEmptyFunctionDoc{}; + +const FunctionDoc& FunctionDoc::Empty() { return kEmptyFunctionDoc; } + Status Function::CheckArity(int passed_num_args) const { if (arity_.is_varargs && passed_num_args < arity_.num_args) { return Status::Invalid("VarArgs function needs at least ", arity_.num_args, @@ -103,6 +107,9 @@ Result DispatchExactImpl(const Function& func, Result Function::Execute(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const { + if (options == nullptr) { + options = default_options(); + } if (ctx == nullptr) { ExecContext default_ctx; return Execute(args, options, &default_ctx); @@ -110,13 +117,47 @@ Result Function::Execute(const std::vector& args, // type-check Datum arguments here. 
Really we'd like to avoid this as much as // possible RETURN_NOT_OK(detail::CheckAllValues(args)); - ARROW_ASSIGN_OR_RAISE(auto executor, - detail::FunctionExecutor::Make(ctx, this, options)); + std::vector inputs(args.size()); + for (size_t i = 0; i != args.size(); ++i) { + inputs[i] = args[i].descr(); + } + + ARROW_ASSIGN_OR_RAISE(auto kernel, DispatchExact(inputs)); + std::unique_ptr state; + + KernelContext kernel_ctx{ctx}; + if (kernel->init) { + state = kernel->init(&kernel_ctx, {kernel, inputs, options}); + RETURN_NOT_OK(kernel_ctx.status()); + kernel_ctx.SetState(state.get()); + } + + std::unique_ptr executor; + if (kind() == Function::SCALAR) { + executor = detail::KernelExecutor::MakeScalar(); + } else if (kind() == Function::VECTOR) { + executor = detail::KernelExecutor::MakeVector(); + } else { + executor = detail::KernelExecutor::MakeScalarAggregate(); + } + RETURN_NOT_OK(executor->Init(&kernel_ctx, {kernel, inputs, options})); + auto listener = std::make_shared(); RETURN_NOT_OK(executor->Execute(args, listener.get())); return executor->WrapResults(args, listener->values()); } +Status Function::Validate() const { + if (!doc_->summary.empty()) { + // Documentation given, check its contents + if (static_cast(doc_->arg_names.size()) != arity_.num_args) { + return Status::Invalid("In function '", name_, + "': ", "number of argument names != function arity"); + } + } + return Status::OK(); +} + Status ScalarFunction::AddKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, KernelInit init) { RETURN_NOT_OK(CheckArity(static_cast(in_types.size()))); @@ -139,7 +180,7 @@ Status ScalarFunction::AddKernel(ScalarKernel kernel) { return Status::OK(); } -Result ScalarFunction::DispatchExact( +Result ScalarFunction::DispatchExact( const std::vector& values) const { return DispatchExactImpl(*this, kernels_, values); } @@ -166,7 +207,7 @@ Status VectorFunction::AddKernel(VectorKernel kernel) { return Status::OK(); } -Result VectorFunction::DispatchExact( +Result VectorFunction::DispatchExact( const std::vector& values) const { return DispatchExactImpl(*this, kernels_, values); } @@ -180,7 +221,7 @@ Status ScalarAggregateFunction::AddKernel(ScalarAggregateKernel kernel) { return Status::OK(); } -Result ScalarAggregateFunction::DispatchExact( +Result ScalarAggregateFunction::DispatchExact( const std::vector& values) const { return DispatchExactImpl(*this, kernels_, values); } @@ -189,6 +230,9 @@ Result MetaFunction::Execute(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const { RETURN_NOT_OK(CheckArity(static_cast(args.size()))); + if (options == nullptr) { + options = default_options(); + } return ExecuteImpl(args, options, ctx); } diff --git a/cpp/src/arrow/compute/function.h b/cpp/src/arrow/compute/function.h index 93a200ee212..a71dbe40292 100644 --- a/cpp/src/arrow/compute/function.h +++ b/cpp/src/arrow/compute/function.h @@ -65,7 +65,8 @@ struct ARROW_EXPORT Arity { /// invoking the function static Arity VarArgs(int min_args = 0) { return Arity(min_args, true); } - explicit Arity(int num_args, bool is_varargs = false) + // NOTE: the 0-argument form (default constructor) is required for Cython + explicit Arity(int num_args = 0, bool is_varargs = false) : num_args(num_args), is_varargs(is_varargs) {} /// The number of required arguments (or the minimum number for varargs @@ -76,6 +77,37 @@ struct ARROW_EXPORT Arity { bool is_varargs = false; }; +struct ARROW_EXPORT FunctionDoc { + /// \brief A one-line summary of the function, using a verb. 
+ /// + /// For example, "Add two numeric arrays or scalars". + std::string summary; + + /// \brief A detailed description of the function, meant to follow the summary. + std::string description; + + /// \brief Symbolic names (identifiers) for the function arguments. + /// + /// Some bindings may use this to generate nicer function signatures. + std::vector arg_names; + + // TODO add argument descriptions? + + /// \brief Name of the options class, if any. + std::string options_class; + + FunctionDoc() {} + + FunctionDoc(std::string summary, std::string description, + std::vector arg_names, std::string options_class = "") + : summary(std::move(summary)), + description(std::move(description)), + arg_names(std::move(arg_names)), + options_class(std::move(options_class)) {} + + static const FunctionDoc& Empty(); +}; + /// \brief Base class for compute functions. Function implementations contain a /// collection of "kernels" which are implementations of the function for /// specific argument types. Selecting a viable kernel for executing a function @@ -117,15 +149,24 @@ class ARROW_EXPORT Function { /// function accepts variable numbers of arguments. const Arity& arity() const { return arity_; } + /// \brief Return the function documentation + const FunctionDoc& doc() const { return *doc_; } + /// \brief Returns the number of registered kernels for this function. virtual int num_kernels() const = 0; + /// \brief Return a kernel that can execute the function given the exact + /// argument types (without implicit type casts or scalar->array promotions). + /// + /// NB: This function is overridden in CastFunction. + virtual Result DispatchExact( + const std::vector& values) const = 0; + /// \brief Execute the function eagerly with the passed input arguments with /// kernel dispatch, batch iteration, and memory allocation details taken /// care of. /// - /// Function implementations may assume that options is non-null and valid - /// or to forgo options and accept only nullptr for that argument. + /// If the `options` pointer is null, then `default_options()` will be used. /// /// This function can be overridden in subclasses. virtual Result Execute(const std::vector& args, @@ -137,12 +178,15 @@ class ARROW_EXPORT Function { /// that default_options() is valid to pass to Execute as options. const FunctionOptions* default_options() const { return default_options_; } + virtual Status Validate() const; + protected: Function(std::string name, Function::Kind kind, const Arity& arity, - const FunctionOptions* default_options) + const FunctionDoc* doc, const FunctionOptions* default_options) : name_(std::move(name)), kind_(kind), arity_(arity), + doc_(doc ? 
doc : &FunctionDoc::Empty()), default_options_(default_options) {} Status CheckArity(int passed_num_args) const; @@ -150,6 +194,7 @@ class ARROW_EXPORT Function { std::string name_; Function::Kind kind_; Arity arity_; + const FunctionDoc* doc_; const FunctionOptions* default_options_ = NULLPTR; }; @@ -171,8 +216,8 @@ class FunctionImpl : public Function { protected: FunctionImpl(std::string name, Function::Kind kind, const Arity& arity, - const FunctionOptions* default_options) - : Function(std::move(name), kind, arity, default_options) {} + const FunctionDoc* doc, const FunctionOptions* default_options) + : Function(std::move(name), kind, arity, doc, default_options) {} std::vector kernels_; }; @@ -188,9 +233,9 @@ class ARROW_EXPORT ScalarFunction : public detail::FunctionImpl { public: using KernelType = ScalarKernel; - ScalarFunction(std::string name, const Arity& arity, + ScalarFunction(std::string name, const Arity& arity, const FunctionDoc* doc, const FunctionOptions* default_options = NULLPTR) - : detail::FunctionImpl(std::move(name), Function::SCALAR, arity, + : detail::FunctionImpl(std::move(name), Function::SCALAR, arity, doc, default_options) {} /// \brief Add a kernel with given input/output types, no required state @@ -203,12 +248,8 @@ class ARROW_EXPORT ScalarFunction : public detail::FunctionImpl { /// kernel's signature does not match the function's arity. Status AddKernel(ScalarKernel kernel); - /// \brief Return a kernel that can execute the function given the exact - /// argument types (without implicit type casts or scalar->array promotions). - /// - /// NB: This function is overridden in CastFunction. - virtual Result DispatchExact( - const std::vector& values) const; + Result DispatchExact( + const std::vector& values) const override; }; /// \brief A function that executes general array operations that may yield @@ -219,9 +260,9 @@ class ARROW_EXPORT VectorFunction : public detail::FunctionImpl { public: using KernelType = VectorKernel; - VectorFunction(std::string name, const Arity& arity, + VectorFunction(std::string name, const Arity& arity, const FunctionDoc* doc, const FunctionOptions* default_options = NULLPTR) - : detail::FunctionImpl(std::move(name), Function::VECTOR, arity, + : detail::FunctionImpl(std::move(name), Function::VECTOR, arity, doc, default_options) {} /// \brief Add a simple kernel with given input/output types, no required @@ -234,9 +275,8 @@ class ARROW_EXPORT VectorFunction : public detail::FunctionImpl { /// kernel's signature does not match the function's arity. Status AddKernel(VectorKernel kernel); - /// \brief Return a kernel that can execute the function given the exact - /// argument types (without implicit type casts or scalar->array promotions) - Result DispatchExact(const std::vector& values) const; + Result DispatchExact( + const std::vector& values) const override; }; class ARROW_EXPORT ScalarAggregateFunction @@ -244,19 +284,17 @@ class ARROW_EXPORT ScalarAggregateFunction public: using KernelType = ScalarAggregateKernel; - ScalarAggregateFunction(std::string name, const Arity& arity, + ScalarAggregateFunction(std::string name, const Arity& arity, const FunctionDoc* doc, const FunctionOptions* default_options = NULLPTR) : detail::FunctionImpl( - std::move(name), Function::SCALAR_AGGREGATE, arity, default_options) {} + std::move(name), Function::SCALAR_AGGREGATE, arity, doc, default_options) {} /// \brief Add a kernel (function implementation). Returns error if the /// kernel's signature does not match the function's arity. 
Status AddKernel(ScalarAggregateKernel kernel); - /// \brief Return a kernel that can execute the function given the exact - /// argument types (without implicit type casts or scalar->array promotions) - Result DispatchExact( - const std::vector& values) const; + Result DispatchExact( + const std::vector& values) const override; }; /// \brief A function that dispatches to other functions. Must implement @@ -271,14 +309,18 @@ class ARROW_EXPORT MetaFunction : public Function { Result Execute(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const override; + Result DispatchExact(const std::vector&) const override { + return Status::NotImplemented("DispatchExact for a MetaFunction's Kernels"); + } + protected: virtual Result ExecuteImpl(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const = 0; - MetaFunction(std::string name, const Arity& arity, + MetaFunction(std::string name, const Arity& arity, const FunctionDoc* doc, const FunctionOptions* default_options = NULLPTR) - : Function(std::move(name), Function::META, arity, default_options) {} + : Function(std::move(name), Function::META, arity, doc, default_options) {} }; /// @} diff --git a/cpp/src/arrow/compute/function_benchmark.cc b/cpp/src/arrow/compute/function_benchmark.cc new file mode 100644 index 00000000000..e2214f85174 --- /dev/null +++ b/cpp/src/arrow/compute/function_benchmark.cc @@ -0,0 +1,117 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
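Two notes on the FunctionDoc plumbing above: a null doc pointer falls back to the shared FunctionDoc::Empty() instance, and the new Function::Validate() only checks documented functions, requiring the number of argument names to equal the function's arity. A minimal standalone sketch of that consistency check, using simplified stand-in types rather than Arrow's actual headers:

    #include <cassert>
    #include <string>
    #include <vector>

    // Simplified stand-ins for the Arrow types above (illustrative only).
    struct Arity {
      int num_args;
      bool is_varargs;
    };

    struct FunctionDoc {
      std::string summary;
      std::string description;
      std::vector<std::string> arg_names;
    };

    // Mirrors the shape of Function::Validate(): an empty doc is always
    // accepted (which is what the FunctionDoc::Empty() fallback yields);
    // a documented function must name exactly arity() arguments.
    bool ValidateDoc(const FunctionDoc& doc, const Arity& arity) {
      if (doc.summary.empty()) return true;
      return static_cast<int>(doc.arg_names.size()) == arity.num_args;
    }

    int main() {
      const FunctionDoc add_doc{"Add two numeric arrays or scalars",
                                "Both inputs must be of the same type.",
                                {"x", "y"}};
      assert(ValidateDoc(add_doc, Arity{2, false}));       // names match binary arity
      assert(!ValidateDoc(add_doc, Arity{3, false}));      // mismatch is rejected
      assert(ValidateDoc(FunctionDoc{}, Arity{1, false})); // undocumented is allowed
      return 0;
    }

Validating lazily keeps undocumented functions cheap to construct, which is why the test fixtures in this diff can simply pass /*doc=*/nullptr.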
+ +#include "benchmark/benchmark.h" + +#include "arrow/array/array_base.h" +#include "arrow/compute/api.h" +#include "arrow/memory_pool.h" +#include "arrow/scalar.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/util/benchmark_util.h" + +namespace arrow { + +using internal::checked_cast; + +namespace compute { + +constexpr int32_t kSeed = 0xfede4a7e; +constexpr int64_t kScalarCount = 1 << 10; + +inline ScalarVector ToScalars(std::shared_ptr arr) { + ScalarVector scalars{static_cast(arr->length())}; + int64_t i = 0; + for (auto& scalar : scalars) { + scalar = arr->GetScalar(i++).ValueOrDie(); + } + return scalars; +} + +void BM_CastDispatch(benchmark::State& state) { // NOLINT non-const reference + // Repeatedly invoke a trivial Cast: the main cost should be dispatch + random::RandomArrayGenerator rag(kSeed); + + auto int_scalars = ToScalars(rag.Int64(kScalarCount, 0, 1 << 20)); + + auto double_type = float64(); + for (auto _ : state) { + Datum timestamp_scalar; + for (Datum int_scalar : int_scalars) { + ASSERT_OK_AND_ASSIGN(timestamp_scalar, Cast(int_scalar, double_type)); + } + benchmark::DoNotOptimize(timestamp_scalar); + } + + state.SetItemsProcessed(state.iterations() * kScalarCount); +} + +void BM_CastDispatchBaseline(benchmark::State& state) { // NOLINT non-const reference + // Repeatedly invoke a trivial Cast with all dispatch outside the hot loop + random::RandomArrayGenerator rag(kSeed); + + auto int_scalars = ToScalars(rag.Int64(kScalarCount, 0, 1 << 20)); + + auto double_type = float64(); + CastOptions cast_options; + cast_options.to_type = double_type; + ASSERT_OK_AND_ASSIGN(auto cast_function, GetCastFunction(double_type)); + ASSERT_OK_AND_ASSIGN(auto cast_kernel, + cast_function->DispatchExact({int_scalars[0]->type})); + const auto& exec = static_cast(cast_kernel)->exec; + + ExecContext exec_context; + KernelContext kernel_context(&exec_context); + auto cast_state = + cast_kernel->init(&kernel_context, {cast_kernel, {double_type}, &cast_options}); + ABORT_NOT_OK(kernel_context.status()); + kernel_context.SetState(cast_state.get()); + + for (auto _ : state) { + Datum timestamp_scalar = MakeNullScalar(double_type); + for (Datum int_scalar : int_scalars) { + exec(&kernel_context, {{std::move(int_scalar)}, 1}, ×tamp_scalar); + ABORT_NOT_OK(kernel_context.status()); + } + benchmark::DoNotOptimize(timestamp_scalar); + } + + state.SetItemsProcessed(state.iterations() * kScalarCount); +} + +void BM_AddDispatch(benchmark::State& state) { // NOLINT non-const reference + ExecContext exec_context; + KernelContext kernel_context(&exec_context); + + for (auto _ : state) { + ASSERT_OK_AND_ASSIGN(auto add_function, GetFunctionRegistry()->GetFunction("add")); + ASSERT_OK_AND_ASSIGN(auto add_kernel, + checked_cast(*add_function) + .DispatchExact({int64(), int64()})); + benchmark::DoNotOptimize(add_kernel); + } + + state.SetItemsProcessed(state.iterations()); +} + +BENCHMARK(BM_CastDispatch)->MinTime(1.0); +BENCHMARK(BM_CastDispatchBaseline)->MinTime(1.0); +BENCHMARK(BM_AddDispatch)->MinTime(1.0); + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc index 576659d9331..b6f1815b89e 100644 --- a/cpp/src/arrow/compute/function_test.cc +++ b/cpp/src/arrow/compute/function_test.cc @@ -57,8 +57,8 @@ TEST(Arity, Basics) { } TEST(ScalarFunction, Basics) { - ScalarFunction func("scalar_test", Arity::Binary()); - ScalarFunction varargs_func("varargs_test", 
Arity::VarArgs(1)); + ScalarFunction func("scalar_test", Arity::Binary(), /*doc=*/nullptr); + ScalarFunction varargs_func("varargs_test", Arity::VarArgs(1), /*doc=*/nullptr); ASSERT_EQ("scalar_test", func.name()); ASSERT_EQ(2, func.arity().num_args); @@ -72,8 +72,8 @@ TEST(ScalarFunction, Basics) { } TEST(VectorFunction, Basics) { - VectorFunction func("vector_test", Arity::Binary()); - VectorFunction varargs_func("varargs_test", Arity::VarArgs(1)); + VectorFunction func("vector_test", Arity::Binary(), /*doc=*/nullptr); + VectorFunction varargs_func("varargs_test", Arity::VarArgs(1), /*doc=*/nullptr); ASSERT_EQ("vector_test", func.name()); ASSERT_EQ(2, func.arity().num_args); @@ -126,7 +126,7 @@ void CheckAddDispatch(FunctionType* func) { KernelType invalid_kernel({boolean()}, boolean(), ExecNYI); ASSERT_RAISES(Invalid, func->AddKernel(invalid_kernel)); - ASSERT_OK_AND_ASSIGN(const KernelType* kernel, func->DispatchExact({int32(), int32()})); + ASSERT_OK_AND_ASSIGN(const Kernel* kernel, func->DispatchExact({int32(), int32()})); KernelSignature expected_sig(in_types1, out_type1); ASSERT_TRUE(kernel->signature->Equals(expected_sig)); @@ -139,15 +139,15 @@ void CheckAddDispatch(FunctionType* func) { } TEST(ScalarVectorFunction, DispatchExact) { - ScalarFunction func1("scalar_test", Arity::Binary()); - VectorFunction func2("vector_test", Arity::Binary()); + ScalarFunction func1("scalar_test", Arity::Binary(), /*doc=*/nullptr); + VectorFunction func2("vector_test", Arity::Binary(), /*doc=*/nullptr); CheckAddDispatch(&func1); CheckAddDispatch(&func2); } TEST(ArrayFunction, VarArgs) { - ScalarFunction va_func("va_test", Arity::VarArgs(1)); + ScalarFunction va_func("va_test", Arity::VarArgs(1), /*doc=*/nullptr); std::vector va_args = {int8()}; @@ -164,7 +164,7 @@ TEST(ArrayFunction, VarArgs) { ASSERT_RAISES(Invalid, va_func.AddKernel(non_va_kernel)); std::vector args = {ValueDescr::Scalar(int8()), int8(), int8()}; - ASSERT_OK_AND_ASSIGN(const ScalarKernel* kernel, va_func.DispatchExact(args)); + ASSERT_OK_AND_ASSIGN(const Kernel* kernel, va_func.DispatchExact(args)); ASSERT_TRUE(kernel->signature->MatchesInputs(args)); // No dispatch possible because args incompatible @@ -173,7 +173,7 @@ TEST(ArrayFunction, VarArgs) { } TEST(ScalarAggregateFunction, Basics) { - ScalarAggregateFunction func("agg_test", Arity::Unary()); + ScalarAggregateFunction func("agg_test", Arity::Unary(), /*doc=*/nullptr); ASSERT_EQ("agg_test", func.name()); ASSERT_EQ(1, func.arity().num_args); @@ -190,7 +190,7 @@ void NoopMerge(KernelContext*, const KernelState&, KernelState*) {} void NoopFinalize(KernelContext*, Datum*) {} TEST(ScalarAggregateFunction, DispatchExact) { - ScalarAggregateFunction func("agg_test", Arity::Unary()); + ScalarAggregateFunction func("agg_test", Arity::Unary(), /*doc=*/nullptr); std::vector in_args = {ValueDescr::Array(int8())}; ScalarAggregateKernel kernel(std::move(in_args), int64(), NoopInit, NoopConsume, @@ -215,8 +215,7 @@ TEST(ScalarAggregateFunction, DispatchExact) { ASSERT_RAISES(Invalid, func.AddKernel(kernel)); std::vector dispatch_args = {ValueDescr::Array(int8())}; - ASSERT_OK_AND_ASSIGN(const ScalarAggregateKernel* selected_kernel, - func.DispatchExact(dispatch_args)); + ASSERT_OK_AND_ASSIGN(const Kernel* selected_kernel, func.DispatchExact(dispatch_args)); ASSERT_EQ(func.kernels()[0], selected_kernel); ASSERT_TRUE(selected_kernel->signature->MatchesInputs(dispatch_args)); diff --git a/cpp/src/arrow/compute/kernel.cc b/cpp/src/arrow/compute/kernel.cc index 1788eb72963..88b42716fa2 100644 
--- a/cpp/src/arrow/compute/kernel.cc +++ b/cpp/src/arrow/compute/kernel.cc @@ -281,6 +281,9 @@ std::string InputType::ToString() const { } ss << "["; switch (kind_) { + case InputType::ANY_TYPE: + ss << "any"; + break; case InputType::EXACT_TYPE: ss << type_->ToString(); break; @@ -303,6 +306,8 @@ bool InputType::Equals(const InputType& other) const { return false; } switch (kind_) { + case InputType::ANY_TYPE: + return true; case InputType::EXACT_TYPE: return type_->Equals(*other.type_); case InputType::USE_TYPE_MATCHER: diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index 3fb6947107e..67cb5df7908 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -664,7 +664,7 @@ struct VectorKernel : public ArrayKernel { using ScalarAggregateConsume = std::function; using ScalarAggregateMerge = - std::function; + std::function; // Finalize returns Datum to permit multiple return values using ScalarAggregateFinalize = std::function; diff --git a/cpp/src/arrow/compute/kernel_test.cc b/cpp/src/arrow/compute/kernel_test.cc index 2eb7fd11449..a5ef9d44e18 100644 --- a/cpp/src/arrow/compute/kernel_test.cc +++ b/cpp/src/arrow/compute/kernel_test.cc @@ -38,7 +38,7 @@ TEST(TypeMatcher, SameTypeId) { ASSERT_TRUE(matcher->Matches(*decimal(12, 2))); ASSERT_FALSE(matcher->Matches(*int8())); - ASSERT_EQ("Type::DECIMAL", matcher->ToString()); + ASSERT_EQ("Type::DECIMAL128", matcher->ToString()); ASSERT_TRUE(matcher->Equals(*matcher)); ASSERT_TRUE(matcher->Equals(*match::SameTypeId(Type::DECIMAL))); @@ -103,7 +103,7 @@ TEST(InputType, Constructors) { // Same type id constructor InputType ty2(Type::DECIMAL); ASSERT_EQ(InputType::USE_TYPE_MATCHER, ty2.kind()); - ASSERT_EQ("any[Type::DECIMAL]", ty2.ToString()); + ASSERT_EQ("any[Type::DECIMAL128]", ty2.ToString()); ASSERT_TRUE(ty2.type_matcher().Matches(*decimal(12, 2))); ASSERT_FALSE(ty2.type_matcher().Matches(*int16())); @@ -135,12 +135,21 @@ TEST(InputType, Constructors) { ASSERT_EQ("array[int8]", ty1_array.ToString()); ASSERT_EQ("scalar[int8]", ty1_scalar.ToString()); - ASSERT_EQ("any[Type::DECIMAL]", ty2.ToString()); - ASSERT_EQ("array[Type::DECIMAL]", ty2_array.ToString()); - ASSERT_EQ("scalar[Type::DECIMAL]", ty2_scalar.ToString()); + ASSERT_EQ("any[Type::DECIMAL128]", ty2.ToString()); + ASSERT_EQ("array[Type::DECIMAL128]", ty2_array.ToString()); + ASSERT_EQ("scalar[Type::DECIMAL128]", ty2_scalar.ToString()); InputType ty7(match::TimestampTypeUnit(TimeUnit::MICRO)); ASSERT_EQ("any[timestamp(us)]", ty7.ToString()); + + InputType ty8; + InputType ty9(ValueDescr::ANY); + InputType ty10(ValueDescr::ARRAY); + InputType ty11(ValueDescr::SCALAR); + ASSERT_EQ("any[any]", ty8.ToString()); + ASSERT_EQ("any[any]", ty9.ToString()); + ASSERT_EQ("array[any]", ty10.ToString()); + ASSERT_EQ("scalar[any]", ty11.ToString()); } TEST(InputType, Equals) { @@ -475,14 +484,14 @@ TEST(KernelSignature, ToString) { InputType(Type::DECIMAL, ValueDescr::ARRAY), InputType(utf8())}; KernelSignature sig(in_types, utf8()); - ASSERT_EQ("(scalar[int8], array[Type::DECIMAL], any[string]) -> string", + ASSERT_EQ("(scalar[int8], array[Type::DECIMAL128], any[string]) -> string", sig.ToString()); OutputType out_type([](KernelContext*, const std::vector& args) { return Status::Invalid("NYI"); }); KernelSignature sig2({int8(), InputType(Type::DECIMAL)}, out_type); - ASSERT_EQ("(any[int8], any[Type::DECIMAL]) -> computed", sig2.ToString()); + ASSERT_EQ("(any[int8], any[Type::DECIMAL128]) -> computed", sig2.ToString()); } TEST(KernelSignature, 
VarArgsToString) { diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index fc147e3a69b..0bca453ab28 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -34,6 +34,7 @@ add_arrow_compute_test(scalar_test add_arrow_benchmark(scalar_arithmetic_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_cast_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_compare_benchmark PREFIX "arrow-compute") +add_arrow_benchmark(scalar_set_lookup_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_string_benchmark PREFIX "arrow-compute") # ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index e2cc2a334d8..11c1e2b1730 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -15,8 +15,6 @@ // specific language governing permissions and limitations // under the License. -#include - #include "arrow/compute/api_aggregate.h" #include "arrow/compute/kernels/aggregate_basic_internal.h" #include "arrow/compute/kernels/aggregate_internal.h" @@ -26,20 +24,34 @@ namespace arrow { namespace compute { -namespace aggregate { + +namespace { void AggregateConsume(KernelContext* ctx, const ExecBatch& batch) { checked_cast(ctx->state())->Consume(ctx, batch); } -void AggregateMerge(KernelContext* ctx, const KernelState& src, KernelState* dst) { - checked_cast(dst)->MergeFrom(ctx, src); +void AggregateMerge(KernelContext* ctx, KernelState&& src, KernelState* dst) { + checked_cast(dst)->MergeFrom(ctx, std::move(src)); } void AggregateFinalize(KernelContext* ctx, Datum* out) { checked_cast(ctx->state())->Finalize(ctx, out); } +} // namespace + +void AddAggKernel(std::shared_ptr sig, KernelInit init, + ScalarAggregateFunction* func, SimdLevel::type simd_level) { + ScalarAggregateKernel kernel(std::move(sig), init, AggregateConsume, AggregateMerge, + AggregateFinalize); + // Set the simd level + kernel.simd_level = simd_level; + DCHECK_OK(func->AddKernel(kernel)); +} + +namespace aggregate { + // ---------------------------------------------------------------------- // Count implementation @@ -53,7 +65,7 @@ struct CountImpl : public ScalarAggregator { this->non_nulls += input.length - nulls; } - void MergeFrom(KernelContext*, const KernelState& src) override { + void MergeFrom(KernelContext*, KernelState&& src) override { const auto& other_state = checked_cast(src); this->non_nulls += other_state.non_nulls; this->nulls += other_state.nulls; @@ -132,224 +144,13 @@ std::unique_ptr MeanInit(KernelContext* ctx, const KernelInitArgs& // ---------------------------------------------------------------------- // MinMax implementation -template -struct MinMaxState {}; - -template -struct MinMaxState> { - using ThisType = MinMaxState; - using T = typename ArrowType::c_type; - - ThisType& operator+=(const ThisType& rhs) { - this->has_nulls |= rhs.has_nulls; - this->has_values |= rhs.has_values; - this->min = this->min && rhs.min; - this->max = this->max || rhs.max; - return *this; - } - - void MergeOne(T value) { - this->min = this->min && value; - this->max = this->max || value; - } - - T min = true; - T max = false; - bool has_nulls = false; - bool has_values = false; -}; - -template -struct MinMaxState> { - using ThisType = MinMaxState; - using T = typename ArrowType::c_type; - - ThisType& operator+=(const 
ThisType& rhs) { - this->has_nulls |= rhs.has_nulls; - this->has_values |= rhs.has_values; - this->min = std::min(this->min, rhs.min); - this->max = std::max(this->max, rhs.max); - return *this; - } - - void MergeOne(T value) { - this->min = std::min(this->min, value); - this->max = std::max(this->max, value); - } - - T min = std::numeric_limits::max(); - T max = std::numeric_limits::min(); - bool has_nulls = false; - bool has_values = false; -}; - -template -struct MinMaxState> { - using ThisType = MinMaxState; - using T = typename ArrowType::c_type; - - ThisType& operator+=(const ThisType& rhs) { - this->has_nulls |= rhs.has_nulls; - this->has_values |= rhs.has_values; - this->min = std::fmin(this->min, rhs.min); - this->max = std::fmax(this->max, rhs.max); - return *this; - } - - void MergeOne(T value) { - this->min = std::fmin(this->min, value); - this->max = std::fmax(this->max, value); - } - - T min = std::numeric_limits::infinity(); - T max = -std::numeric_limits::infinity(); - bool has_nulls = false; - bool has_values = false; -}; - -template -struct MinMaxImpl : public ScalarAggregator { - using ArrayType = typename TypeTraits::ArrayType; - using ThisType = MinMaxImpl; - using StateType = MinMaxState; - - MinMaxImpl(const std::shared_ptr& out_type, const MinMaxOptions& options) - : out_type(out_type), options(options) {} - - void Consume(KernelContext*, const ExecBatch& batch) override { - StateType local; - - ArrayType arr(batch[0].array()); - - const auto null_count = arr.null_count(); - local.has_nulls = null_count > 0; - local.has_values = (arr.length() - null_count) > 0; - - if (local.has_nulls && options.null_handling == MinMaxOptions::OUTPUT_NULL) { - this->state = local; - return; - } - - if (local.has_nulls) { - BitmapReader reader(arr.null_bitmap_data(), arr.offset(), arr.length()); - for (int64_t i = 0; i < arr.length(); i++) { - if (reader.IsSet()) { - local.MergeOne(arr.Value(i)); - } - reader.Next(); - } - } else { - for (int64_t i = 0; i < arr.length(); i++) { - local.MergeOne(arr.Value(i)); - } - } - this->state = local; - } - - void MergeFrom(KernelContext*, const KernelState& src) override { - const auto& other = checked_cast(src); - this->state += other.state; - } - - void Finalize(KernelContext*, Datum* out) override { - using ScalarType = typename TypeTraits::ScalarType; - - std::vector> values; - if (!state.has_values || - (state.has_nulls && options.null_handling == MinMaxOptions::OUTPUT_NULL)) { - // (null, null) - values = {std::make_shared(), std::make_shared()}; - } else { - values = {std::make_shared(state.min), - std::make_shared(state.max)}; - } - out->value = std::make_shared(std::move(values), this->out_type); - } - - std::shared_ptr out_type; - MinMaxOptions options; - MinMaxState state; -}; - -struct BooleanMinMaxImpl : public MinMaxImpl { - using MinMaxImpl::MinMaxImpl; - - void Consume(KernelContext*, const ExecBatch& batch) override { - StateType local; - ArrayType arr(batch[0].array()); - - const auto arr_length = arr.length(); - const auto null_count = arr.null_count(); - const auto valid_count = arr_length - null_count; - - local.has_nulls = null_count > 0; - local.has_values = valid_count > 0; - if (local.has_nulls && options.null_handling == MinMaxOptions::OUTPUT_NULL) { - this->state = local; - return; - } - - const auto true_count = arr.true_count(); - const auto false_count = valid_count - true_count; - local.max = true_count > 0; - local.min = false_count == 0; - - this->state = local; - } -}; - -struct MinMaxInitState { - 
std::unique_ptr state; - KernelContext* ctx; - const DataType& in_type; - const std::shared_ptr& out_type; - const MinMaxOptions& options; - - MinMaxInitState(KernelContext* ctx, const DataType& in_type, - const std::shared_ptr& out_type, const MinMaxOptions& options) - : ctx(ctx), in_type(in_type), out_type(out_type), options(options) {} - - Status Visit(const DataType&) { - return Status::NotImplemented("No min/max implemented"); - } - - Status Visit(const HalfFloatType&) { - return Status::NotImplemented("No sum implemented"); - } - - Status Visit(const BooleanType&) { - state.reset(new BooleanMinMaxImpl(out_type, options)); - return Status::OK(); - } - - template - enable_if_number Visit(const Type&) { - state.reset(new MinMaxImpl(out_type, options)); - return Status::OK(); - } - - std::unique_ptr Create() { - ctx->SetStatus(VisitTypeInline(in_type, this)); - return std::move(state); - } -}; - std::unique_ptr MinMaxInit(KernelContext* ctx, const KernelInitArgs& args) { - MinMaxInitState visitor(ctx, *args.inputs[0].type, - args.kernel->signature->out_type().type(), - static_cast(*args.options)); + MinMaxInitState visitor( + ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), + static_cast(*args.options)); return visitor.Create(); } -void AddAggKernel(std::shared_ptr sig, KernelInit init, - ScalarAggregateFunction* func, SimdLevel::type simd_level) { - ScalarAggregateKernel kernel(std::move(sig), init, AggregateConsume, AggregateMerge, - AggregateFinalize); - // Set the simd level - kernel.simd_level = simd_level; - DCHECK_OK(func->AddKernel(kernel)); -} - void AddBasicAggKernels(KernelInit init, const std::vector>& types, std::shared_ptr out_ty, ScalarAggregateFunction* func, @@ -363,8 +164,7 @@ void AddBasicAggKernels(KernelInit init, void AddMinMaxKernels(KernelInit init, const std::vector>& types, - ScalarAggregateFunction* func, - SimdLevel::type simd_level = SimdLevel::NONE) { + ScalarAggregateFunction* func, SimdLevel::type simd_level) { for (const auto& ty : types) { // array[T] -> scalar[struct] auto out_ty = struct_({field("min", ty), field("max", ty)}); @@ -376,18 +176,42 @@ void AddMinMaxKernels(KernelInit init, } // namespace aggregate namespace internal { +namespace { + +const FunctionDoc count_doc{"Count the number of null / non-null values", + ("By default, non-null values are counted.\n" + "This can be changed through CountOptions."), + {"array"}, + "CountOptions"}; + +const FunctionDoc sum_doc{ + "Sum values of a numeric array", ("Null values are ignored."), {"array"}}; + +const FunctionDoc mean_doc{"Compute the mean of a numeric array", + ("Null values are ignored. 
The result is always computed\n" + "as a double, regardless of the input types"), + {"array"}}; + +const FunctionDoc min_max_doc{"Compute the minimum and maximum values of a numeric array", + ("Null values are ignored by default.\n" + "This can be changed through MinMaxOptions."), + {"array"}, + "MinMaxOptions"}; + +} // namespace + void RegisterScalarAggregateBasic(FunctionRegistry* registry) { static auto default_count_options = CountOptions::Defaults(); - auto func = std::make_shared("count", Arity::Unary(), - &default_count_options); + auto func = std::make_shared( + "count", Arity::Unary(), &count_doc, &default_count_options); // Takes any array input, outputs int64 scalar InputType any_array(ValueDescr::ARRAY); - aggregate::AddAggKernel(KernelSignature::Make({any_array}, ValueDescr::Scalar(int64())), - aggregate::CountInit, func.get()); + AddAggKernel(KernelSignature::Make({any_array}, ValueDescr::Scalar(int64())), + aggregate::CountInit, func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); - func = std::make_shared("sum", Arity::Unary()); + func = std::make_shared("sum", Arity::Unary(), &sum_doc); aggregate::AddBasicAggKernels(aggregate::SumInit, {boolean()}, int64(), func.get()); aggregate::AddBasicAggKernels(aggregate::SumInit, SignedIntTypes(), int64(), func.get()); @@ -409,7 +233,7 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { #endif DCHECK_OK(registry->AddFunction(std::move(func))); - func = std::make_shared("mean", Arity::Unary()); + func = std::make_shared("mean", Arity::Unary(), &mean_doc); aggregate::AddBasicAggKernels(aggregate::MeanInit, {boolean()}, float64(), func.get()); aggregate::AddBasicAggKernels(aggregate::MeanInit, NumericTypes(), float64(), func.get()); @@ -428,12 +252,22 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { static auto default_minmax_options = MinMaxOptions::Defaults(); func = std::make_shared("min_max", Arity::Unary(), - &default_minmax_options); + &min_max_doc, &default_minmax_options); aggregate::AddMinMaxKernels(aggregate::MinMaxInit, {boolean()}, func.get()); aggregate::AddMinMaxKernels(aggregate::MinMaxInit, NumericTypes(), func.get()); - DCHECK_OK(registry->AddFunction(std::move(func))); + // Add the SIMD variants for min max +#if defined(ARROW_HAVE_RUNTIME_AVX2) + if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX2)) { + aggregate::AddMinMaxAvx2AggKernels(func.get()); + } +#endif +#if defined(ARROW_HAVE_RUNTIME_AVX512) + if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX512)) { + aggregate::AddMinMaxAvx512AggKernels(func.get()); + } +#endif - DCHECK_OK(registry->AddFunction(aggregate::AddModeAggKernels())); + DCHECK_OK(registry->AddFunction(std::move(func))); } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/aggregate_sum_avx2.cc b/cpp/src/arrow/compute/kernels/aggregate_basic_avx2.cc similarity index 80% rename from cpp/src/arrow/compute/kernels/aggregate_sum_avx2.cc rename to cpp/src/arrow/compute/kernels/aggregate_basic_avx2.cc index 2811c4cd865..e0c1118c714 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_sum_avx2.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_avx2.cc @@ -67,6 +67,17 @@ std::unique_ptr MeanInitAvx2(KernelContext* ctx, return visitor.Create(); } +// ---------------------------------------------------------------------- +// MinMax implementation + +std::unique_ptr MinMaxInitAvx2(KernelContext* ctx, + const KernelInitArgs& args) { + MinMaxInitState visitor( + ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), + 
static_cast(*args.options)); + return visitor.Create(); +} + void AddSumAvx2AggKernels(ScalarAggregateFunction* func) { AddBasicAggKernels(SumInitAvx2, internal::SignedIntTypes(), int64(), func, SimdLevel::AVX2); @@ -81,6 +92,12 @@ void AddMeanAvx2AggKernels(ScalarAggregateFunction* func) { SimdLevel::AVX2); } +void AddMinMaxAvx2AggKernels(ScalarAggregateFunction* func) { + // Enable int types for AVX2 variants. + // No auto-vectorization for float/double, since they use fmin/fmax, which have NaN handling. + AddMinMaxKernels(MinMaxInitAvx2, internal::IntTypes(), func, SimdLevel::AVX2); +} + } // namespace aggregate } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate_sum_avx512.cc b/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc similarity index 80% rename from cpp/src/arrow/compute/kernels/aggregate_sum_avx512.cc rename to cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc index 00408027e1f..c2c748d3af7 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_sum_avx512.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc @@ -68,6 +68,17 @@ std::unique_ptr MeanInitAvx512(KernelContext* ctx, return visitor.Create(); } +// ---------------------------------------------------------------------- +// MinMax implementation + +std::unique_ptr MinMaxInitAvx512(KernelContext* ctx, + const KernelInitArgs& args) { + MinMaxInitState visitor( + ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), + static_cast(*args.options)); + return visitor.Create(); +} + void AddSumAvx512AggKernels(ScalarAggregateFunction* func) { AddBasicAggKernels(SumInitAvx512, internal::SignedIntTypes(), int64(), func, SimdLevel::AVX512); @@ -82,6 +93,12 @@ void AddMeanAvx512AggKernels(ScalarAggregateFunction* func) { SimdLevel::AVX512); } +void AddMinMaxAvx512AggKernels(ScalarAggregateFunction* func) { + // Enable 32/64-bit int types for the AVX512 variants; there is no advantage for 8/16-bit ints.
+ AddMinMaxKernels(MinMaxInitAvx512, {int32(), uint32(), int64(), uint64()}, func, + SimdLevel::AVX512); +} + } // namespace aggregate } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 29db97381d6..733e6d1d0a6 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -17,6 +17,9 @@ #pragma once +#include + +#include "arrow/compute/api_aggregate.h" #include "arrow/compute/kernels/aggregate_internal.h" #include "arrow/compute/kernels/common.h" #include "arrow/util/align_util.h" @@ -26,37 +29,32 @@ namespace arrow { namespace compute { namespace aggregate { -struct ScalarAggregator : public KernelState { - virtual void Consume(KernelContext* ctx, const ExecBatch& batch) = 0; - virtual void MergeFrom(KernelContext* ctx, const KernelState& src) = 0; - virtual void Finalize(KernelContext* ctx, Datum* out) = 0; -}; - -void AddAggKernel(std::shared_ptr sig, KernelInit init, - ScalarAggregateFunction* func, - SimdLevel::type simd_level = SimdLevel::NONE); - void AddBasicAggKernels(KernelInit init, const std::vector>& types, std::shared_ptr out_ty, ScalarAggregateFunction* func, SimdLevel::type simd_level = SimdLevel::NONE); +void AddMinMaxKernels(KernelInit init, + const std::vector>& types, + ScalarAggregateFunction* func, + SimdLevel::type simd_level = SimdLevel::NONE); + // SIMD variants for kernels void AddSumAvx2AggKernels(ScalarAggregateFunction* func); void AddMeanAvx2AggKernels(ScalarAggregateFunction* func); +void AddMinMaxAvx2AggKernels(ScalarAggregateFunction* func); void AddSumAvx512AggKernels(ScalarAggregateFunction* func); void AddMeanAvx512AggKernels(ScalarAggregateFunction* func); - -std::shared_ptr AddModeAggKernels(); +void AddMinMaxAvx512AggKernels(ScalarAggregateFunction* func); // ---------------------------------------------------------------------- // Sum implementation -template +template struct SumState { using SumType = typename FindAccumulatorType::Type; - using ThisType = SumState; + using ThisType = SumState; using T = typename TypeTraits::CType; using ArrayType = typename TypeTraits::ArrayType; @@ -217,10 +215,10 @@ struct SumState { } }; -template -struct SumState { +template +struct SumState { using SumType = typename FindAccumulatorType::Type; - using ThisType = SumState; + using ThisType = SumState; ThisType& operator+=(const ThisType& rhs) { this->count += rhs.count; @@ -239,10 +237,10 @@ struct SumState { typename SumType::c_type sum = 0; }; -template +template struct SumImpl : public ScalarAggregator { using ArrayType = typename TypeTraits::ArrayType; - using ThisType = SumImpl; + using ThisType = SumImpl; using SumType = typename FindAccumulatorType::Type; using OutputType = typename TypeTraits::ScalarType; @@ -250,7 +248,7 @@ struct SumImpl : public ScalarAggregator { this->state.Consume(ArrayType(batch[0].array())); } - void MergeFrom(KernelContext*, const KernelState& src) override { + void MergeFrom(KernelContext*, KernelState&& src) override { const auto& other = checked_cast(src); this->state += other.state; } @@ -263,11 +261,11 @@ struct SumImpl : public ScalarAggregator { } } - SumState state; + SumState state; }; -template -struct MeanImpl : public SumImpl { +template +struct MeanImpl : public SumImpl { void Finalize(KernelContext*, Datum* out) override { const bool is_valid = this->state.count > 0; const double divisor = static_cast(is_valid ? 
this->state.count : 1UL); @@ -312,6 +310,268 @@ struct SumLikeInit { } }; +// ---------------------------------------------------------------------- +// MinMax implementation + +template +struct MinMaxState {}; + +template +struct MinMaxState> { + using ThisType = MinMaxState; + using T = typename ArrowType::c_type; + + ThisType& operator+=(const ThisType& rhs) { + this->has_nulls |= rhs.has_nulls; + this->has_values |= rhs.has_values; + this->min = this->min && rhs.min; + this->max = this->max || rhs.max; + return *this; + } + + void MergeOne(T value) { + this->min = this->min && value; + this->max = this->max || value; + } + + T min = true; + T max = false; + bool has_nulls = false; + bool has_values = false; +}; + +template +struct MinMaxState> { + using ThisType = MinMaxState; + using T = typename ArrowType::c_type; + + ThisType& operator+=(const ThisType& rhs) { + this->has_nulls |= rhs.has_nulls; + this->has_values |= rhs.has_values; + this->min = std::min(this->min, rhs.min); + this->max = std::max(this->max, rhs.max); + return *this; + } + + void MergeOne(T value) { + this->min = std::min(this->min, value); + this->max = std::max(this->max, value); + } + + T min = std::numeric_limits::max(); + T max = std::numeric_limits::min(); + bool has_nulls = false; + bool has_values = false; +}; + +template +struct MinMaxState> { + using ThisType = MinMaxState; + using T = typename ArrowType::c_type; + + ThisType& operator+=(const ThisType& rhs) { + this->has_nulls |= rhs.has_nulls; + this->has_values |= rhs.has_values; + this->min = std::fmin(this->min, rhs.min); + this->max = std::fmax(this->max, rhs.max); + return *this; + } + + void MergeOne(T value) { + this->min = std::fmin(this->min, value); + this->max = std::fmax(this->max, value); + } + + T min = std::numeric_limits::infinity(); + T max = -std::numeric_limits::infinity(); + bool has_nulls = false; + bool has_values = false; +}; + +template +struct MinMaxImpl : public ScalarAggregator { + using ArrayType = typename TypeTraits::ArrayType; + using ThisType = MinMaxImpl; + using StateType = MinMaxState; + + MinMaxImpl(const std::shared_ptr& out_type, const MinMaxOptions& options) + : out_type(out_type), options(options) {} + + void Consume(KernelContext*, const ExecBatch& batch) override { + StateType local; + + ArrayType arr(batch[0].array()); + + const auto null_count = arr.null_count(); + local.has_nulls = null_count > 0; + local.has_values = (arr.length() - null_count) > 0; + + if (local.has_nulls && options.null_handling == MinMaxOptions::EMIT_NULL) { + this->state = local; + return; + } + + if (local.has_nulls) { + local += ConsumeWithNulls(arr); + } else { // All true values + for (int64_t i = 0; i < arr.length(); i++) { + local.MergeOne(arr.Value(i)); + } + } + this->state = local; + } + + void MergeFrom(KernelContext*, KernelState&& src) override { + const auto& other = checked_cast(src); + this->state += other.state; + } + + void Finalize(KernelContext*, Datum* out) override { + using ScalarType = typename TypeTraits::ScalarType; + + std::vector> values; + if (!state.has_values || + (state.has_nulls && options.null_handling == MinMaxOptions::EMIT_NULL)) { + // (null, null) + values = {std::make_shared(), std::make_shared()}; + } else { + values = {std::make_shared(state.min), + std::make_shared(state.max)}; + } + out->value = std::make_shared(std::move(values), this->out_type); + } + + std::shared_ptr out_type; + MinMaxOptions options; + MinMaxState state; + + private: + StateType ConsumeWithNulls(const ArrayType& arr) const { 
+ StateType local; + const int64_t length = arr.length(); + int64_t offset = arr.offset(); + const uint8_t* bitmap = arr.null_bitmap_data(); + int64_t idx = 0; + + const auto p = arrow::internal::BitmapWordAlign<1>(bitmap, offset, length); + // First handle the leading bits + const int64_t leading_bits = p.leading_bits; + while (idx < leading_bits) { + if (BitUtil::GetBit(bitmap, offset)) { + local.MergeOne(arr.Value(idx)); + } + idx++; + offset++; + } + + // The aligned parts scanned with BitBlockCounter + arrow::internal::BitBlockCounter data_counter(bitmap, offset, length - leading_bits); + auto current_block = data_counter.NextWord(); + while (idx < length) { + if (current_block.AllSet()) { // All true values + int run_length = 0; + // Scan forward until a block that has some false values (or the end) + while (current_block.length > 0 && current_block.AllSet()) { + run_length += current_block.length; + current_block = data_counter.NextWord(); + } + for (int64_t i = 0; i < run_length; i++) { + local.MergeOne(arr.Value(idx + i)); + } + idx += run_length; + offset += run_length; + // The current_block already computed, advance to next loop + continue; + } else if (!current_block.NoneSet()) { // Some values are null + BitmapReader reader(arr.null_bitmap_data(), offset, current_block.length); + for (int64_t i = 0; i < current_block.length; i++) { + if (reader.IsSet()) { + local.MergeOne(arr.Value(idx + i)); + } + reader.Next(); + } + + idx += current_block.length; + offset += current_block.length; + } else { // All null values + idx += current_block.length; + offset += current_block.length; + } + current_block = data_counter.NextWord(); + } + + return local; + } +}; + +template +struct BooleanMinMaxImpl : public MinMaxImpl { + using StateType = MinMaxState; + using ArrayType = typename TypeTraits::ArrayType; + using MinMaxImpl::MinMaxImpl; + using MinMaxImpl::options; + + void Consume(KernelContext*, const ExecBatch& batch) override { + StateType local; + ArrayType arr(batch[0].array()); + + const auto arr_length = arr.length(); + const auto null_count = arr.null_count(); + const auto valid_count = arr_length - null_count; + + local.has_nulls = null_count > 0; + local.has_values = valid_count > 0; + if (local.has_nulls && options.null_handling == MinMaxOptions::EMIT_NULL) { + this->state = local; + return; + } + + const auto true_count = arr.true_count(); + const auto false_count = valid_count - true_count; + local.max = true_count > 0; + local.min = false_count == 0; + + this->state = local; + } +}; + +template +struct MinMaxInitState { + std::unique_ptr state; + KernelContext* ctx; + const DataType& in_type; + const std::shared_ptr& out_type; + const MinMaxOptions& options; + + MinMaxInitState(KernelContext* ctx, const DataType& in_type, + const std::shared_ptr& out_type, const MinMaxOptions& options) + : ctx(ctx), in_type(in_type), out_type(out_type), options(options) {} + + Status Visit(const DataType&) { + return Status::NotImplemented("No min/max implemented"); + } + + Status Visit(const HalfFloatType&) { + return Status::NotImplemented("No min/max implemented"); + } + + Status Visit(const BooleanType&) { + state.reset(new BooleanMinMaxImpl(out_type, options)); + return Status::OK(); + } + + template + enable_if_number Visit(const Type&) { + state.reset(new MinMaxImpl(out_type, options)); + return Status::OK(); + } + + std::unique_ptr Create() { + ctx->SetStatus(VisitTypeInline(in_type, this)); + return std::move(state); + } +}; + } // namespace aggregate } // namespace compute } // 
namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc b/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc index 882037f2d5d..5b95d7b526a 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc @@ -19,7 +19,6 @@ #include -#include "arrow/builder.h" #include "arrow/compute/api.h" #include "arrow/memory_pool.h" #include "arrow/testing/gtest_util.h" @@ -301,6 +300,10 @@ BENCHMARK_TEMPLATE(ReferenceSum, SumBitmapVectorizeUnroll) ->Apply(BenchmarkSetArgs); #endif // ARROW_WITH_BENCHMARKS_REFERENCE +// +// Sum +// + template static void SumKernel(benchmark::State& state) { using CType = typename TypeTraits::CType; @@ -330,6 +333,10 @@ SUM_KERNEL_BENCHMARK(SumKernelInt16, Int16Type); SUM_KERNEL_BENCHMARK(SumKernelInt32, Int32Type); SUM_KERNEL_BENCHMARK(SumKernelInt64, Int64Type); +// +// Mode +// + template void ModeKernelBench(benchmark::State& state) { using CType = typename TypeTraits::CType; @@ -369,6 +376,10 @@ MODE_KERNEL_BENCHMARK(ModeKernelInt16, Int16Type); MODE_KERNEL_BENCHMARK(ModeKernelInt32, Int32Type); MODE_KERNEL_BENCHMARK(ModeKernelInt64, Int64Type); +// +// MinMax +// + template static void MinMaxKernelBench(benchmark::State& state) { using CType = typename TypeTraits::CType; @@ -398,5 +409,53 @@ MINMAX_KERNEL_BENCHMARK(MinMaxKernelInt16, Int16Type); MINMAX_KERNEL_BENCHMARK(MinMaxKernelInt32, Int32Type); MINMAX_KERNEL_BENCHMARK(MinMaxKernelInt64, Int64Type); +// +// Count +// + +static void CountKernelBenchInt64(benchmark::State& state) { + RegressionArgs args(state); + const int64_t array_size = args.size / sizeof(int64_t); + auto rand = random::RandomArrayGenerator(1923); + auto array = rand.Numeric(array_size, -100, 100, args.null_proportion); + + for (auto _ : state) { + ABORT_NOT_OK(Count(array->Slice(1, array_size)).status()); + } +} +BENCHMARK(CountKernelBenchInt64)->Args({1 * 1024 * 1024, 2}); // 1M with 50% null. 
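These kernel benchmarks share a common shape: the array length is derived from a byte budget so results stay comparable across element widths, a null-proportion parameter is swept alongside it (here 2, i.e. 50% nulls per the comment), and throughput is reported via SetItemsProcessed(). Slicing the input by one element, as the Count benchmark above does, presumably also forces a non-zero offset to exercise the unaligned-bitmap path. A self-contained google-benchmark sketch of the same pattern, with a trivial stand-in function instead of an Arrow kernel:

    #include <cstdint>
    #include <numeric>
    #include <vector>

    #include "benchmark/benchmark.h"

    // Trivial stand-in for a compute kernel (illustrative only).
    static int64_t NaiveSum(const std::vector<int64_t>& values) {
      return std::accumulate(values.begin(), values.end(), int64_t{0});
    }

    static void BM_NaiveSumKernel(benchmark::State& state) {
      // Derive the element count from a byte budget, as the kernels above do.
      const int64_t num_bytes = state.range(0);
      const int64_t array_size = num_bytes / static_cast<int64_t>(sizeof(int64_t));
      std::vector<int64_t> values(static_cast<size_t>(array_size), 1);

      for (auto _ : state) {
        benchmark::DoNotOptimize(NaiveSum(values));
      }
      // Report throughput as values processed, not iterations.
      state.SetItemsProcessed(state.iterations() * array_size);
    }
    BENCHMARK(BM_NaiveSumKernel)->Arg(1 * 1024 * 1024);  // a 1 MiB input

    BENCHMARK_MAIN();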
+ +// +// Variance +// + +template +void VarianceKernelBench(benchmark::State& state) { + using CType = typename TypeTraits::CType; + + VarianceOptions options; + RegressionArgs args(state); + const int64_t array_size = args.size / sizeof(CType); + auto rand = random::RandomArrayGenerator(1925); + auto array = rand.Numeric(array_size, -100000, 100000, args.null_proportion); + + for (auto _ : state) { + ABORT_NOT_OK(Variance(array, options).status()); + } +} + +static void VarianceKernelBenchArgs(benchmark::internal::Benchmark* bench) { + BenchmarkSetArgsWithSizes(bench, {1 * 1024 * 1024}); +} + +#define VARIANCE_KERNEL_BENCHMARK(FuncName, Type) \ + static void FuncName(benchmark::State& state) { VarianceKernelBench(state); } \ + BENCHMARK(FuncName)->Apply(VarianceKernelBenchArgs) + +VARIANCE_KERNEL_BENCHMARK(VarianceKernelInt32, Int32Type); +VARIANCE_KERNEL_BENCHMARK(VarianceKernelInt64, Int64Type); +VARIANCE_KERNEL_BENCHMARK(VarianceKernelFloat, FloatType); +VARIANCE_KERNEL_BENCHMARK(VarianceKernelDouble, DoubleType); + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate_internal.h b/cpp/src/arrow/compute/kernels/aggregate_internal.h index 5f2f50c0b06..cb67794d942 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_internal.h @@ -47,5 +47,15 @@ struct FindAccumulatorType> { using Type = DoubleType; }; +struct ScalarAggregator : public KernelState { + virtual void Consume(KernelContext* ctx, const ExecBatch& batch) = 0; + virtual void MergeFrom(KernelContext* ctx, KernelState&& src) = 0; + virtual void Finalize(KernelContext* ctx, Datum* out) = 0; +}; + +void AddAggKernel(std::shared_ptr sig, KernelInit init, + ScalarAggregateFunction* func, + SimdLevel::type simd_level = SimdLevel::NONE); + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate_mode.cc b/cpp/src/arrow/compute/kernels/aggregate_mode.cc index 7905c89f83f..6544df549e6 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_mode.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_mode.cc @@ -18,41 +18,155 @@ #include #include -#include "arrow/compute/kernels/aggregate_basic_internal.h" +#include "arrow/compute/api_aggregate.h" +#include "arrow/compute/kernels/aggregate_internal.h" +#include "arrow/compute/kernels/common.h" namespace arrow { namespace compute { -namespace aggregate { +namespace internal { namespace { -template -struct ModeState { - using ThisType = ModeState; - using T = typename ArrowType::c_type; +// {value:count} map +template +using CounterMap = std::unordered_map; + +// map based counter for floating points +template +enable_if_t::value, CounterMap> CountValuesByMap( + const ArrayType& array, int64_t& nan_count) { + CounterMap value_counts_map; + + nan_count = 0; + if (array.length() > array.null_count()) { + VisitArrayDataInline( + *array.data(), + [&](CType value) { + if (std::isnan(value)) { + ++nan_count; + } else { + ++value_counts_map[value]; + } + }, + []() {}); + } - void MergeFrom(const ThisType& state) { - for (const auto& value_count : state.value_counts) { - auto value = value_count.first; - auto count = value_count.second; - this->value_counts[value] += count; + return value_counts_map; +} + +// map base counter for non floating points +template +enable_if_t::value, CounterMap> CountValuesByMap( + const ArrayType& array) { + CounterMap value_counts_map; + + if (array.length() > array.null_count()) { + VisitArrayDataInline( + *array.data(), [&](CType value) 
{ ++value_counts_map[value]; }, []() {}); + } + + return value_counts_map; +} + +// vector based counter for bool/int8 or integers with small value range +template +CounterMap CountValuesByVector(const ArrayType& array, CType min, CType max) { + const int range = static_cast(max - min); + DCHECK(range >= 0 && range < 64 * 1024 * 1024); + + std::vector value_counts_vector(range + 1); + if (array.length() > array.null_count()) { + VisitArrayDataInline( + *array.data(), [&](CType value) { ++value_counts_vector[value - min]; }, []() {}); + } + + // Transfer value counts to a map to be consistent with other chunks + CounterMap value_counts_map(range + 1); + for (int i = 0; i <= range; ++i) { + CType value = static_cast(i + min); + int64_t count = value_counts_vector[i]; + if (count) { + value_counts_map[value] = count; } } - template - enable_if_t::value> MergeOne(T value) { - ++this->value_counts[value]; + return value_counts_map; +} + +// map or vector based counter for int16/32/64 per value range +template +CounterMap CountValuesByMapOrVector(const ArrayType& array) { + // see https://issues.apache.org/jira/browse/ARROW-9873 + static constexpr int kMinArraySize = 8192 / sizeof(CType); + static constexpr int kMaxValueRange = 16384; + + if ((array.length() - array.null_count()) >= kMinArraySize) { + CType min = std::numeric_limits::max(); + CType max = std::numeric_limits::min(); + + VisitArrayDataInline( + *array.data(), + [&](CType value) { + min = std::min(min, value); + max = std::max(max, value); + }, + []() {}); + + if (static_cast(max) - static_cast(min) <= kMaxValueRange) { + return CountValuesByVector(array, min, max); + } } + return CountValuesByMap(array); +} - template - enable_if_t::value> MergeOne(T value) { - if (!std::isnan(value)) { - ++this->value_counts[value]; +// bool, int8 +template +enable_if_t::value && sizeof(CType) == 1, CounterMap> +CountValues(const ArrayType& array, int64_t& nan_count) { + using Limits = std::numeric_limits; + nan_count = 0; + return CountValuesByVector(array, Limits::min(), Limits::max()); +} + +// int16/32/64 +template +enable_if_t::value && (sizeof(CType) > 1), CounterMap> +CountValues(const ArrayType& array, int64_t& nan_count) { + nan_count = 0; + return CountValuesByMapOrVector(array); +} + +// float/double +template +enable_if_t<(std::is_floating_point::value), CounterMap> // NOLINT format +CountValues(const ArrayType& array, int64_t& nan_count) { + nan_count = 0; + return CountValuesByMap(array, nan_count); +} + +template +struct ModeState { + using ThisType = ModeState; + using CType = typename ArrowType::c_type; + + void MergeFrom(ThisType&& state) { + if (this->value_counts.empty()) { + this->value_counts = std::move(state.value_counts); + } else { + for (const auto& value_count : state.value_counts) { + auto value = value_count.first; + auto count = value_count.second; + this->value_counts[value] += count; + } + } + if (is_floating_type::value) { + this->nan_count += state.nan_count; } } - std::pair Finalize() { - T mode = std::numeric_limits::min(); + std::pair Finalize() { + CType mode = std::numeric_limits::min(); int64_t count = 0; for (const auto& value_count : this->value_counts) { @@ -63,10 +177,15 @@ struct ModeState { mode = this_value; } } + if (is_floating_type::value && this->nan_count > count) { + count = this->nan_count; + mode = static_cast(NAN); + } return std::make_pair(mode, count); } - std::unordered_map value_counts{}; + int64_t nan_count = 0; // only make sense to floating types + CounterMap value_counts; }; 
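The mode kernel above counts values adaptively (see ARROW-9873): small arrays or wide value ranges go through a hash map, while large arrays whose observed min/max span at most 16384 distinct values use a dense vector indexed by value - min, with NaNs tallied separately for floating point. A self-contained sketch of that hybrid strategy, with the threshold inlined as an assumption:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <unordered_map>
    #include <vector>

    using CounterMap = std::unordered_map<int64_t, int64_t>;

    // Count occurrences with a dense vector when the observed range is
    // narrow, otherwise with a hash map (in the spirit of
    // CountValuesByMapOrVector above).
    CounterMap CountValues(const std::vector<int64_t>& values) {
      constexpr int64_t kMaxValueRange = 16384;  // same threshold as the kernel

      CounterMap counts;
      if (values.empty()) return counts;

      const auto minmax = std::minmax_element(values.begin(), values.end());
      const int64_t min = *minmax.first;
      const int64_t max = *minmax.second;

      if (max - min <= kMaxValueRange) {
        // Dense path: O(1) increments, cache-friendly for narrow ranges.
        std::vector<int64_t> dense(static_cast<size_t>(max - min + 1), 0);
        for (int64_t v : values) ++dense[static_cast<size_t>(v - min)];
        for (size_t i = 0; i < dense.size(); ++i) {
          if (dense[i] != 0) counts[min + static_cast<int64_t>(i)] = dense[i];
        }
      } else {
        // Sparse path: the hash map handles arbitrary ranges.
        for (int64_t v : values) ++counts[v];
      }
      return counts;
    }

    int main() {
      const CounterMap counts = CountValues({3, 1, 3, 2, 3});
      std::cout << "value 3 occurs " << counts.at(3) << " times\n";  // prints 3
      return 0;
    }

Transferring the dense counts back into a map at the end keeps per-chunk results mergeable regardless of which path a given chunk took, matching the comment in the kernel above.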
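Separately, note that MergeFrom now takes its source state by rvalue reference (KernelState&&) throughout this diff, so a merge into an empty target can steal the source's containers rather than copy them entry by entry, exactly as ModeState::MergeFrom does above. A standalone sketch of the pattern, with illustrative names:

    #include <cassert>
    #include <cstdint>
    #include <unordered_map>
    #include <utility>

    struct CountState {
      std::unordered_map<int64_t, int64_t> value_counts;

      void MergeFrom(CountState&& other) {
        if (value_counts.empty()) {
          // Adopt the source map wholesale instead of rehashing every entry.
          value_counts = std::move(other.value_counts);
        } else {
          for (const auto& entry : other.value_counts) {
            value_counts[entry.first] += entry.second;
          }
        }
      }
    };

    int main() {
      CountState a, b, c;
      b.value_counts = {{7, 3}, {8, 1}};
      a.MergeFrom(std::move(b));      // empty target: steals b's map
      assert(a.value_counts[7] == 3);
      c.value_counts = {{7, 2}};
      a.MergeFrom(std::move(c));      // non-empty target: adds counts
      assert(a.value_counts[7] == 5);
      return 0;
    }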
template @@ -77,28 +196,13 @@ struct ModeImpl : public ScalarAggregator { explicit ModeImpl(const std::shared_ptr& out_type) : out_type(out_type) {} void Consume(KernelContext*, const ExecBatch& batch) override { - ModeState local_state; - ArrayType arr(batch[0].array()); - - if (arr.null_count() > 0) { - BitmapReader reader(arr.null_bitmap_data(), arr.offset(), arr.length()); - for (int64_t i = 0; i < arr.length(); i++) { - if (reader.IsSet()) { - local_state.MergeOne(arr.Value(i)); - } - reader.Next(); - } - } else { - for (int64_t i = 0; i < arr.length(); i++) { - local_state.MergeOne(arr.Value(i)); - } - } - this->state = std::move(local_state); + ArrayType array(batch[0].array()); + this->state.value_counts = CountValues(array, this->state.nan_count); } - void MergeFrom(KernelContext*, const KernelState& src) override { - const auto& other = checked_cast(src); - this->state.MergeFrom(other.state); + void MergeFrom(KernelContext*, KernelState&& src) override { + auto& other = checked_cast(src); + this->state.MergeFrom(std::move(other.state)); } void Finalize(KernelContext*, Datum* out) override { @@ -106,12 +210,13 @@ struct ModeImpl : public ScalarAggregator { using CountType = typename TypeTraits::ScalarType; std::vector> values; - if (this->state.value_counts.empty()) { + auto mode_count = this->state.Finalize(); + auto mode = mode_count.first; + auto count = mode_count.second; + if (count == 0) { values = {std::make_shared(), std::make_shared()}; } else { - auto mode_count = state.Finalize(); - values = {std::make_shared(mode_count.first), - std::make_shared(mode_count.second)}; + values = {std::make_shared(mode), std::make_shared(count)}; } out->value = std::make_shared(std::move(values), this->out_type); } @@ -165,15 +270,29 @@ void AddModeKernels(KernelInit init, const std::vector } } -} // namespace +const FunctionDoc mode_doc{ + "Calculate the modal (most common) value of a numeric array", + ("This function returns both mode and count as a struct scalar,\n" + "with type `struct`, where T is the input type.\n" + "If there is more than one such value, the smallest one is returned.\n" + "Nulls are ignored. 
If there are no non-null values in the array,\n" + "null is returned."), + {"array"}}; std::shared_ptr AddModeAggKernels() { - auto func = std::make_shared("mode", Arity::Unary()); + auto func = + std::make_shared("mode", Arity::Unary(), &mode_doc); AddModeKernels(ModeInit, {boolean()}, func.get()); - AddModeKernels(ModeInit, internal::NumericTypes(), func.get()); + AddModeKernels(ModeInit, NumericTypes(), func.get()); return func; } -} // namespace aggregate +} // namespace + +void RegisterScalarAggregateMode(FunctionRegistry* registry) { + DCHECK_OK(registry->AddFunction(AddModeAggKernels())); +} + +} // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index e51579b7710..bcaa842fa7f 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -512,7 +513,7 @@ TEST_F(TestBooleanMinMaxKernel, Basics) { this->AssertMinMaxIs(chunked_input2, false, false, options); this->AssertMinMaxIs(chunked_input3, false, true, options); - options = MinMaxOptions(MinMaxOptions::OUTPUT_NULL); + options = MinMaxOptions(MinMaxOptions::EMIT_NULL); this->AssertMinMaxIsNull("[]", options); this->AssertMinMaxIsNull("[null, null, null]", options); this->AssertMinMaxIsNull("[false, null, false]", options); @@ -542,7 +543,7 @@ TYPED_TEST(TestIntegerMinMaxKernel, Basics) { this->AssertMinMaxIs(chunked_input2, 1, 9, options); this->AssertMinMaxIs(chunked_input3, 1, 9, options); - options = MinMaxOptions(MinMaxOptions::OUTPUT_NULL); + options = MinMaxOptions(MinMaxOptions::EMIT_NULL); this->AssertMinMaxIs("[5, 1, 2, 3, 4]", 1, 5, options); // output null this->AssertMinMaxIsNull("[5, null, 2, 3, 4]", options); @@ -569,7 +570,7 @@ TYPED_TEST(TestFloatingMinMaxKernel, Floats) { this->AssertMinMaxIs(chunked_input2, 1, 9, options); this->AssertMinMaxIs(chunked_input3, 1, 9, options); - options = MinMaxOptions(MinMaxOptions::OUTPUT_NULL); + options = MinMaxOptions(MinMaxOptions::EMIT_NULL); this->AssertMinMaxIs("[5, 1, 2, 3, 4]", 1, 5, options); this->AssertMinMaxIs("[5, -Inf, 2, 3, 4]", -INFINITY, 5, options); // output null @@ -594,18 +595,150 @@ TYPED_TEST(TestFloatingMinMaxKernel, DefaultOptions) { AssertDatumsEqual(explicit_defaults, no_options_provided); } +template +struct MinMaxResult { + using T = typename ArrowType::c_type; + + T min = 0; + T max = 0; + bool is_valid = false; +}; + +template +static enable_if_integer> NaiveMinMax( + const Array& array) { + using T = typename ArrowType::c_type; + using ArrayType = typename TypeTraits::ArrayType; + + MinMaxResult result; + + const auto& array_numeric = reinterpret_cast(array); + const auto values = array_numeric.raw_values(); + + if (array.length() <= array.null_count()) { // All null values + return result; + } + + T min = std::numeric_limits::max(); + T max = std::numeric_limits::min(); + if (array.null_count() != 0) { // Some values are null + internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), + array.length()); + for (int64_t i = 0; i < array.length(); i++) { + if (reader.IsSet()) { + min = std::min(min, values[i]); + max = std::max(max, values[i]); + } + reader.Next(); + } + } else { // All true values + for (int64_t i = 0; i < array.length(); i++) { + min = std::min(min, values[i]); + max = std::max(max, values[i]); + } + } + + result.min = min; + result.max = max; + result.is_valid = 
true; + return result; +} + +template +static enable_if_floating_point> NaiveMinMax( + const Array& array) { + using T = typename ArrowType::c_type; + using ArrayType = typename TypeTraits::ArrayType; + + MinMaxResult result; + + const auto& array_numeric = reinterpret_cast(array); + const auto values = array_numeric.raw_values(); + + if (array.length() <= array.null_count()) { // All null values + return result; + } + + T min = std::numeric_limits::infinity(); + T max = -std::numeric_limits::infinity(); + if (array.null_count() != 0) { // Some values are null + internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), + array.length()); + for (int64_t i = 0; i < array.length(); i++) { + if (reader.IsSet()) { + min = std::fmin(min, values[i]); + max = std::fmax(max, values[i]); + } + reader.Next(); + } + } else { // All true values + for (int64_t i = 0; i < array.length(); i++) { + min = std::fmin(min, values[i]); + max = std::fmax(max, values[i]); + } + } + + result.min = min; + result.max = max; + result.is_valid = true; + return result; +} + +template +void ValidateMinMax(const Array& array) { + using Traits = TypeTraits; + using ScalarType = typename Traits::ScalarType; + + ASSERT_OK_AND_ASSIGN(Datum out, MinMax(array)); + const StructScalar& value = out.scalar_as(); + + auto expected = NaiveMinMax(array); + const auto& out_min = checked_cast(*value.value[0]); + const auto& out_max = checked_cast(*value.value[1]); + + if (expected.is_valid) { + ASSERT_TRUE(out_min.is_valid); + ASSERT_TRUE(out_max.is_valid); + ASSERT_EQ(expected.min, out_min.value); + ASSERT_EQ(expected.max, out_max.value); + } else { // All null values + ASSERT_FALSE(out_min.is_valid); + ASSERT_FALSE(out_max.is_valid); + } +} + +template +class TestRandomNumericMinMaxKernel : public ::testing::Test {}; + +TYPED_TEST_SUITE(TestRandomNumericMinMaxKernel, NumericArrowTypes); +TYPED_TEST(TestRandomNumericMinMaxKernel, RandomArrayMinMax) { + auto rand = random::RandomArrayGenerator(0x8afc055); + // Test size up to 1<<11 (2048). 
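+ // The +/-2 length adjustments below presumably exercise both the vectorized + // kernel path and the scalar remainder handling: lengths just below, at, and + // just above a power of two cover both.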
+ for (size_t i = 3; i < 12; i += 2) { + for (auto null_probability : {0.0, 0.01, 0.1, 0.5, 0.99, 1.0}) { + int64_t base_length = (1UL << i) + 2; + auto array = rand.Numeric(base_length, 0, 100, null_probability); + for (auto length_adjust : {-2, -1, 0, 1, 2}) { + int64_t length = (1UL << i) + length_adjust; + ValidateMinMax(*array->Slice(0, length)); + } + } + } +} + // // Mode // -template +template class TestPrimitiveModeKernel : public ::testing::Test { + public: + using ArrowType = T; using Traits = TypeTraits; using c_type = typename ArrowType::c_type; using ModeType = typename Traits::ScalarType; using CountType = typename TypeTraits::ScalarType; - public: void AssertModeIs(const Datum& array, c_type expected_mode, int64_t expected_count) { ASSERT_OK_AND_ASSIGN(Datum out, Mode(array)); const StructScalar& value = out.scalar_as(); @@ -623,6 +756,12 @@ class TestPrimitiveModeKernel : public ::testing::Test { AssertModeIs(array, expected_mode, expected_count); } + void AssertModeIs(const std::vector& json, c_type expected_mode, + int64_t expected_count) { + auto chunked = ChunkedArrayFromJSON(type_singleton(), json); + AssertModeIs(chunked, expected_mode, expected_count); + } + void AssertModeIsNull(const Datum& array) { ASSERT_OK_AND_ASSIGN(Datum out, Mode(array)); const StructScalar& value = out.scalar_as(); @@ -637,6 +776,32 @@ class TestPrimitiveModeKernel : public ::testing::Test { AssertModeIsNull(array); } + void AssertModeIsNull(const std::vector& json) { + auto chunked = ChunkedArrayFromJSON(type_singleton(), json); + AssertModeIsNull(chunked); + } + + void AssertModeIsNaN(const Datum& array, int64_t expected_count) { + ASSERT_OK_AND_ASSIGN(Datum out, Mode(array)); + const StructScalar& value = out.scalar_as(); + + const auto& out_mode = checked_cast(*value.value[0]); + ASSERT_NE(out_mode.value, out_mode.value); // NaN != NaN + + const auto& out_count = checked_cast(*value.value[1]); + ASSERT_EQ(expected_count, out_count.value); + } + + void AssertModeIsNaN(const std::string& json, int64_t expected_count) { + auto array = ArrayFromJSON(type_singleton(), json); + AssertModeIsNaN(array, expected_count); + } + + void AssertModeIsNaN(const std::vector& json, int64_t expected_count) { + auto chunked = ChunkedArrayFromJSON(type_singleton(), json); + AssertModeIsNaN(chunked, expected_count); + } + std::shared_ptr type_singleton() { return Traits::type_singleton(); } }; @@ -648,6 +813,10 @@ class TestFloatingModeKernel : public TestPrimitiveModeKernel {}; class TestBooleanModeKernel : public TestPrimitiveModeKernel {}; +class TestInt8ModeKernelValueRange : public TestPrimitiveModeKernel {}; + +class TestInt32ModeKernel : public TestPrimitiveModeKernel {}; + TEST_F(TestBooleanModeKernel, Basics) { this->AssertModeIs("[false, false]", false, 2); this->AssertModeIs("[false, false, true, true, true]", true, 3); @@ -657,6 +826,10 @@ TEST_F(TestBooleanModeKernel, Basics) { this->AssertModeIs("[true, null, false, false, null, true, null, null, true]", true, 3); this->AssertModeIsNull("[null, null, null]"); this->AssertModeIsNull("[]"); + + this->AssertModeIs({"[true, false]", "[true, true]", "[false, false]"}, false, 3); + this->AssertModeIs({"[true, null]", "[]", "[null, false]"}, false, 1); + this->AssertModeIsNull({"[null, null]", "[]", "[null]"}); } TYPED_TEST_SUITE(TestIntegerModeKernel, IntegralArrowTypes); @@ -668,6 +841,10 @@ TYPED_TEST(TestIntegerModeKernel, Basics) { this->AssertModeIs("[null, null, 2, null, 1]", 1, 1); this->AssertModeIsNull("[null, null, null]"); 
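+ // An empty array likewise has no mode: the expected result is a null struct scalar, as for all-null input.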
this->AssertModeIsNull("[]"); + + this->AssertModeIs({"[5]", "[1, 1, 5]", "[5]"}, 5, 3); + this->AssertModeIs({"[5]", "[1, 1, 5]", "[5, 1]"}, 1, 3); + this->AssertModeIsNull({"[null, null]", "[]", "[null]"}); } TYPED_TEST_SUITE(TestFloatingModeKernel, RealArrowTypes); @@ -678,12 +855,356 @@ TYPED_TEST(TestFloatingModeKernel, Floats) { this->AssertModeIs("[Inf, -Inf, Inf, -Inf]", -INFINITY, 2); this->AssertModeIs("[null, null, 2, null, 1]", 1, 1); - this->AssertModeIs("[NaN, NaN, 1]", 1, 1); + this->AssertModeIs("[NaN, NaN, 1, null, 1]", 1, 2); + this->AssertModeIsNull("[null, null, null]"); - this->AssertModeIsNull("[NaN, NaN, null]"); - this->AssertModeIsNull("[NaN, NaN, NaN]"); this->AssertModeIsNull("[]"); + + this->AssertModeIsNaN("[NaN, NaN, 1]", 2); + this->AssertModeIsNaN("[NaN, NaN, null]", 2); + this->AssertModeIsNaN("[NaN, NaN, NaN]", 3); + + this->AssertModeIs({"[Inf, 100]", "[Inf, 100]", "[Inf]"}, INFINITY, 3); + this->AssertModeIsNull({"[null, null]", "[]", "[null]"}); + this->AssertModeIsNaN({"[NaN, 1]", "[NaN, 1]", "[NaN]"}, 3); +} + +TEST_F(TestInt8ModeKernelValueRange, Basics) { + this->AssertModeIs("[0, 127, -128, -128]", -128, 2); + this->AssertModeIs("[127, 127, 127]", 127, 3); +} + +template +struct ModeResult { + using T = typename ArrowType::c_type; + + T mode = std::numeric_limits::min(); + int64_t count = 0; +}; + +template +ModeResult NaiveMode(const Array& array) { + using ArrayType = typename TypeTraits::ArrayType; + using CTYPE = typename ArrowType::c_type; + + std::unordered_map value_counts; + + const auto& array_numeric = reinterpret_cast(array); + const auto values = array_numeric.raw_values(); + internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); + for (int64_t i = 0; i < array.length(); ++i) { + if (reader.IsSet()) { + ++value_counts[values[i]]; + } + reader.Next(); + } + + ModeResult result; + for (const auto& value_count : value_counts) { + auto value = value_count.first; + auto count = value_count.second; + if (count > result.count || (count == result.count && value < result.mode)) { + result.count = count; + result.mode = value; + } + } + + return result; +} + +template +void CheckModeWithRange(CTYPE range_min, CTYPE range_max) { + using ModeScalar = typename TypeTraits::ScalarType; + using CountScalar = typename TypeTraits::ScalarType; + + auto rand = random::RandomArrayGenerator(0x5487655); + // 32K items (>= counting mode cutoff) within range, 10% null + auto array = rand.Numeric(32 * 1024, range_min, range_max, 0.1); + + auto expected = NaiveMode(*array); + ASSERT_OK_AND_ASSIGN(Datum out, Mode(array)); + const StructScalar& value = out.scalar_as(); + + ASSERT_TRUE(value.is_valid); + const auto& out_mode = checked_cast(*value.value[0]); + const auto& out_count = checked_cast(*value.value[1]); + ASSERT_EQ(out_mode.value, expected.mode); + ASSERT_EQ(out_count.value, expected.count); +} + +TEST_F(TestInt32ModeKernel, SmallValueRange) { + // Small value range => should exercise counter-based Mode implementation + CheckModeWithRange(-100, 100); +} + +TEST_F(TestInt32ModeKernel, LargeValueRange) { + // Large value range => should exercise hashmap-based Mode implementation + CheckModeWithRange(-10000000, 10000000); +} + +// +// Variance/Stddev +// + +template +class TestPrimitiveVarStdKernel : public ::testing::Test { + public: + using Traits = TypeTraits; + using ScalarType = typename TypeTraits::ScalarType; + + void AssertVarStdIs(const Array& array, const VarianceOptions& options, + double expected_var) { + 
AssertVarStdIsInternal(array, options, expected_var); + } + + void AssertVarStdIs(const std::shared_ptr& array, + const VarianceOptions& options, double expected_var) { + AssertVarStdIsInternal(array, options, expected_var); + } + + void AssertVarStdIs(const std::string& json, const VarianceOptions& options, + double expected_var) { + auto array = ArrayFromJSON(type_singleton(), json); + AssertVarStdIs(*array, options, expected_var); + } + + void AssertVarStdIs(const std::vector& json, + const VarianceOptions& options, double expected_var) { + auto chunked = ChunkedArrayFromJSON(type_singleton(), json); + AssertVarStdIs(chunked, options, expected_var); + } + + void AssertVarStdIsInvalid(const Array& array, const VarianceOptions& options) { + AssertVarStdIsInvalidInternal(array, options); + } + + void AssertVarStdIsInvalid(const std::shared_ptr& array, + const VarianceOptions& options) { + AssertVarStdIsInvalidInternal(array, options); + } + + void AssertVarStdIsInvalid(const std::string& json, const VarianceOptions& options) { + auto array = ArrayFromJSON(type_singleton(), json); + AssertVarStdIsInvalid(*array, options); + } + + void AssertVarStdIsInvalid(const std::vector& json, + const VarianceOptions& options) { + auto array = ChunkedArrayFromJSON(type_singleton(), json); + AssertVarStdIsInvalid(array, options); + } + + std::shared_ptr type_singleton() { return Traits::type_singleton(); } + + private: + void AssertVarStdIsInternal(const Datum& array, const VarianceOptions& options, + double expected_var) { + ASSERT_OK_AND_ASSIGN(Datum out_var, Variance(array, options)); + ASSERT_OK_AND_ASSIGN(Datum out_std, Stddev(array, options)); + auto var = checked_cast(out_var.scalar().get()); + auto std = checked_cast(out_std.scalar().get()); + ASSERT_TRUE(var->is_valid && std->is_valid); + ASSERT_DOUBLE_EQ(std->value * std->value, var->value); + ASSERT_DOUBLE_EQ(var->value, expected_var); // < 4ULP + } + + void AssertVarStdIsInvalidInternal(const Datum& array, const VarianceOptions& options) { + ASSERT_OK_AND_ASSIGN(Datum out_var, Variance(array, options)); + ASSERT_OK_AND_ASSIGN(Datum out_std, Stddev(array, options)); + auto var = checked_cast(out_var.scalar().get()); + auto std = checked_cast(out_std.scalar().get()); + ASSERT_FALSE(var->is_valid || std->is_valid); + } +}; + +template +class TestNumericVarStdKernel : public TestPrimitiveVarStdKernel {}; + +// Reference values from numpy.var +TYPED_TEST_SUITE(TestNumericVarStdKernel, NumericArrowTypes); +TYPED_TEST(TestNumericVarStdKernel, Basics) { + VarianceOptions options; // ddof = 0, population variance/stddev + + this->AssertVarStdIs("[100]", options, 0); + this->AssertVarStdIs("[1, 2, 3]", options, 0.6666666666666666); + this->AssertVarStdIs("[null, 1, 2, null, 3]", options, 0.6666666666666666); + + std::vector chunks; + chunks = {"[]", "[1]", "[2]", "[null]", "[3]"}; + this->AssertVarStdIs(chunks, options, 0.6666666666666666); + chunks = {"[1, 2, 3]", "[4, 5, 6]", "[7, 8]"}; + this->AssertVarStdIs(chunks, options, 5.25); + chunks = {"[1, 2, 3, 4, 5, 6, 7]", "[8]"}; + this->AssertVarStdIs(chunks, options, 5.25); + + this->AssertVarStdIsInvalid("[null, null, null]", options); + this->AssertVarStdIsInvalid("[]", options); + + options.ddof = 1; // sample variance/stddev + + this->AssertVarStdIs("[1, 2]", options, 0.5); + + chunks = {"[1]", "[2]"}; + this->AssertVarStdIs(chunks, options, 0.5); + chunks = {"[1, 2, 3]", "[4, 5, 6]", "[7, 8]"}; + this->AssertVarStdIs(chunks, options, 6.0); + chunks = 
{"[1, 2, 3, 4, 5, 6, 7]", "[8]"}; + this->AssertVarStdIs(chunks, options, 6.0); + + this->AssertVarStdIsInvalid("[100]", options); + this->AssertVarStdIsInvalid("[100, null, null]", options); + chunks = {"[100]", "[null]", "[]"}; + this->AssertVarStdIsInvalid(chunks, options); +} + +// Test numerical stability +template +class TestVarStdKernelStability : public TestPrimitiveVarStdKernel {}; + +typedef ::testing::Types + VarStdStabilityTypes; + +TYPED_TEST_SUITE(TestVarStdKernelStability, VarStdStabilityTypes); +TYPED_TEST(TestVarStdKernelStability, Basics) { + VarianceOptions options{1}; // ddof = 1 + this->AssertVarStdIs("[100000004, 100000007, 100000013, 100000016]", options, 30.0); + this->AssertVarStdIs("[1000000004, 1000000007, 1000000013, 1000000016]", options, 30.0); + if (!is_unsigned_integer_type::value) { + this->AssertVarStdIs("[-1000000016, -1000000013, -1000000007, -1000000004]", options, + 30.0); + } +} + +// Test numerical stability of variance merging code +class TestVarStdKernelMergeStability : public TestPrimitiveVarStdKernel {}; + +TEST_F(TestVarStdKernelMergeStability, Basics) { + VarianceOptions options{1}; // ddof = 1 + +#ifndef __MINGW32__ // MinGW has precision issues + // XXX: The reference value from numpy is actually wrong due to floating + // point limits. The correct result should equals variance(90, 0) = 4050. + std::vector chunks = {"[40000008000000490]", "[40000008000000400]"}; + this->AssertVarStdIs(chunks, options, 3904.0); +#endif +} + +// Test integer arithmetic code +class TestVarStdKernelInt32 : public TestPrimitiveVarStdKernel {}; + +TEST_F(TestVarStdKernelInt32, Basics) { + VarianceOptions options{1}; + this->AssertVarStdIs("[-2147483648, -2147483647, -2147483646]", options, 1.0); + this->AssertVarStdIs("[2147483645, 2147483646, 2147483647]", options, 1.0); + this->AssertVarStdIs("[-2147483648, -2147483648, 2147483647]", options, + 6.148914688373205e+18); +} + +class TestVarStdKernelUInt32 : public TestPrimitiveVarStdKernel {}; + +TEST_F(TestVarStdKernelUInt32, Basics) { + VarianceOptions options{1}; + this->AssertVarStdIs("[4294967293, 4294967294, 4294967295]", options, 1.0); + this->AssertVarStdIs("[0, 0, 4294967295]", options, 6.148914688373205e+18); +} + +// https://en.wikipedia.org/wiki/Kahan_summation_algorithm +void KahanSum(double& sum, double& adjust, double addend) { + double y = addend - adjust; + double t = sum + y; + adjust = (t - sum) - y; + sum = t; +} + +// Calculate reference variance with Welford's online algorithm + Kahan summation +// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm +// XXX: not stable for long array with very small `stddev / average` +template +std::pair WelfordVar(const ArrayType& array) { + const auto values = array.raw_values(); + internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length()); + double count = 0, mean = 0, m2 = 0; + double mean_adjust = 0, m2_adjust = 0; + for (int64_t i = 0; i < array.length(); ++i) { + if (reader.IsSet()) { + ++count; + double delta = static_cast(values[i]) - mean; + KahanSum(mean, mean_adjust, delta / count); + double delta2 = static_cast(values[i]) - mean; + KahanSum(m2, m2_adjust, delta * delta2); + } + reader.Next(); + } + return std::make_pair(m2 / count, m2 / (count - 1)); +} + +// Test random chunked array +template +class TestVarStdKernelRandom : public TestPrimitiveVarStdKernel {}; + +typedef ::testing::Types + VarStdRandomTypes; + +TYPED_TEST_SUITE(TestVarStdKernelRandom, VarStdRandomTypes); 
+TYPED_TEST(TestVarStdKernelRandom, Basics) { + // Cut array into small chunks + constexpr int array_size = 5000; + constexpr int chunk_size_max = 50; + constexpr int chunk_count = array_size / chunk_size_max; + + std::shared_ptr array; + auto rand = random::RandomArrayGenerator(0x5487656); + if (is_floating_type::value) { + array = rand.Numeric(array_size, -10000.0, 100000.0, 0.1); + } else { + using CType = typename TypeParam::c_type; + constexpr CType min = std::numeric_limits::min(); + constexpr CType max = std::numeric_limits::max(); + array = rand.Numeric(array_size, min, max, 0.1); + } + auto chunk_size_array = rand.Numeric(chunk_count, 0, chunk_size_max); + const int* chunk_size = chunk_size_array->data()->GetValues(1); + int total_size = 0; + + ArrayVector array_vector; + for (int i = 0; i < chunk_count; ++i) { + array_vector.emplace_back(array->Slice(total_size, chunk_size[i])); + total_size += chunk_size[i]; + } + auto chunked = *ChunkedArray::Make(array_vector); + + double var_population, var_sample; + using ArrayType = typename TypeTraits::ArrayType; + auto typed_array = std::static_pointer_cast(array->Slice(0, total_size)); + std::tie(var_population, var_sample) = WelfordVar(*typed_array); + + this->AssertVarStdIs(chunked, VarianceOptions{0}, var_population); + this->AssertVarStdIs(chunked, VarianceOptions{1}, var_sample); +} + +// This test is too heavy to run in CI, should be checked manually +#if 0 +class TestVarStdKernelIntegerLength : public TestPrimitiveVarStdKernel {}; + +TEST_F(TestVarStdKernelIntegerLength, Basics) { + constexpr int32_t min = std::numeric_limits::min(); + constexpr int32_t max = std::numeric_limits::max(); + auto rand = random::RandomArrayGenerator(0x5487657); + // large data volume + auto array = rand.Numeric(4000000000, min, max, 0.1); + // biased distribution + // auto array = rand.Numeric(4000000000, min, min + 100000, 0.1); + + double var_population, var_sample; + auto int32_array = std::static_pointer_cast(array); + std::tie(var_population, var_sample) = WelfordVar(*int32_array); + + this->AssertVarStdIs(*array, VarianceOptions{0}, var_population); + this->AssertVarStdIs(*array, VarianceOptions{1}, var_sample); } +#endif } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate_var_std.cc b/cpp/src/arrow/compute/kernels/aggregate_var_std.cc new file mode 100644 index 00000000000..4dac0a37734 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/aggregate_var_std.cc @@ -0,0 +1,284 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
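+// This file implements the "variance" and "stddev" aggregate kernels. +// float/double and 64-bit integer inputs use a two-pass algorithm (compute +// the mean, then sum the squared deviations); int8/16/32 inputs use a +// textbook one-pass algorithm with exact integer sums, processed in slices +// small enough that the sums cannot overflow. Per-chunk states are merged +// with the combined-variance identity: +//   m2 = m2_a + m2_b + n_a * (mean_a - mean)^2 + n_b * (mean_b - mean)^2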
+ +#include + +#include "arrow/compute/api_aggregate.h" +#include "arrow/compute/kernels/aggregate_internal.h" +#include "arrow/compute/kernels/common.h" +#include "arrow/util/int128_internal.h" + +namespace arrow { +namespace compute { +namespace internal { + +namespace { + +using arrow::internal::int128_t; + +template +struct VarStdState { + using ArrayType = typename TypeTraits::ArrayType; + using CType = typename ArrowType::c_type; + using ThisType = VarStdState; + + // float/double/int64: calculate `m2` (sum((X-mean)^2)) with `two pass algorithm` + // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm + template + enable_if_t::value || (sizeof(CType) > 4)> Consume( + const ArrayType& array) { + int64_t count = array.length() - array.null_count(); + if (count == 0) { + return; + } + + using SumType = + typename std::conditional::value, double, int128_t>::type; + SumType sum = 0; + VisitArrayDataInline( + *array.data(), [&sum](CType value) { sum += static_cast(value); }, + []() {}); + + double mean = static_cast(sum) / count, m2 = 0; + VisitArrayDataInline( + *array.data(), + [mean, &m2](CType value) { + double v = static_cast(value); + m2 += (v - mean) * (v - mean); + }, + []() {}); + + this->count = count; + this->mean = mean; + this->m2 = m2; + } + + // int32/16/8: textbook one pass algorithm with integer arithmetic + template + enable_if_t::value && (sizeof(CType) <= 4)> Consume( + const ArrayType& array) { + // max number of elements that sum will not overflow int64 (2Gi int32 elements) + // for uint32: 0 <= sum < 2^63 (int64 >= 0) + // for int32: -2^62 <= sum < 2^62 + constexpr int64_t max_length = 1ULL << (63 - sizeof(CType) * 8); + + int64_t start_index = 0; + int64_t valid_count = array.length() - array.null_count(); + + while (valid_count > 0) { + // process in chunks that overflow will never happen + const auto slice = array.Slice(start_index, max_length); + const int64_t count = slice->length() - slice->null_count(); + start_index += max_length; + valid_count -= count; + + if (count > 0) { + int64_t sum = 0; + int128_t square_sum = 0; + VisitArrayDataInline( + *slice->data(), + [&sum, &square_sum](CType value) { + sum += value; + square_sum += static_cast(value) * value; + }, + []() {}); + + const double mean = static_cast(sum) / count; + // calculate m2 = square_sum - sum * sum / count + // decompose `sum * sum / count` into integers and fractions + const int128_t sum_square = static_cast(sum) * sum; + const int128_t integers = sum_square / count; + const double fractions = static_cast(sum_square % count) / count; + const double m2 = static_cast(square_sum - integers) - fractions; + + // merge variance + ThisType state; + state.count = count; + state.mean = mean; + state.m2 = m2; + this->MergeFrom(state); + } + } + } + + // Combine `m2` from two chunks (m2 = n*s2) + // https://www.emathzone.com/tutorials/basic-statistics/combined-variance.html + void MergeFrom(const ThisType& state) { + if (state.count == 0) { + return; + } + if (this->count == 0) { + this->count = state.count; + this->mean = state.mean; + this->m2 = state.m2; + return; + } + double mean = (this->mean * this->count + state.mean * state.count) / + (this->count + state.count); + this->m2 += state.m2 + this->count * (this->mean - mean) * (this->mean - mean) + + state.count * (state.mean - mean) * (state.mean - mean); + this->count += state.count; + this->mean = mean; + } + + int64_t count = 0; + double mean = 0; + double m2 = 0; // m2 = count*s2 = sum((X-mean)^2) +}; + +enum 
class VarOrStd : bool { Var, Std }; + +template +struct VarStdImpl : public ScalarAggregator { + using ThisType = VarStdImpl; + using ArrayType = typename TypeTraits::ArrayType; + + explicit VarStdImpl(const std::shared_ptr& out_type, + const VarianceOptions& options, VarOrStd return_type) + : out_type(out_type), options(options), return_type(return_type) {} + + void Consume(KernelContext*, const ExecBatch& batch) override { + ArrayType array(batch[0].array()); + this->state.Consume(array); + } + + void MergeFrom(KernelContext*, KernelState&& src) override { + const auto& other = checked_cast(src); + this->state.MergeFrom(other.state); + } + + void Finalize(KernelContext*, Datum* out) override { + if (this->state.count <= options.ddof) { + out->value = std::make_shared(); + } else { + double var = this->state.m2 / (this->state.count - options.ddof); + out->value = + std::make_shared(return_type == VarOrStd::Var ? var : sqrt(var)); + } + } + + std::shared_ptr out_type; + VarStdState state; + VarianceOptions options; + VarOrStd return_type; +}; + +struct VarStdInitState { + std::unique_ptr state; + KernelContext* ctx; + const DataType& in_type; + const std::shared_ptr& out_type; + const VarianceOptions& options; + VarOrStd return_type; + + VarStdInitState(KernelContext* ctx, const DataType& in_type, + const std::shared_ptr& out_type, + const VarianceOptions& options, VarOrStd return_type) + : ctx(ctx), + in_type(in_type), + out_type(out_type), + options(options), + return_type(return_type) {} + + Status Visit(const DataType&) { + return Status::NotImplemented("No variance/stddev implemented"); + } + + Status Visit(const HalfFloatType&) { + return Status::NotImplemented("No variance/stddev implemented"); + } + + template + enable_if_t::value, Status> Visit(const Type&) { + state.reset(new VarStdImpl(out_type, options, return_type)); + return Status::OK(); + } + + std::unique_ptr Create() { + ctx->SetStatus(VisitTypeInline(in_type, this)); + return std::move(state); + } +}; + +std::unique_ptr StddevInit(KernelContext* ctx, const KernelInitArgs& args) { + VarStdInitState visitor( + ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), + static_cast(*args.options), VarOrStd::Std); + return visitor.Create(); +} + +std::unique_ptr VarianceInit(KernelContext* ctx, + const KernelInitArgs& args) { + VarStdInitState visitor( + ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), + static_cast(*args.options), VarOrStd::Var); + return visitor.Create(); +} + +void AddVarStdKernels(KernelInit init, + const std::vector>& types, + ScalarAggregateFunction* func) { + for (const auto& ty : types) { + auto sig = KernelSignature::Make({InputType::Array(ty)}, float64()); + AddAggKernel(std::move(sig), init, func); + } +} + +const FunctionDoc stddev_doc{ + "Calculate the standard deviation of a numeric array", + ("The number of degrees of freedom can be controlled using VarianceOptions.\n" + "By default (`ddof` = 0), the population standard deviation is calculated.\n" + "Nulls are ignored. If there are not enough non-null values in the array\n" + "to satisfy `ddof`, null is returned."), + {"array"}, + "VarianceOptions"}; + +const FunctionDoc variance_doc{ + "Calculate the variance of a numeric array", + ("The number of degrees of freedom can be controlled using VarianceOptions.\n" + "By default (`ddof` = 0), the population variance is calculated.\n" + "Nulls are ignored. 
If there are not enough non-null values in the array\n" + "to satisfy `ddof`, null is returned."), + {"array"}, + "VarianceOptions"}; + +std::shared_ptr AddStddevAggKernels() { + static auto default_std_options = VarianceOptions::Defaults(); + auto func = std::make_shared( + "stddev", Arity::Unary(), &stddev_doc, &default_std_options); + AddVarStdKernels(StddevInit, NumericTypes(), func.get()); + return func; +} + +std::shared_ptr AddVarianceAggKernels() { + static auto default_var_options = VarianceOptions::Defaults(); + auto func = std::make_shared( + "variance", Arity::Unary(), &variance_doc, &default_var_options); + AddVarStdKernels(VarianceInit, NumericTypes(), func.get()); + return func; +} + +} // namespace + +void RegisterScalarAggregateVariance(FunctionRegistry* registry) { + DCHECK_OK(registry->AddFunction(AddVarianceAggKernels())); + DCHECK_OK(registry->AddFunction(AddStddevAggKernels())); +} + +} // namespace internal +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index a4e11fe3894..1f940249857 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -728,7 +728,6 @@ struct ScalarBinary { } } else { if (batch[1].kind() == Datum::ARRAY) { - // e.g. if we were doing scalar < array, we flip and do array >= scalar return ScalarArray(ctx, *batch[0].scalar(), *batch[1].array(), out); } else { return ScalarScalar(ctx, *batch[0].scalar(), *batch[1].scalar(), out); @@ -842,6 +841,8 @@ struct ScalarBinaryNotNull { template using ScalarBinaryEqualTypes = ScalarBinary; +// A kernel exec generator for non-null binary kernels where both input types are the +// same template using ScalarBinaryNotNullEqualTypes = ScalarBinaryNotNull; diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc index 299f652dc3d..fc18da7cf13 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc @@ -22,6 +22,7 @@ namespace arrow { using internal::AddWithOverflow; +using internal::DivideWithOverflow; using internal::MultiplyWithOverflow; using internal::SubtractWithOverflow; @@ -80,7 +81,7 @@ struct AddChecked { template enable_if_integer Call(KernelContext* ctx, Arg0 left, Arg1 right) { static_assert(std::is_same::value && std::is_same::value, ""); - T result; + T result = 0; if (ARROW_PREDICT_FALSE(AddWithOverflow(left, right, &result))) { ctx->SetStatus(Status::Invalid("overflow")); } @@ -115,7 +116,7 @@ struct SubtractChecked { template enable_if_integer Call(KernelContext* ctx, Arg0 left, Arg1 right) { static_assert(std::is_same::value && std::is_same::value, ""); - T result; + T result = 0; if (ARROW_PREDICT_FALSE(SubtractWithOverflow(left, right, &result))) { ctx->SetStatus(Status::Invalid("overflow")); } @@ -172,7 +173,7 @@ struct MultiplyChecked { template enable_if_integer Call(KernelContext* ctx, Arg0 left, Arg1 right) { static_assert(std::is_same::value && std::is_same::value, ""); - T result; + T result = 0; if (ARROW_PREDICT_FALSE(MultiplyWithOverflow(left, right, &result))) { ctx->SetStatus(Status::Invalid("overflow")); } @@ -186,6 +187,52 @@ struct MultiplyChecked { } }; +struct Divide { + template + static enable_if_floating_point Call(KernelContext* ctx, Arg0 left, Arg1 right) { + return left / right; + } + + template + static enable_if_integer Call(KernelContext* ctx, Arg0 left, Arg1 right) { + T 
result; + if (ARROW_PREDICT_FALSE(DivideWithOverflow(left, right, &result))) { + if (right == 0) { + ctx->SetStatus(Status::Invalid("divide by zero")); + } else { + result = 0; + } + } + return result; + } +}; + +struct DivideChecked { + template + static enable_if_integer Call(KernelContext* ctx, Arg0 left, Arg1 right) { + static_assert(std::is_same::value && std::is_same::value, ""); + T result; + if (ARROW_PREDICT_FALSE(DivideWithOverflow(left, right, &result))) { + if (right == 0) { + ctx->SetStatus(Status::Invalid("divide by zero")); + } else { + ctx->SetStatus(Status::Invalid("overflow")); + } + } + return result; + } + + template + static enable_if_floating_point Call(KernelContext* ctx, Arg0 left, Arg1 right) { + static_assert(std::is_same::value && std::is_same::value, ""); + if (ARROW_PREDICT_FALSE(right == 0)) { + ctx->SetStatus(Status::Invalid("divide by zero")); + return 0; + } + return left / right; + } +}; + // Generate a kernel given an arithmetic functor template